{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:ML26MS6YSFNLUE5P7T6VQWADN6","short_pith_number":"pith:ML26MS6Y","schema_version":"1.0","canonical_sha256":"62f5e64bd8915aba13affcfd5858036fb2781bae2c165f578df38b73151f209c","source":{"kind":"arxiv","id":"2512.21132","version":2},"attestation_state":"computed","paper":{"title":"AutoBaxBuilder: Bootstrapping Code Security Benchmarking","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.PL"],"primary_cat":"cs.CR","authors_text":"Mark Vero, Martin Vechev, Maximilian Baader, Niels M\\\"undler, Tobias von Arx","submitted_at":"2025-12-24T12:02:00Z","abstract_excerpt":"As large language models (LLMs) see wide adoption in software engineering, the reliable assessment of the correctness and security of LLM-generated code is crucial. Notably, prior work showed that LLMs are prone to generating code with security vulnerabilities, highlighting that security is often overlooked. These insights were enabled by specialized benchmarks crafted by security experts through significant manual effort. However, benchmarks (i) inevitably end up contaminating training data, (ii) must extend to new tasks to provide a more complete picture, and (iii) must increase in difficult"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2512.21132","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2025-12-24T12:02:00Z","cross_cats_sorted":["cs.AI","cs.LG","cs.PL"],"title_canon_sha256":"17942f11ad36b90ec074befd3be6b6fedf1194bb75e8258ff66155377ca7f5f1","abstract_canon_sha256":"0b6f6623a12a83f687617fc68bfe8c59898a021b406fbf0d6e26ecae69b8283a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T01:03:52.876679Z","signature_b64":"DcSok4pJFjy+w2DvktZbR9ZGEe8b7DOGz58GsNwk0Efssut75bAfAb+r+Bj2bp2oRAmwxlSIe0I9XXI5ZmzjAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"62f5e64bd8915aba13affcfd5858036fb2781bae2c165f578df38b73151f209c","last_reissued_at":"2026-05-22T01:03:52.875568Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T01:03:52.875568Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"AutoBaxBuilder: Bootstrapping Code Security Benchmarking","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.PL"],"primary_cat":"cs.CR","authors_text":"Mark Vero, Martin Vechev, Maximilian Baader, Niels M\\\"undler, Tobias von Arx","submitted_at":"2025-12-24T12:02:00Z","abstract_excerpt":"As large language models (LLMs) see wide adoption in software engineering, the reliable assessment of the correctness and security of LLM-generated code is crucial. Notably, prior work showed that LLMs are prone to generating code with security vulnerabilities, highlighting that security is often overlooked. These insights were enabled by specialized benchmarks crafted by security experts through significant manual effort. However, benchmarks (i) inevitably end up contaminating training data, (ii) must extend to new tasks to provide a more complete picture, and (iii) must increase in difficult"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.21132","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2512.21132/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2512.21132","created_at":"2026-05-22T01:03:52.875709+00:00"},{"alias_kind":"arxiv_version","alias_value":"2512.21132v2","created_at":"2026-05-22T01:03:52.875709+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.21132","created_at":"2026-05-22T01:03:52.875709+00:00"},{"alias_kind":"pith_short_12","alias_value":"ML26MS6YSFNL","created_at":"2026-05-22T01:03:52.875709+00:00"},{"alias_kind":"pith_short_16","alias_value":"ML26MS6YSFNLUE5P","created_at":"2026-05-22T01:03:52.875709+00:00"},{"alias_kind":"pith_short_8","alias_value":"ML26MS6Y","created_at":"2026-05-22T01:03:52.875709+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6","json":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6.json","graph_json":"https://pith.science/api/pith-number/ML26MS6YSFNLUE5P7T6VQWADN6/graph.json","events_json":"https://pith.science/api/pith-number/ML26MS6YSFNLUE5P7T6VQWADN6/events.json","paper":"https://pith.science/paper/ML26MS6Y"},"agent_actions":{"view_html":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6","download_json":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6.json","view_paper":"https://pith.science/paper/ML26MS6Y","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2512.21132&json=true","fetch_graph":"https://pith.science/api/pith-number/ML26MS6YSFNLUE5P7T6VQWADN6/graph.json","fetch_events":"https://pith.science/api/pith-number/ML26MS6YSFNLUE5P7T6VQWADN6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6/action/storage_attestation","attest_author":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6/action/author_attestation","sign_citation":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6/action/citation_signature","submit_replication":"https://pith.science/pith/ML26MS6YSFNLUE5P7T6VQWADN6/action/replication_record"}},"created_at":"2026-05-22T01:03:52.875709+00:00","updated_at":"2026-05-22T01:03:52.875709+00:00"}