{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:BD2ZKOM6RSFRGJK3LVCTVYEPSS","short_pith_number":"pith:BD2ZKOM6","canonical_record":{"source":{"id":"2605.14153","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2026-05-13T22:08:05Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"79885c40726a85236f8ec755dc326bf5900da719bac236f53264610589f12284","abstract_canon_sha256":"28969730aba1c3e108ffcd139570ec2650f40e1aa9ce8c71c4ec184ad10d29bb"},"schema_version":"1.0"},"canonical_sha256":"08f595399e8c8b13255b5d453ae08f949fdba2de92723a4bd070ae4f6e8da031","source":{"kind":"arxiv","id":"2605.14153","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14153","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14153v1","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14153","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"pith_short_12","alias_value":"BD2ZKOM6RSFR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BD2ZKOM6RSFRGJK3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BD2ZKOM6","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:BD2ZKOM6RSFRGJK3LVCTVYEPSS","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14153","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2026-05-13T22:08:05Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"79885c40726a85236f8ec755dc326bf5900da719bac236f53264610589f12284","abstract_canon_sha256":"28969730aba1c3e108ffcd139570ec2650f40e1aa9ce8c71c4ec184ad10d29bb"},"schema_version":"1.0"},"canonical_sha256":"08f595399e8c8b13255b5d453ae08f949fdba2de92723a4bd070ae4f6e8da031","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:11.563408Z","signature_b64":"bOAHvanQ4AJv6HAZP0NKGX0fRCRZXhz6vfL7w0aGvUWfYI1nKhkd5Ys90nXCG1UD3f39cJHaFtFMt3SYJfB3Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"08f595399e8c8b13255b5d453ae08f949fdba2de92723a4bd070ae4f6e8da031","last_reissued_at":"2026-05-17T23:39:11.562895Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:11.562895Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14153","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Nozarb9BZwx1KzrfwgCTSzKOShw23bVtasyUQOUfz9cdEoU/TU37ieWdjZf4msNAvlF59jQT3rIxyntCJuQEBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-29T17:46:53.507575Z"},"content_sha256":"5f2f41f929bc02c063eb7a34b079ea68dfc4d75cb20d0bca804d830491a1300a","schema_version":"1.0","event_id":"sha256:5f2f41f929bc02c063eb7a34b079ea68dfc4d75cb20d0bca804d830491a1300a"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:BD2ZKOM6RSFRGJK3LVCTVYEPSS","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"ExploitBench: A Capability Ladder Benchmark for LLM Cybersecurity Agents","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"LLM agents crash V8 targets but rarely reach arbitrary code execution","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"David Brumley, Seunghyun Lee","submitted_at":"2026-05-13T22:08:05Z","abstract_excerpt":"Exploitation is not a binary event. It is a ladder of acquiring progressive capabilities, from executing a single buggy line of code to taking full control of the target. However, existing LLM security benchmarks treat a crash as exploitation success. That single binary outcome collapses the hard parts of exploitation: the transition from triggering a bug to constructing reusable primitives and control.\n  We present ExploitBench, a capability-graded benchmark that decomposes exploitation into 16 measurable flags, from coverage and crash through sandbox primitives, arbitrary read/write, control"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"exploit construction against hardened targets is an emerging frontier capability.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 16 deterministic oracles correctly measure each capability level without false positives or negatives introduced by the randomized challenge-response protocol or differential execution checks.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"ExploitBench decomposes LLM exploitation into 16 oracle-verified capability flags and finds public frontier models trigger crashes but rarely reach arbitrary code execution on 41 V8 bugs.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"LLM agents crash V8 targets but rarely reach arbitrary code execution","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"68d7c16a6070922d7b2937b86b9608e44cd026458dbf4a0da277f96f2e2a159f"},"source":{"id":"2605.14153","kind":"arxiv","version":1},"verdict":{"id":"daf1345e-97d1-4074-9144-833eebe3adbb","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T04:57:42.092034Z","strongest_claim":"exploit construction against hardened targets is an emerging frontier capability.","one_line_summary":"ExploitBench decomposes LLM exploitation into 16 oracle-verified capability flags and finds public frontier models trigger crashes but rarely reach arbitrary code execution on 41 V8 bugs.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 16 deterministic oracles correctly measure each capability level without false positives or negatives introduced by the randomized challenge-response protocol or differential execution checks.","pith_extraction_headline":"LLM agents crash V8 targets but rarely reach arbitrary code execution"},"references":{"count":13,"sample":[{"doi":"","year":2024,"title":"Ctibench: A benchmark for evaluating llms in cyber threat intelligence","work_id":"20a296a2-b807-436d-9128-4943f1815728","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"v8CTF: an exploit VRP for the V8 JavaScript engine","work_id":"735d1ce1-9ddd-4e0d-8749-3fee90a3df55","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","ref_index":3,"cited_arxiv_id":"2310.06770","is_internal_anchor":true},{"doi":"","year":2025,"title":"SEC-bench: Automated Benchmarking of LLM Agents on Real-World Software Security Tasks","work_id":"f6c4d968-192f-43f1-8326-339d537e4b8a","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Arvo: Atlas of reproducible vulnerabilities for open source software","work_id":"f8a8872e-817a-430a-9b50-2fba3c8af21a","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":13,"snapshot_sha256":"eaf04e0fbd56b9d746661296ac318c09a82345342b55d1e761b56bb07dbeaf1e","internal_anchors":1},"formal_canon":{"evidence_count":2,"snapshot_sha256":"4472ef6618a4b9e5525392eff2894cf4b5bb4cb8ae762c771a6177c505cc6f36"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"daf1345e-97d1-4074-9144-833eebe3adbb"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cmTSf0ATbA6hD5MKN1NWvjqapS/QG9WNQKfiV4zS8xIIdmyYlfb51Vz2SiCfog5N+ZJaQpKX/pMAaJi+5Q6hAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-29T17:46:53.508102Z"},"content_sha256":"f812e4341e74424d65b86b985571a9efcee548bf98d834c89628c8d9d0540fba","schema_version":"1.0","event_id":"sha256:f812e4341e74424d65b86b985571a9efcee548bf98d834c89628c8d9d0540fba"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/BD2ZKOM6RSFRGJK3LVCTVYEPSS/bundle.json","state_url":"https://pith.science/pith/BD2ZKOM6RSFRGJK3LVCTVYEPSS/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/BD2ZKOM6RSFRGJK3LVCTVYEPSS/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-29T17:46:53Z","links":{"resolver":"https://pith.science/pith/BD2ZKOM6RSFRGJK3LVCTVYEPSS","bundle":"https://pith.science/pith/BD2ZKOM6RSFRGJK3LVCTVYEPSS/bundle.json","state":"https://pith.science/pith/BD2ZKOM6RSFRGJK3LVCTVYEPSS/state.json","well_known_bundle":"https://pith.science/.well-known/pith/BD2ZKOM6RSFRGJK3LVCTVYEPSS/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:BD2ZKOM6RSFRGJK3LVCTVYEPSS","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"28969730aba1c3e108ffcd139570ec2650f40e1aa9ce8c71c4ec184ad10d29bb","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2026-05-13T22:08:05Z","title_canon_sha256":"79885c40726a85236f8ec755dc326bf5900da719bac236f53264610589f12284"},"schema_version":"1.0","source":{"id":"2605.14153","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14153","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14153v1","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14153","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"pith_short_12","alias_value":"BD2ZKOM6RSFR","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BD2ZKOM6RSFRGJK3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BD2ZKOM6","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:f812e4341e74424d65b86b985571a9efcee548bf98d834c89628c8d9d0540fba","target":"graph","created_at":"2026-05-17T23:39:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"exploit construction against hardened targets is an emerging frontier capability."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The 16 deterministic oracles correctly measure each capability level without false positives or negatives introduced by the randomized challenge-response protocol or differential execution checks."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ExploitBench decomposes LLM exploitation into 16 oracle-verified capability flags and finds public frontier models trigger crashes but rarely reach arbitrary code execution on 41 V8 bugs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"LLM agents crash V8 targets but rarely reach arbitrary code execution"}],"snapshot_sha256":"68d7c16a6070922d7b2937b86b9608e44cd026458dbf4a0da277f96f2e2a159f"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"4472ef6618a4b9e5525392eff2894cf4b5bb4cb8ae762c771a6177c505cc6f36"},"paper":{"abstract_excerpt":"Exploitation is not a binary event. It is a ladder of acquiring progressive capabilities, from executing a single buggy line of code to taking full control of the target. However, existing LLM security benchmarks treat a crash as exploitation success. That single binary outcome collapses the hard parts of exploitation: the transition from triggering a bug to constructing reusable primitives and control.\n  We present ExploitBench, a capability-graded benchmark that decomposes exploitation into 16 measurable flags, from coverage and crash through sandbox primitives, arbitrary read/write, control","authors_text":"David Brumley, Seunghyun Lee","cross_cats":["cs.AI"],"headline":"LLM agents crash V8 targets but rarely reach arbitrary code execution","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2026-05-13T22:08:05Z","title":"ExploitBench: A Capability Ladder Benchmark for LLM Cybersecurity Agents"},"references":{"count":13,"internal_anchors":1,"resolved_work":13,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Ctibench: A benchmark for evaluating llms in cyber threat intelligence","work_id":"20a296a2-b807-436d-9128-4943f1815728","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"v8CTF: an exploit VRP for the V8 JavaScript engine","work_id":"735d1ce1-9ddd-4e0d-8749-3fee90a3df55","year":2023},{"cited_arxiv_id":"2310.06770","doi":"","is_internal_anchor":true,"ref_index":3,"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"SEC-bench: Automated Benchmarking of LLM Agents on Real-World Software Security Tasks","work_id":"f6c4d968-192f-43f1-8326-339d537e4b8a","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Arvo: Atlas of reproducible vulnerabilities for open source software","work_id":"f8a8872e-817a-430a-9b50-2fba3c8af21a","year":2024}],"snapshot_sha256":"eaf04e0fbd56b9d746661296ac318c09a82345342b55d1e761b56bb07dbeaf1e"},"source":{"id":"2605.14153","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T04:57:42.092034Z","id":"daf1345e-97d1-4074-9144-833eebe3adbb","model_set":{"reader":"grok-4.3"},"one_line_summary":"ExploitBench decomposes LLM exploitation into 16 oracle-verified capability flags and finds public frontier models trigger crashes but rarely reach arbitrary code execution on 41 V8 bugs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"LLM agents crash V8 targets but rarely reach arbitrary code execution","strongest_claim":"exploit construction against hardened targets is an emerging frontier capability.","weakest_assumption":"The 16 deterministic oracles correctly measure each capability level without false positives or negatives introduced by the randomized challenge-response protocol or differential execution checks."}},"verdict_id":"daf1345e-97d1-4074-9144-833eebe3adbb"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:5f2f41f929bc02c063eb7a34b079ea68dfc4d75cb20d0bca804d830491a1300a","target":"record","created_at":"2026-05-17T23:39:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"28969730aba1c3e108ffcd139570ec2650f40e1aa9ce8c71c4ec184ad10d29bb","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2026-05-13T22:08:05Z","title_canon_sha256":"79885c40726a85236f8ec755dc326bf5900da719bac236f53264610589f12284"},"schema_version":"1.0","source":{"id":"2605.14153","kind":"arxiv","version":1}},"canonical_sha256":"08f595399e8c8b13255b5d453ae08f949fdba2de92723a4bd070ae4f6e8da031","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"08f595399e8c8b13255b5d453ae08f949fdba2de92723a4bd070ae4f6e8da031","first_computed_at":"2026-05-17T23:39:11.562895Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:11.562895Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"bOAHvanQ4AJv6HAZP0NKGX0fRCRZXhz6vfL7w0aGvUWfYI1nKhkd5Ys90nXCG1UD3f39cJHaFtFMt3SYJfB3Bw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:11.563408Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14153","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:5f2f41f929bc02c063eb7a34b079ea68dfc4d75cb20d0bca804d830491a1300a","sha256:f812e4341e74424d65b86b985571a9efcee548bf98d834c89628c8d9d0540fba"],"state_sha256":"a0215ca216644aef948484c109c10b64543c2b66a069abebe170452980636a7f"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RD/JuB6DdLXPcI40CSL1f0cfW2bxdb6wE91vEHheNC9O0avk4Qd8waPceTc25W/7RQAqxu6BV+wWUM/ff4OYAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-29T17:46:53.510553Z","bundle_sha256":"de4a0b186f80353c818e9ae96cc97445a15ae9f100b1f12819e0661f3f360817"}}