{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:PQSCZURRQSYBR5I56YTFM325O6","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"705445e4a0882b24468b88e0d56f75d406be1010542225a2e711fbe7e30a8ec4","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2024-03-28T02:44:02Z","title_canon_sha256":"567337dfe199e91f48ff0e9b7da157d811d7b1d3e7f9d6d2aef3b5f19080f0e0"},"schema_version":"1.0","source":{"id":"2404.01318","kind":"arxiv","version":5}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.01318","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2404.01318v5","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.01318","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"PQSCZURRQSYB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"PQSCZURRQSYBR5I5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"PQSCZURR","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c1a951cdf41b97ced13dbee11bc073dbb2a5800f9f3b46df4f72ac21a826287e","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"To address these challenges, we introduce JailbreakBench, an open-sourced benchmark with the following components: (1) an evolving repository of state-of-the-art adversarial prompts, which we refer to as jailbreak artifacts; (2) a jailbreaking dataset comprising 100 behaviors; (3) a standardized evaluation framework; and (4) a leaderboard."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the selected 100 behaviors, threat model, system prompts, and scoring functions sufficiently capture real-world jailbreaking risks and success without introducing systematic bias in evaluation."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"JailbreakBench supplies an evolving set of jailbreak prompts, a 100-behavior dataset aligned with usage policies, a standardized evaluation framework, and a leaderboard to enable comparable assessments of attacks and defenses on LLMs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"JailbreakBench supplies an open repository of adversarial prompts, a 100-behavior dataset, a fixed evaluation framework, and a public leaderboard to make jailbreak comparisons reproducible across models."}],"snapshot_sha256":"f95ec46d9b24077809065c67c58de33ff5086f87d2977b6a87a9ca3241da3595"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f81ed52f054c6750d39a3678c52491d084c9abe2a74f2bf2953db603e09da597"},"paper":{"abstract_excerpt":"Jailbreak attacks cause large language models (LLMs) to generate harmful, unethical, or otherwise objectionable content. Evaluating these attacks presents a number of challenges, which the current collection of benchmarks and evaluation techniques do not adequately address. First, there is no clear standard of practice regarding jailbreaking evaluation. Second, existing works compute costs and success rates in incomparable ways. And third, numerous works are not reproducible, as they withhold adversarial prompts, involve closed-source code, or rely on evolving proprietary APIs. To address thes","authors_text":"Alexander Robey, Edgar Dobriban, Edoardo Debenedetti, Eric Wong, Florian Tramer, Francesco Croce, George J. Pappas, Hamed Hassani, Maksym Andriushchenko, Nicolas Flammarion, Patrick Chao, Vikash Sehwag","cross_cats":["cs.LG"],"headline":"JailbreakBench supplies an open repository of adversarial prompts, a 100-behavior dataset, a fixed evaluation framework, and a public leaderboard to make jailbreak comparisons reproducible across models.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2024-03-28T02:44:02Z","title":"JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models"},"references":{"count":64,"internal_anchors":19,"resolved_work":64,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Are you still on track!? catching llm task drift with activations","work_id":"f913aa64-dddb-4ce5-9f2c-5e314589aa1a","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Llama 3 model card","work_id":"f46e0736-9f8c-49ee-8a86-552ab9905bf6","year":2024},{"cited_arxiv_id":"","doi":"10.1145/3650203.3663326","is_internal_anchor":false,"ref_index":3,"title":"Croissant: A Metadata Format for ML-Ready Datasets","work_id":"b13e2013-4762-4e9a-97b5-74aa550ddbde","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Jailbreak chat","work_id":"5d243cb8-eac6-42fe-9c14-a649fb943b5e","year":2023},{"cited_arxiv_id":"2308.14132","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Detecting Language Model Attacks with Perplexity","work_id":"8fac4469-dd8b-4784-9ff6-13d2e74e57fb","year":2023}],"snapshot_sha256":"62a36ba2a5522e98301885f59ce4d838a44ce5e875388b9d08d1c828ce9c1efb"},"source":{"id":"2404.01318","kind":"arxiv","version":5},"verdict":{"created_at":"2026-05-15T06:04:40.711130Z","id":"ad8cda48-d057-48a1-8810-a2345dd287a0","model_set":{"reader":"grok-4.3"},"one_line_summary":"JailbreakBench supplies an evolving set of jailbreak prompts, a 100-behavior dataset aligned with usage policies, a standardized evaluation framework, and a leaderboard to enable comparable assessments of attacks and defenses on LLMs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"JailbreakBench supplies an open repository of adversarial prompts, a 100-behavior dataset, a fixed evaluation framework, and a public leaderboard to make jailbreak comparisons reproducible across models.","strongest_claim":"To address these challenges, we introduce JailbreakBench, an open-sourced benchmark with the following components: (1) an evolving repository of state-of-the-art adversarial prompts, which we refer to as jailbreak artifacts; (2) a jailbreaking dataset comprising 100 behaviors; (3) a standardized evaluation framework; and (4) a leaderboard.","weakest_assumption":"That the selected 100 behaviors, threat model, system prompts, and scoring functions sufficiently capture real-world jailbreaking risks and success without introducing systematic bias in evaluation."}},"verdict_id":"ad8cda48-d057-48a1-8810-a2345dd287a0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:194cc24e26872e79f1a61bc3c1a35eaabd1d1222d0662886f80301c012b9d6fa","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"705445e4a0882b24468b88e0d56f75d406be1010542225a2e711fbe7e30a8ec4","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2024-03-28T02:44:02Z","title_canon_sha256":"567337dfe199e91f48ff0e9b7da157d811d7b1d3e7f9d6d2aef3b5f19080f0e0"},"schema_version":"1.0","source":{"id":"2404.01318","kind":"arxiv","version":5}},"canonical_sha256":"7c242cd23184b018f51df626566f5d77a643f2c1653587b310e29909b38bfe48","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"7c242cd23184b018f51df626566f5d77a643f2c1653587b310e29909b38bfe48","first_computed_at":"2026-05-17T23:38:53.302991Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.302991Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wqeugG9DiNN331Et7SFx5dMG2s0j87DmDtYCFVKxg/309m/dCGmKCQBNdVHmyqVJIs4tFxgtGj5Du0/25GysCw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.303636Z","signed_message":"canonical_sha256_bytes"},"source_id":"2404.01318","source_kind":"arxiv","source_version":5}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:194cc24e26872e79f1a61bc3c1a35eaabd1d1222d0662886f80301c012b9d6fa","sha256:c1a951cdf41b97ced13dbee11bc073dbb2a5800f9f3b46df4f72ac21a826287e"],"state_sha256":"e9f2bc35ec1f0a5ea3de3558c9ce5f6342328438506e871391bdbe031bbde180"}