{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:YOY6QLV6QQC7SP2TV3OYWEU3HF","short_pith_number":"pith:YOY6QLV6","canonical_record":{"source":{"id":"2407.04295","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2024-07-05T06:57:30Z","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"title_canon_sha256":"f74598f44eea1fd513b7b56c50447230bcf80aef8f4a84aca6c92583188144f7","abstract_canon_sha256":"badbb006037acd708228bc413519371479731de0270eb8b97df398aeee46af08"},"schema_version":"1.0"},"canonical_sha256":"c3b1e82ebe8405f93f53aedd8b129b39727bf5386f751e950e9ed0805c9ddf38","source":{"kind":"arxiv","id":"2407.04295","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2407.04295","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2407.04295v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.04295","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"YOY6QLV6QQC7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"YOY6QLV6QQC7SP2T","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"YOY6QLV6","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:YOY6QLV6QQC7SP2TV3OYWEU3HF","target":"record","payload":{"canonical_record":{"source":{"id":"2407.04295","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2024-07-05T06:57:30Z","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"title_canon_sha256":"f74598f44eea1fd513b7b56c50447230bcf80aef8f4a84aca6c92583188144f7","abstract_canon_sha256":"badbb006037acd708228bc413519371479731de0270eb8b97df398aeee46af08"},"schema_version":"1.0"},"canonical_sha256":"c3b1e82ebe8405f93f53aedd8b129b39727bf5386f751e950e9ed0805c9ddf38","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.831657Z","signature_b64":"UvmjJpecgSHWeO7n7Ks6zR6i8aGGYMrf8q7wOz1gqw8WmfGnHHbZWHxh/8BDKHMhJf4Sw+3HUx7iRuxvQIW7DQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c3b1e82ebe8405f93f53aedd8b129b39727bf5386f751e950e9ed0805c9ddf38","last_reissued_at":"2026-05-17T23:38:53.831024Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.831024Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2407.04295","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Bij79jaXsxYheBTdasTt/HNRJT9m/l+dJvl1n1SUTLtdfFzzvW1e2eZ8HPMLmcZEC8AZ9RhlSoJumCqJlwLuAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T14:24:39.442115Z"},"content_sha256":"7474a3fd0acbfa657274a08dbd7fdfcfbea091f952589cf85c6a46724a6b2a00","schema_version":"1.0","event_id":"sha256:7474a3fd0acbfa657274a08dbd7fdfcfbea091f952589cf85c6a46724a6b2a00"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:YOY6QLV6QQC7SP2TV3OYWEU3HF","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A survey that creates taxonomies for jailbreak attacks and defenses on LLMs, subdivides them into sub-classes, and compares evaluation approaches.","cross_cats":["cs.AI","cs.CL","cs.LG"],"primary_cat":"cs.CR","authors_text":"Jiaxing Song, Ke Xu, Qi Li, Sibo Yi, Tianshuo Cong, Xinlei He, Yule Liu, Zhen Sun","submitted_at":"2024-07-05T06:57:30Z","abstract_excerpt":"Large Language Models (LLMs) have performed exceptionally in various text-generative tasks, including question answering, translation, code completion, etc. However, the over-assistance of LLMs has raised the challenge of \"jailbreaking\", which induces the model to generate malicious responses against the usage policy and society by designing adversarial prompts. With the emergence of jailbreak attack methods exploiting different vulnerabilities in LLMs, the corresponding safety alignment measures are also evolving. In this paper, we propose a comprehensive and detailed taxonomy of jailbreak at"},"claims":{"count":3,"items":[{"kind":"strongest_claim","text":"we propose a comprehensive and detailed taxonomy of jailbreak attack and defense methods... and present a coherent diagram illustrating their relationships. We also conduct an investigation into the current evaluation methods and compare them from different perspectives.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the proposed taxonomy and sub-classifications accurately and comprehensively capture the current landscape of attacks and defenses without significant omissions or overlaps that would require revision.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A survey that creates taxonomies for jailbreak attacks and defenses on LLMs, subdivides them into sub-classes, and compares evaluation approaches.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"}],"snapshot_sha256":"d34a06f8c1f6f12add72bce2de4f60f731c7e90a3812ec18eb39373de0d0c09b"},"source":{"id":"2407.04295","kind":"arxiv","version":2},"verdict":{"id":"787dc28f-28ef-4bb1-b5c8-5291cea3dd14","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:17:07.003351Z","strongest_claim":"we propose a comprehensive and detailed taxonomy of jailbreak attack and defense methods... and present a coherent diagram illustrating their relationships. We also conduct an investigation into the current evaluation methods and compare them from different perspectives.","one_line_summary":"A survey that creates taxonomies for jailbreak attacks and defenses on LLMs, subdivides them into sub-classes, and compares evaluation approaches.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the proposed taxonomy and sub-classifications accurately and comprehensively capture the current landscape of attacks and defenses without significant omissions or overlaps that would require revision.","pith_extraction_headline":""},"references":{"count":126,"sample":[{"doi":"","year":2023,"title":"Detecting Language Model Attacks with Perplexity","work_id":"8fac4469-dd8b-4784-9ff6-13d2e74e57fb","ref_index":1,"cited_arxiv_id":"2308.14132","is_internal_anchor":true},{"doi":"","year":2024,"title":"Jailbreaking leading safety-aligned LLMs with simple adaptive attacks","work_id":"81e706c8-459e-40a1-a79d-bda6a104cd22","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","ref_index":3,"cited_arxiv_id":"2312.11805","is_internal_anchor":true},{"doi":"","year":null,"title":"Introducing claude","work_id":"9512a48a-3097-4413-8593-c26c1be54540","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Many-shot jailbreaking","work_id":"d6a9bf6b-c588-4df6-8b27-21aec1333bbc","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":126,"snapshot_sha256":"12020fa2ad3a56a39f281e42527ab5b98de164fa4fdc6f64c3e3f2e210e0e4c4","internal_anchors":21},"formal_canon":{"evidence_count":1,"snapshot_sha256":"77c1c15d3860e4871f809cdc45806a6487bba0396d7b97b30bbe55e2f8b10672"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"787dc28f-28ef-4bb1-b5c8-5291cea3dd14"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"24O/e24w9VNf00cee/oLnOsWF+YCSVQqCduuSr0/pkp5Js/zDOAJk+E6SW0J+xcEuVMimihgBcELI1jqJhQzDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T14:24:39.443034Z"},"content_sha256":"b66f3ce1ef406f3eba5979c3a9024d2d625a448c1d70888b17779b83309776d8","schema_version":"1.0","event_id":"sha256:b66f3ce1ef406f3eba5979c3a9024d2d625a448c1d70888b17779b83309776d8"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/YOY6QLV6QQC7SP2TV3OYWEU3HF/bundle.json","state_url":"https://pith.science/pith/YOY6QLV6QQC7SP2TV3OYWEU3HF/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/YOY6QLV6QQC7SP2TV3OYWEU3HF/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T14:24:39Z","links":{"resolver":"https://pith.science/pith/YOY6QLV6QQC7SP2TV3OYWEU3HF","bundle":"https://pith.science/pith/YOY6QLV6QQC7SP2TV3OYWEU3HF/bundle.json","state":"https://pith.science/pith/YOY6QLV6QQC7SP2TV3OYWEU3HF/state.json","well_known_bundle":"https://pith.science/.well-known/pith/YOY6QLV6QQC7SP2TV3OYWEU3HF/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:YOY6QLV6QQC7SP2TV3OYWEU3HF","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"badbb006037acd708228bc413519371479731de0270eb8b97df398aeee46af08","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2024-07-05T06:57:30Z","title_canon_sha256":"f74598f44eea1fd513b7b56c50447230bcf80aef8f4a84aca6c92583188144f7"},"schema_version":"1.0","source":{"id":"2407.04295","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2407.04295","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2407.04295v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.04295","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"YOY6QLV6QQC7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"YOY6QLV6QQC7SP2T","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"YOY6QLV6","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:b66f3ce1ef406f3eba5979c3a9024d2d625a448c1d70888b17779b83309776d8","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":3,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"we propose a comprehensive and detailed taxonomy of jailbreak attack and defense methods... and present a coherent diagram illustrating their relationships. We also conduct an investigation into the current evaluation methods and compare them from different perspectives."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the proposed taxonomy and sub-classifications accurately and comprehensively capture the current landscape of attacks and defenses without significant omissions or overlaps that would require revision."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A survey that creates taxonomies for jailbreak attacks and defenses on LLMs, subdivides them into sub-classes, and compares evaluation approaches."}],"snapshot_sha256":"d34a06f8c1f6f12add72bce2de4f60f731c7e90a3812ec18eb39373de0d0c09b"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"77c1c15d3860e4871f809cdc45806a6487bba0396d7b97b30bbe55e2f8b10672"},"paper":{"abstract_excerpt":"Large Language Models (LLMs) have performed exceptionally in various text-generative tasks, including question answering, translation, code completion, etc. However, the over-assistance of LLMs has raised the challenge of \"jailbreaking\", which induces the model to generate malicious responses against the usage policy and society by designing adversarial prompts. With the emergence of jailbreak attack methods exploiting different vulnerabilities in LLMs, the corresponding safety alignment measures are also evolving. In this paper, we propose a comprehensive and detailed taxonomy of jailbreak at","authors_text":"Jiaxing Song, Ke Xu, Qi Li, Sibo Yi, Tianshuo Cong, Xinlei He, Yule Liu, Zhen Sun","cross_cats":["cs.AI","cs.CL","cs.LG"],"headline":"A survey that creates taxonomies for jailbreak attacks and defenses on LLMs, subdivides them into sub-classes, and compares evaluation approaches.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2024-07-05T06:57:30Z","title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey"},"references":{"count":126,"internal_anchors":21,"resolved_work":126,"sample":[{"cited_arxiv_id":"2308.14132","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Detecting Language Model Attacks with Perplexity","work_id":"8fac4469-dd8b-4784-9ff6-13d2e74e57fb","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Jailbreaking leading safety-aligned LLMs with simple adaptive attacks","work_id":"81e706c8-459e-40a1-a79d-bda6a104cd22","year":2024},{"cited_arxiv_id":"2312.11805","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Introducing claude","work_id":"9512a48a-3097-4413-8593-c26c1be54540","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Many-shot jailbreaking","work_id":"d6a9bf6b-c588-4df6-8b27-21aec1333bbc","year":2024}],"snapshot_sha256":"12020fa2ad3a56a39f281e42527ab5b98de164fa4fdc6f64c3e3f2e210e0e4c4"},"source":{"id":"2407.04295","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T02:17:07.003351Z","id":"787dc28f-28ef-4bb1-b5c8-5291cea3dd14","model_set":{"reader":"grok-4.3"},"one_line_summary":"A survey that creates taxonomies for jailbreak attacks and defenses on LLMs, subdivides them into sub-classes, and compares evaluation approaches.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"","strongest_claim":"we propose a comprehensive and detailed taxonomy of jailbreak attack and defense methods... and present a coherent diagram illustrating their relationships. We also conduct an investigation into the current evaluation methods and compare them from different perspectives.","weakest_assumption":"That the proposed taxonomy and sub-classifications accurately and comprehensively capture the current landscape of attacks and defenses without significant omissions or overlaps that would require revision."}},"verdict_id":"787dc28f-28ef-4bb1-b5c8-5291cea3dd14"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:7474a3fd0acbfa657274a08dbd7fdfcfbea091f952589cf85c6a46724a6b2a00","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"badbb006037acd708228bc413519371479731de0270eb8b97df398aeee46af08","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2024-07-05T06:57:30Z","title_canon_sha256":"f74598f44eea1fd513b7b56c50447230bcf80aef8f4a84aca6c92583188144f7"},"schema_version":"1.0","source":{"id":"2407.04295","kind":"arxiv","version":2}},"canonical_sha256":"c3b1e82ebe8405f93f53aedd8b129b39727bf5386f751e950e9ed0805c9ddf38","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c3b1e82ebe8405f93f53aedd8b129b39727bf5386f751e950e9ed0805c9ddf38","first_computed_at":"2026-05-17T23:38:53.831024Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.831024Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"UvmjJpecgSHWeO7n7Ks6zR6i8aGGYMrf8q7wOz1gqw8WmfGnHHbZWHxh/8BDKHMhJf4Sw+3HUx7iRuxvQIW7DQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.831657Z","signed_message":"canonical_sha256_bytes"},"source_id":"2407.04295","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:7474a3fd0acbfa657274a08dbd7fdfcfbea091f952589cf85c6a46724a6b2a00","sha256:b66f3ce1ef406f3eba5979c3a9024d2d625a448c1d70888b17779b83309776d8"],"state_sha256":"ad6a4265e8021a035c4c0bc7362446ec7e0f5a1c31c1d280b3197c212cf735a8"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vUt6zIZPR73M2p/DZanGjiOYuFZ9AuZNj9CGCvcMEITIX54XbCqE7xBd/B5Dgi+SyZY+sSpQGJuPomFHj7rxBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T14:24:39.446677Z","bundle_sha256":"f9d0b6306c5c33b64cdecdf40d7d119fe236fe726ba402f519bcfc529afe61f6"}}