{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:2QNPOEXBT32MN3CU2BQGPK2ODQ","short_pith_number":"pith:2QNPOEXB","canonical_record":{"source":{"id":"2307.02483","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-07-05T17:58:10Z","cross_cats_sorted":["cs.CR"],"title_canon_sha256":"06a4c5cfa0a7071f7adaa162e34a152c9ce3ea66272a4ef67029d66727e731b9","abstract_canon_sha256":"35fece2e2de829232a6fded217ebcceccab27ca999683e6184958681c57c0488"},"schema_version":"1.0"},"canonical_sha256":"d41af712e19ef4c6ec54d06067ab4e1c1eefc4cd022ecef9687e1b5f069cab78","source":{"kind":"arxiv","id":"2307.02483","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2307.02483","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"arxiv_version","alias_value":"2307.02483v1","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2307.02483","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"pith_short_12","alias_value":"2QNPOEXBT32M","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"2QNPOEXBT32MN3CU","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"2QNPOEXB","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:2QNPOEXBT32MN3CU2BQGPK2ODQ","target":"record","payload":{"canonical_record":{"source":{"id":"2307.02483","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-07-05T17:58:10Z","cross_cats_sorted":["cs.CR"],"title_canon_sha256":"06a4c5cfa0a7071f7adaa162e34a152c9ce3ea66272a4ef67029d66727e731b9","abstract_canon_sha256":"35fece2e2de829232a6fded217ebcceccab27ca999683e6184958681c57c0488"},"schema_version":"1.0"},"canonical_sha256":"d41af712e19ef4c6ec54d06067ab4e1c1eefc4cd022ecef9687e1b5f069cab78","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:22.220688Z","signature_b64":"odq+sxi4Vy1PFIQDc1XaiMyZvJySWatn68nZo1J1ZFUZ3VeWb9IbB+4WjFrDBWJ5/c0i72uHZuKzWWIHGz4pAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d41af712e19ef4c6ec54d06067ab4e1c1eefc4cd022ecef9687e1b5f069cab78","last_reissued_at":"2026-05-17T23:39:22.220041Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:22.220041Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2307.02483","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:22Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"qGRoFgIqvphmVXYNbMsJM0hm+rWct3cBlROp7N8m6RXNZUBPDhS7ldzs6MbrYc0+RhQlUZiiny+YJ06wsJDEBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T05:52:16.742112Z"},"content_sha256":"014316bd20cb9abc0072c8229584c32ec4217778f9339b7c2779b0e45f4d4abb","schema_version":"1.0","event_id":"sha256:014316bd20cb9abc0072c8229584c32ec4217778f9339b7c2779b0e45f4d4abb"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:2QNPOEXBT32MN3CU2BQGPK2ODQ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Jailbroken: How Does LLM Safety Training Fail?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"LLM safety training fails because of competing objectives and mismatched generalization, enabling jailbreaks that work on every red-teaming prompt for GPT-4 and Claude.","cross_cats":["cs.CR"],"primary_cat":"cs.LG","authors_text":"Alexander Wei, Jacob Steinhardt, Nika Haghtalab","submitted_at":"2023-07-05T17:58:10Z","abstract_excerpt":"Large language models trained for safety and harmlessness remain susceptible to adversarial misuse, as evidenced by the prevalence of \"jailbreak\" attacks on early releases of ChatGPT that elicit undesired behavior. Going beyond recognition of the issue, we investigate why such attacks succeed and how they can be created. We hypothesize two failure modes of safety training: competing objectives and mismatched generalization. Competing objectives arise when a model's capabilities and safety goals conflict, while mismatched generalization occurs when safety training fails to generalize to a domai"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"new attacks utilizing our failure modes succeed on every prompt in a collection of unsafe requests from the models' red-teaming evaluation sets and outperform existing ad hoc jailbreaks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the hypothesized failure modes are the main reasons jailbreaks work and that success on the specific red-teaming prompt collection indicates broad, persistent vulnerabilities in the evaluated models.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LLM safety training fails due to competing objectives and mismatched generalization, enabling new jailbreaks that succeed on all unsafe prompts from red-teaming sets in GPT-4 and Claude.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"LLM safety training fails because of competing objectives and mismatched generalization, enabling jailbreaks that work on every red-teaming prompt for GPT-4 and Claude.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"773a07467929e9a87b4fe595fa2c5e1134bf6e33612f1895ac1d4eaf9932355c"},"source":{"id":"2307.02483","kind":"arxiv","version":1},"verdict":{"id":"cb2c9fd1-9334-4f9b-b097-d9ab9f60784f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T18:09:24.326551Z","strongest_claim":"new attacks utilizing our failure modes succeed on every prompt in a collection of unsafe requests from the models' red-teaming evaluation sets and outperform existing ad hoc jailbreaks.","one_line_summary":"LLM safety training fails due to competing objectives and mismatched generalization, enabling new jailbreaks that succeed on all unsafe prompts from red-teaming sets in GPT-4 and Claude.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the hypothesized failure modes are the main reasons jailbreaks work and that success on the specific red-teaming prompt collection indicates broad, persistent vulnerabilities in the evaluated models.","pith_extraction_headline":"LLM safety training fails because of competing objectives and mismatched generalization, enabling jailbreaks that work on every red-teaming prompt for GPT-4 and Claude."},"references":{"count":82,"sample":[{"doi":"","year":2023,"title":"Universal LLM jailbreak: ChatGPT, GPT-4, Bard, Bing, Anthropic, and beyond","work_id":"49d316b0-f00b-43e0-aed2-09a4a6a01ffa","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Jailbreak Chat","work_id":"ea956758-558b-46ce-9291-92d61b228a6f","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Jailbreak Chat","work_id":"56ac6e0b-531d-42d0-a091-9179eabcdd8d","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Anthropic API reference","work_id":"3b405a02-f96e-4559-8d30-57cb6ea8a201","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"We are offering a new version of our model, Claude-v1.3, that is safer and less susceptible to adversarial attacks","work_id":"7c25a90d-705b-47cc-86c6-e5953fef4dab","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":82,"snapshot_sha256":"03be77db24a8b058e4d1fd6dcbe8d7bb1b88c46291311f9c91e37187b01a4a7a","internal_anchors":9},"formal_canon":{"evidence_count":2,"snapshot_sha256":"531c4c27824fc5389283d6efa8f4916a6874e21ea5f524503e71ea8112bdbc91"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"cb2c9fd1-9334-4f9b-b097-d9ab9f60784f"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:22Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"/E8OM+CHuCR48c8JoN2xnF/CzSvzMer2EBMrVk3pGpMbeyWvHucXYsprcQfa4cVV/4IsRf3A7eQ95QU4wKoDAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T05:52:16.743037Z"},"content_sha256":"6aa0431acbc383afe02b1d61e994310c85654f0743b4bf925a5dddb5afc66b2e","schema_version":"1.0","event_id":"sha256:6aa0431acbc383afe02b1d61e994310c85654f0743b4bf925a5dddb5afc66b2e"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/2QNPOEXBT32MN3CU2BQGPK2ODQ/bundle.json","state_url":"https://pith.science/pith/2QNPOEXBT32MN3CU2BQGPK2ODQ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/2QNPOEXBT32MN3CU2BQGPK2ODQ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T05:52:16Z","links":{"resolver":"https://pith.science/pith/2QNPOEXBT32MN3CU2BQGPK2ODQ","bundle":"https://pith.science/pith/2QNPOEXBT32MN3CU2BQGPK2ODQ/bundle.json","state":"https://pith.science/pith/2QNPOEXBT32MN3CU2BQGPK2ODQ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/2QNPOEXBT32MN3CU2BQGPK2ODQ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:2QNPOEXBT32MN3CU2BQGPK2ODQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"35fece2e2de829232a6fded217ebcceccab27ca999683e6184958681c57c0488","cross_cats_sorted":["cs.CR"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-07-05T17:58:10Z","title_canon_sha256":"06a4c5cfa0a7071f7adaa162e34a152c9ce3ea66272a4ef67029d66727e731b9"},"schema_version":"1.0","source":{"id":"2307.02483","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2307.02483","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"arxiv_version","alias_value":"2307.02483v1","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2307.02483","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"pith_short_12","alias_value":"2QNPOEXBT32M","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"2QNPOEXBT32MN3CU","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"2QNPOEXB","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:6aa0431acbc383afe02b1d61e994310c85654f0743b4bf925a5dddb5afc66b2e","target":"graph","created_at":"2026-05-17T23:39:22Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"new attacks utilizing our failure modes succeed on every prompt in a collection of unsafe requests from the models' red-teaming evaluation sets and outperform existing ad hoc jailbreaks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the hypothesized failure modes are the main reasons jailbreaks work and that success on the specific red-teaming prompt collection indicates broad, persistent vulnerabilities in the evaluated models."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LLM safety training fails due to competing objectives and mismatched generalization, enabling new jailbreaks that succeed on all unsafe prompts from red-teaming sets in GPT-4 and Claude."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"LLM safety training fails because of competing objectives and mismatched generalization, enabling jailbreaks that work on every red-teaming prompt for GPT-4 and Claude."}],"snapshot_sha256":"773a07467929e9a87b4fe595fa2c5e1134bf6e33612f1895ac1d4eaf9932355c"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"531c4c27824fc5389283d6efa8f4916a6874e21ea5f524503e71ea8112bdbc91"},"paper":{"abstract_excerpt":"Large language models trained for safety and harmlessness remain susceptible to adversarial misuse, as evidenced by the prevalence of \"jailbreak\" attacks on early releases of ChatGPT that elicit undesired behavior. Going beyond recognition of the issue, we investigate why such attacks succeed and how they can be created. We hypothesize two failure modes of safety training: competing objectives and mismatched generalization. Competing objectives arise when a model's capabilities and safety goals conflict, while mismatched generalization occurs when safety training fails to generalize to a domai","authors_text":"Alexander Wei, Jacob Steinhardt, Nika Haghtalab","cross_cats":["cs.CR"],"headline":"LLM safety training fails because of competing objectives and mismatched generalization, enabling jailbreaks that work on every red-teaming prompt for GPT-4 and Claude.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-07-05T17:58:10Z","title":"Jailbroken: How Does LLM Safety Training Fail?"},"references":{"count":82,"internal_anchors":9,"resolved_work":82,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Universal LLM jailbreak: ChatGPT, GPT-4, Bard, Bing, Anthropic, and beyond","work_id":"49d316b0-f00b-43e0-aed2-09a4a6a01ffa","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Jailbreak Chat","work_id":"ea956758-558b-46ce-9291-92d61b228a6f","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Jailbreak Chat","work_id":"56ac6e0b-531d-42d0-a091-9179eabcdd8d","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Anthropic API reference","work_id":"3b405a02-f96e-4559-8d30-57cb6ea8a201","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"We are offering a new version of our model, Claude-v1.3, that is safer and less susceptible to adversarial attacks","work_id":"7c25a90d-705b-47cc-86c6-e5953fef4dab","year":2023}],"snapshot_sha256":"03be77db24a8b058e4d1fd6dcbe8d7bb1b88c46291311f9c91e37187b01a4a7a"},"source":{"id":"2307.02483","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T18:09:24.326551Z","id":"cb2c9fd1-9334-4f9b-b097-d9ab9f60784f","model_set":{"reader":"grok-4.3"},"one_line_summary":"LLM safety training fails due to competing objectives and mismatched generalization, enabling new jailbreaks that succeed on all unsafe prompts from red-teaming sets in GPT-4 and Claude.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"LLM safety training fails because of competing objectives and mismatched generalization, enabling jailbreaks that work on every red-teaming prompt for GPT-4 and Claude.","strongest_claim":"new attacks utilizing our failure modes succeed on every prompt in a collection of unsafe requests from the models' red-teaming evaluation sets and outperform existing ad hoc jailbreaks.","weakest_assumption":"That the hypothesized failure modes are the main reasons jailbreaks work and that success on the specific red-teaming prompt collection indicates broad, persistent vulnerabilities in the evaluated models."}},"verdict_id":"cb2c9fd1-9334-4f9b-b097-d9ab9f60784f"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:014316bd20cb9abc0072c8229584c32ec4217778f9339b7c2779b0e45f4d4abb","target":"record","created_at":"2026-05-17T23:39:22Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"35fece2e2de829232a6fded217ebcceccab27ca999683e6184958681c57c0488","cross_cats_sorted":["cs.CR"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-07-05T17:58:10Z","title_canon_sha256":"06a4c5cfa0a7071f7adaa162e34a152c9ce3ea66272a4ef67029d66727e731b9"},"schema_version":"1.0","source":{"id":"2307.02483","kind":"arxiv","version":1}},"canonical_sha256":"d41af712e19ef4c6ec54d06067ab4e1c1eefc4cd022ecef9687e1b5f069cab78","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d41af712e19ef4c6ec54d06067ab4e1c1eefc4cd022ecef9687e1b5f069cab78","first_computed_at":"2026-05-17T23:39:22.220041Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:22.220041Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"odq+sxi4Vy1PFIQDc1XaiMyZvJySWatn68nZo1J1ZFUZ3VeWb9IbB+4WjFrDBWJ5/c0i72uHZuKzWWIHGz4pAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:22.220688Z","signed_message":"canonical_sha256_bytes"},"source_id":"2307.02483","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:014316bd20cb9abc0072c8229584c32ec4217778f9339b7c2779b0e45f4d4abb","sha256:6aa0431acbc383afe02b1d61e994310c85654f0743b4bf925a5dddb5afc66b2e"],"state_sha256":"7fd0cfe36a016ab3813c125550966c726c56a57dd21f6fc58f2ac27b32aa9536"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VstV223RwgUygWolQNlkPaZujyS+baXVo0dJCishmGdThqwS0l381IYxjFKSTEsTmh7XZMU+ApsXcrIgX0QxCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T05:52:16.747329Z","bundle_sha256":"45f3f4d89e18670cc055e1feefb411e2a79479c75feeaa3cca2da0e7b3304106"}}