{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:Z3LQJZNNZFRRINPYI4DDMM553W","short_pith_number":"pith:Z3LQJZNN","schema_version":"1.0","canonical_sha256":"ced704e5adc9631435f847063633bdddbc74fd1ff5e2aeffb38640ca2b1ded06","source":{"kind":"arxiv","id":"2605.17971","version":1},"attestation_state":"computed","paper":{"title":"Babel: Jailbreaking Safety Attention via Obfuscation Distribution Optimized Sampling","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Cong Wu, Jing Chen, Ju Jia, Ruichao Liang, Ruiying Du, Yang Liu, Yebo Feng, Zhi Wang, Ziwei Wang","submitted_at":"2026-05-18T07:27:59Z","abstract_excerpt":"Despite rigorous safety alignment, Large Language Models (LLMs) remain vulnerable to jailbreak attacks. Existing black-box methods often rely on heuristic templates or exhaustive trials, lacking mechanistic interpretability and query efficiency. In this study, we investigate an intrinsic vulnerability in the safety mechanisms of LLMs, where safety alignment relies on a small set of sparsely distributed attention heads, leaving much of the representational space weakly monitored. We formalize this phenomenon with a mathematical jailbreaking model that characterizes the delicate boundary of effe"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.17971","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-05-18T07:27:59Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"72849228458a730be09f1350f0a280951a59f1aaa1ffdd287ce1355b1a18d581","abstract_canon_sha256":"26cccc19e5fc334a0027ca29deacecf82c166083fd00b8ef99d6756f9d49bd7a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:05:08.807166Z","signature_b64":"ce//l4cXK8EDaOrbqYxaGIFW9JX/T9LyVq9KH1PuaHVxbsuV+k9a7XBfCtcUb3UGTfV01KB0mrL21xDpQCR1Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ced704e5adc9631435f847063633bdddbc74fd1ff5e2aeffb38640ca2b1ded06","last_reissued_at":"2026-05-20T00:05:08.806265Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:05:08.806265Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Babel: Jailbreaking Safety Attention via Obfuscation Distribution Optimized Sampling","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Cong Wu, Jing Chen, Ju Jia, Ruichao Liang, Ruiying Du, Yang Liu, Yebo Feng, Zhi Wang, Ziwei Wang","submitted_at":"2026-05-18T07:27:59Z","abstract_excerpt":"Despite rigorous safety alignment, Large Language Models (LLMs) remain vulnerable to jailbreak attacks. Existing black-box methods often rely on heuristic templates or exhaustive trials, lacking mechanistic interpretability and query efficiency. In this study, we investigate an intrinsic vulnerability in the safety mechanisms of LLMs, where safety alignment relies on a small set of sparsely distributed attention heads, leaving much of the representational space weakly monitored. We formalize this phenomenon with a mathematical jailbreaking model that characterizes the delicate boundary of effe"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.17971","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.17971/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-19T23:33:35.575151Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"98b4aa48d5f7373c049878c6c48a8e011306ead6dbe4845429fe688b4d482d6e"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.17971","created_at":"2026-05-20T00:05:08.806433+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.17971v1","created_at":"2026-05-20T00:05:08.806433+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.17971","created_at":"2026-05-20T00:05:08.806433+00:00"},{"alias_kind":"pith_short_12","alias_value":"Z3LQJZNNZFRR","created_at":"2026-05-20T00:05:08.806433+00:00"},{"alias_kind":"pith_short_16","alias_value":"Z3LQJZNNZFRRINPY","created_at":"2026-05-20T00:05:08.806433+00:00"},{"alias_kind":"pith_short_8","alias_value":"Z3LQJZNN","created_at":"2026-05-20T00:05:08.806433+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W","json":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W.json","graph_json":"https://pith.science/api/pith-number/Z3LQJZNNZFRRINPYI4DDMM553W/graph.json","events_json":"https://pith.science/api/pith-number/Z3LQJZNNZFRRINPYI4DDMM553W/events.json","paper":"https://pith.science/paper/Z3LQJZNN"},"agent_actions":{"view_html":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W","download_json":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W.json","view_paper":"https://pith.science/paper/Z3LQJZNN","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.17971&json=true","fetch_graph":"https://pith.science/api/pith-number/Z3LQJZNNZFRRINPYI4DDMM553W/graph.json","fetch_events":"https://pith.science/api/pith-number/Z3LQJZNNZFRRINPYI4DDMM553W/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W/action/storage_attestation","attest_author":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W/action/author_attestation","sign_citation":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W/action/citation_signature","submit_replication":"https://pith.science/pith/Z3LQJZNNZFRRINPYI4DDMM553W/action/replication_record"}},"created_at":"2026-05-20T00:05:08.806433+00:00","updated_at":"2026-05-20T00:05:08.806433+00:00"}