{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:JZFZLXG2TXCSLITVX6ODM37JTK","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"93bcf11c24b0507a9b7051e96f052555f8b412493cd04b4a50aa960416c6f9ce","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-06-26T14:51:16Z","title_canon_sha256":"03341ba9a35e8969cfb512f360ecd35e8f8856350d9608c2d681521706c97720"},"schema_version":"1.0","source":{"id":"2606.28153","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.28153","created_at":"2026-06-29T01:15:07Z"},{"alias_kind":"arxiv_version","alias_value":"2606.28153v1","created_at":"2026-06-29T01:15:07Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.28153","created_at":"2026-06-29T01:15:07Z"},{"alias_kind":"pith_short_12","alias_value":"JZFZLXG2TXCS","created_at":"2026-06-29T01:15:07Z"},{"alias_kind":"pith_short_16","alias_value":"JZFZLXG2TXCSLITV","created_at":"2026-06-29T01:15:07Z"},{"alias_kind":"pith_short_8","alias_value":"JZFZLXG2","created_at":"2026-06-29T01:15:07Z"}],"graph_snapshots":[{"event_id":"sha256:a0d701b2d0f53cbb89046dd61c3526b4822ddb2bdeeaa0d98b91fe73bf7b6e4f","target":"graph","created_at":"2026-06-29T01:15:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.28153/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Jailbreak attacks bypass LLM safety alignment, yet their mechanisms remain poorly understood. We provide evidence that attacks do not comprehensively eliminate safety features, but instead selectively suppress specific attention heads. We identify two functionally differentiated types: Adversarially Compromised Heads (ACHs) concentrated in early layers, which are suppressed under attacks, and Safety-Aligned Heads (SAHs) in mid-layers, which maintain robust activations even when attacks succeed. Ablation studies support the causal role of ACHs and the contribution of SAHs to robust activations:","authors_text":"Dongqi Han, Linghui Li, Yanchen Yin","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-06-26T14:51:16Z","title":"Robust Harmful Features Under Jailbreak Attacks: Mechanistic Evidence from Attention Head Specialization in Large Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.28153","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f2184af0f37d642f8b991e89ad702194c02c75e2d086e371bfab00c9cb2a5f9e","target":"record","created_at":"2026-06-29T01:15:07Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"93bcf11c24b0507a9b7051e96f052555f8b412493cd04b4a50aa960416c6f9ce","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-06-26T14:51:16Z","title_canon_sha256":"03341ba9a35e8969cfb512f360ecd35e8f8856350d9608c2d681521706c97720"},"schema_version":"1.0","source":{"id":"2606.28153","kind":"arxiv","version":1}},"canonical_sha256":"4e4b95dcda9dc525a275bf9c366fe99a9061477c38868c0d0484de73a76f6c5d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4e4b95dcda9dc525a275bf9c366fe99a9061477c38868c0d0484de73a76f6c5d","first_computed_at":"2026-06-29T01:15:07.586783Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-29T01:15:07.586783Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"HSwS3XS8ujNI15LS7FpsZnX4w5Y2gL7YrHX6AlWKdIfN7G+VB+H+YDm5Mud3SZ7J3D0IcNtaC/zaIClgo5suBA==","signature_status":"signed_v1","signed_at":"2026-06-29T01:15:07.587191Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.28153","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f2184af0f37d642f8b991e89ad702194c02c75e2d086e371bfab00c9cb2a5f9e","sha256:a0d701b2d0f53cbb89046dd61c3526b4822ddb2bdeeaa0d98b91fe73bf7b6e4f"],"state_sha256":"1d403fee8fc05cb1c06ba693a692b05704c98ec53d4818b30f1158cbf9773bfb"}