{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:5V7E47YJP32E3QBYTHDCOH3MWO","short_pith_number":"pith:5V7E47YJ","canonical_record":{"source":{"id":"2605.28553","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-27T14:44:36Z","cross_cats_sorted":["cs.CR"],"title_canon_sha256":"7ecf64a86809945fb1c6b3c7e9aa46028b728410b785e1aa820fc75e1c66a0d2","abstract_canon_sha256":"059902c7a677407be77c299634a3a44300350a0544c0d47d34b7c0da96e19886"},"schema_version":"1.0"},"canonical_sha256":"ed7e4e7f097ef44dc03899c6271f6cb3992e0ae22bc5212b4ae9ec3316e57b02","source":{"kind":"arxiv","id":"2605.28553","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.28553","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"arxiv_version","alias_value":"2605.28553v1","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.28553","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"pith_short_12","alias_value":"5V7E47YJP32E","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"pith_short_16","alias_value":"5V7E47YJP32E3QBY","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"pith_short_8","alias_value":"5V7E47YJ","created_at":"2026-05-28T02:04:56Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:5V7E47YJP32E3QBYTHDCOH3MWO","target":"record","payload":{"canonical_record":{"source":{"id":"2605.28553","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-27T14:44:36Z","cross_cats_sorted":["cs.CR"],"title_canon_sha256":"7ecf64a86809945fb1c6b3c7e9aa46028b728410b785e1aa820fc75e1c66a0d2","abstract_canon_sha256":"059902c7a677407be77c299634a3a44300350a0544c0d47d34b7c0da96e19886"},"schema_version":"1.0"},"canonical_sha256":"ed7e4e7f097ef44dc03899c6271f6cb3992e0ae22bc5212b4ae9ec3316e57b02","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T02:04:56.203047Z","signature_b64":"hKu80T02mDMpLy2ZPLv2FK223r+F/j1Zq+Vqtzj3M2zU7fV9K4njV5n/k0ckiJPq8iGp/GamHJRNDGOSmhwmDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ed7e4e7f097ef44dc03899c6271f6cb3992e0ae22bc5212b4ae9ec3316e57b02","last_reissued_at":"2026-05-28T02:04:56.202569Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T02:04:56.202569Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.28553","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-28T02:04:56Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nIPp45DyJZxG3IHzCpe/0oPxSaSiqTZb4LlMts90uFPz8OtJ5YUVxXoSvLMPyuyvHDKNxrdTIiU2SfkpW6KOBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T06:20:03.915196Z"},"content_sha256":"1f052699220f8608b7f512643254044bdcf76804a750405416a52400ed805c7c","schema_version":"1.0","event_id":"sha256:1f052699220f8608b7f512643254044bdcf76804a750405416a52400ed805c7c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:5V7E47YJP32E3QBYTHDCOH3MWO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Refusal Before Decoding: Detecting and Exploiting Refusal Signals in Intermediate LLM Activations","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CR"],"primary_cat":"cs.AI","authors_text":"Alberto Giaretta, Denis Kleyko, Matteo Gioele Collu, Matteo Zavatteri, Mauro Conti, Riccardo Conte, Roberto Confalonieri","submitted_at":"2026-05-27T14:44:36Z","abstract_excerpt":"In this paper, we investigate whether refusal behavior can be predicted from LLM intermediate activations before decoding using linear probes trained on residual stream activations at each transformer block. We find that refusal is linearly decodable well before the final layer, indicating that safety-relevant behavior is represented in intermediate activations before output generation. To test whether this signal is actionable, we introduce Mechanistic AutoDAN, a probe-guided variant of AutoDAN that replaces full-model fitness evaluation with partial forward passes and probe-based scoring ins"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.28553","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.28553/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-28T02:04:56Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"5mBWm/NqtyNdoHE3otgCrU6j32a6WrvyrYgeSLoxmTcNdlji83+O2kClrTRlXC5heEM3K0OdYsCV6pEbgQTdDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T06:20:03.915988Z"},"content_sha256":"09cffe9aefbad2ade18e97d77c0dadf9a8c7147d8da1932a2be0bf821ccf3a60","schema_version":"1.0","event_id":"sha256:09cffe9aefbad2ade18e97d77c0dadf9a8c7147d8da1932a2be0bf821ccf3a60"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/5V7E47YJP32E3QBYTHDCOH3MWO/bundle.json","state_url":"https://pith.science/pith/5V7E47YJP32E3QBYTHDCOH3MWO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/5V7E47YJP32E3QBYTHDCOH3MWO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-31T06:20:03Z","links":{"resolver":"https://pith.science/pith/5V7E47YJP32E3QBYTHDCOH3MWO","bundle":"https://pith.science/pith/5V7E47YJP32E3QBYTHDCOH3MWO/bundle.json","state":"https://pith.science/pith/5V7E47YJP32E3QBYTHDCOH3MWO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/5V7E47YJP32E3QBYTHDCOH3MWO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:5V7E47YJP32E3QBYTHDCOH3MWO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"059902c7a677407be77c299634a3a44300350a0544c0d47d34b7c0da96e19886","cross_cats_sorted":["cs.CR"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-27T14:44:36Z","title_canon_sha256":"7ecf64a86809945fb1c6b3c7e9aa46028b728410b785e1aa820fc75e1c66a0d2"},"schema_version":"1.0","source":{"id":"2605.28553","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.28553","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"arxiv_version","alias_value":"2605.28553v1","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.28553","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"pith_short_12","alias_value":"5V7E47YJP32E","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"pith_short_16","alias_value":"5V7E47YJP32E3QBY","created_at":"2026-05-28T02:04:56Z"},{"alias_kind":"pith_short_8","alias_value":"5V7E47YJ","created_at":"2026-05-28T02:04:56Z"}],"graph_snapshots":[{"event_id":"sha256:09cffe9aefbad2ade18e97d77c0dadf9a8c7147d8da1932a2be0bf821ccf3a60","target":"graph","created_at":"2026-05-28T02:04:56Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.28553/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"In this paper, we investigate whether refusal behavior can be predicted from LLM intermediate activations before decoding using linear probes trained on residual stream activations at each transformer block. We find that refusal is linearly decodable well before the final layer, indicating that safety-relevant behavior is represented in intermediate activations before output generation. To test whether this signal is actionable, we introduce Mechanistic AutoDAN, a probe-guided variant of AutoDAN that replaces full-model fitness evaluation with partial forward passes and probe-based scoring ins","authors_text":"Alberto Giaretta, Denis Kleyko, Matteo Gioele Collu, Matteo Zavatteri, Mauro Conti, Riccardo Conte, Roberto Confalonieri","cross_cats":["cs.CR"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-27T14:44:36Z","title":"Refusal Before Decoding: Detecting and Exploiting Refusal Signals in Intermediate LLM Activations"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.28553","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1f052699220f8608b7f512643254044bdcf76804a750405416a52400ed805c7c","target":"record","created_at":"2026-05-28T02:04:56Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"059902c7a677407be77c299634a3a44300350a0544c0d47d34b7c0da96e19886","cross_cats_sorted":["cs.CR"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-27T14:44:36Z","title_canon_sha256":"7ecf64a86809945fb1c6b3c7e9aa46028b728410b785e1aa820fc75e1c66a0d2"},"schema_version":"1.0","source":{"id":"2605.28553","kind":"arxiv","version":1}},"canonical_sha256":"ed7e4e7f097ef44dc03899c6271f6cb3992e0ae22bc5212b4ae9ec3316e57b02","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"ed7e4e7f097ef44dc03899c6271f6cb3992e0ae22bc5212b4ae9ec3316e57b02","first_computed_at":"2026-05-28T02:04:56.202569Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-28T02:04:56.202569Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"hKu80T02mDMpLy2ZPLv2FK223r+F/j1Zq+Vqtzj3M2zU7fV9K4njV5n/k0ckiJPq8iGp/GamHJRNDGOSmhwmDg==","signature_status":"signed_v1","signed_at":"2026-05-28T02:04:56.203047Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.28553","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1f052699220f8608b7f512643254044bdcf76804a750405416a52400ed805c7c","sha256:09cffe9aefbad2ade18e97d77c0dadf9a8c7147d8da1932a2be0bf821ccf3a60"],"state_sha256":"9a0cc746d945574aedc79411478e6d7fd4516af428d58d37cc52e13a9ec66eba"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ZmL62HfDXdkuAylvOn88A8dURrTRzlDxIuziSRW1A/TS5Z+C7N0MpTgqo8vHnJ1RPj3Fd51v3eJhqoLiKy8hBA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-31T06:20:03.925504Z","bundle_sha256":"bd6af119125ed1e2503fb30a513fd5afb3d1a73129c4613189bc345e44f7fc74"}}