{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:7OQTG7DPXQSRBMZERL4K4GVJX7","short_pith_number":"pith:7OQTG7DP","schema_version":"1.0","canonical_sha256":"fba1337c6fbc2510b3248af8ae1aa9bfe33a57b659fa95fdc3cde87278179284","source":{"kind":"arxiv","id":"2606.22686","version":1},"attestation_state":"computed","paper":{"title":"The Geometry of Refusal: Linear Instability in Safety-Aligned LLMs","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CR","authors_text":"Kartikeya Vats, Shivam Ratnakar","submitted_at":"2026-06-21T22:04:48Z","abstract_excerpt":"Modern Large Language Models (LLMs) rely on extensive safety alignment, yet the mechanistic basis of refusal remains opaque. In this work, we investigate whether safety compliance is a deep semantic decision or a manipulable linear feature. We introduce Contrastive Logit Steering (CLS), a zero-optimization framework that isolates the \"refusal direction\" by contrasting hidden states derived from safe and unrestricted system prompts. Unlike representation engineering methods that intervene on internal activations, CLS operates directly on the output distribution, serving as a diagnostic probe fo"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.22686","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-06-21T22:04:48Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"8eca48b23fa370f13a4aa30f7a39324f113e8163cc2e7ebddee3d527adf6569f","abstract_canon_sha256":"eb70e8c249054e0e25eb270eaf6ae6afeb3a96ee94eec5255168f09cf92bd275"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T02:13:44.839841Z","signature_b64":"8AKG23//jN5hh0a2IAz31hNB8Y1b7npIu6SKnJ/oK/7MzlRysY4//H/XlF5CInFv8+VilmQWa3k7Qxk9uq7nDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fba1337c6fbc2510b3248af8ae1aa9bfe33a57b659fa95fdc3cde87278179284","last_reissued_at":"2026-06-23T02:13:44.839436Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T02:13:44.839436Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"The Geometry of Refusal: Linear Instability in Safety-Aligned LLMs","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CR","authors_text":"Kartikeya Vats, Shivam Ratnakar","submitted_at":"2026-06-21T22:04:48Z","abstract_excerpt":"Modern Large Language Models (LLMs) rely on extensive safety alignment, yet the mechanistic basis of refusal remains opaque. In this work, we investigate whether safety compliance is a deep semantic decision or a manipulable linear feature. We introduce Contrastive Logit Steering (CLS), a zero-optimization framework that isolates the \"refusal direction\" by contrasting hidden states derived from safe and unrestricted system prompts. Unlike representation engineering methods that intervene on internal activations, CLS operates directly on the output distribution, serving as a diagnostic probe fo"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.22686","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.22686/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.22686","created_at":"2026-06-23T02:13:44.839494+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.22686v1","created_at":"2026-06-23T02:13:44.839494+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.22686","created_at":"2026-06-23T02:13:44.839494+00:00"},{"alias_kind":"pith_short_12","alias_value":"7OQTG7DPXQSR","created_at":"2026-06-23T02:13:44.839494+00:00"},{"alias_kind":"pith_short_16","alias_value":"7OQTG7DPXQSRBMZE","created_at":"2026-06-23T02:13:44.839494+00:00"},{"alias_kind":"pith_short_8","alias_value":"7OQTG7DP","created_at":"2026-06-23T02:13:44.839494+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7","json":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7.json","graph_json":"https://pith.science/api/pith-number/7OQTG7DPXQSRBMZERL4K4GVJX7/graph.json","events_json":"https://pith.science/api/pith-number/7OQTG7DPXQSRBMZERL4K4GVJX7/events.json","paper":"https://pith.science/paper/7OQTG7DP"},"agent_actions":{"view_html":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7","download_json":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7.json","view_paper":"https://pith.science/paper/7OQTG7DP","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.22686&json=true","fetch_graph":"https://pith.science/api/pith-number/7OQTG7DPXQSRBMZERL4K4GVJX7/graph.json","fetch_events":"https://pith.science/api/pith-number/7OQTG7DPXQSRBMZERL4K4GVJX7/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7/action/storage_attestation","attest_author":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7/action/author_attestation","sign_citation":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7/action/citation_signature","submit_replication":"https://pith.science/pith/7OQTG7DPXQSRBMZERL4K4GVJX7/action/replication_record"}},"created_at":"2026-06-23T02:13:44.839494+00:00","updated_at":"2026-06-23T02:13:44.839494+00:00"}