{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:ZFDOCI4CAC5H65KJHKNDEEKBSE","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"385c3a7d71ce11986b56d5a4328cfa10769913f9107431bc9118838bd71f8371","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-01T17:38:12Z","title_canon_sha256":"00467e937f3f7d6641e0ffade53dc1ab80bff93a4701cc15b1c6345d32405f99"},"schema_version":"1.0","source":{"id":"2606.02530","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.02530","created_at":"2026-06-02T03:05:08Z"},{"alias_kind":"arxiv_version","alias_value":"2606.02530v1","created_at":"2026-06-02T03:05:08Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.02530","created_at":"2026-06-02T03:05:08Z"},{"alias_kind":"pith_short_12","alias_value":"ZFDOCI4CAC5H","created_at":"2026-06-02T03:05:08Z"},{"alias_kind":"pith_short_16","alias_value":"ZFDOCI4CAC5H65KJ","created_at":"2026-06-02T03:05:08Z"},{"alias_kind":"pith_short_8","alias_value":"ZFDOCI4C","created_at":"2026-06-02T03:05:08Z"}],"graph_snapshots":[{"event_id":"sha256:49aa990530ff86a2db070066ecae39f3d1e8767b7ff7c4d3da233075f29ba1a9","target":"graph","created_at":"2026-06-02T03:05:08Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.02530/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Aligning Large Language Models (LLMs) with human values often degrades their general capabilities, termed the alignment tax. Existing methods mitigate this by balancing dual objectives, which heavily rely on massive general-purpose data or auxiliary reward models.\n  In this paper, we argue that, because safety features are inherently sparse within the output distribution, alignment requires localized modifications rather than global trade-offs. To this end, we propose SafeSteer, which performs on-policy distillation confined to safety tokens. First, we construct a safety teacher via activation","authors_text":"Hao Li, Hao Wang, Jin-Ge Yao, Jingkun An, Lei Sha, Lijun Li, Pengyu Zhu, Rui Li, Wendi Feng, Yesheng Liu, Zijun Song","cross_cats":["cs.CL"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-01T17:38:12Z","title":"SafeSteer: Localized On-Policy Distillation for Efficient Safety Alignment"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.02530","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:34f3529a5e3153ef748832ba21fbedcce62fafc095a225af7c92e712f99ccb8f","target":"record","created_at":"2026-06-02T03:05:08Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"385c3a7d71ce11986b56d5a4328cfa10769913f9107431bc9118838bd71f8371","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-01T17:38:12Z","title_canon_sha256":"00467e937f3f7d6641e0ffade53dc1ab80bff93a4701cc15b1c6345d32405f99"},"schema_version":"1.0","source":{"id":"2606.02530","kind":"arxiv","version":1}},"canonical_sha256":"c946e1238200ba7f75493a9a321141913c0718985a6e5c6edd2c9c450736b937","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c946e1238200ba7f75493a9a321141913c0718985a6e5c6edd2c9c450736b937","first_computed_at":"2026-06-02T03:05:08.742000Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T03:05:08.742000Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"f1aJjxHusEhHkj4tyGZsLJGyBbS8Xci2bt2ix7ZB1MWnkJO1ATmJDza5Q7VhKHFW61EOeUFBbGVzAUSvH9seCA==","signature_status":"signed_v1","signed_at":"2026-06-02T03:05:08.742404Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.02530","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:34f3529a5e3153ef748832ba21fbedcce62fafc095a225af7c92e712f99ccb8f","sha256:49aa990530ff86a2db070066ecae39f3d1e8767b7ff7c4d3da233075f29ba1a9"],"state_sha256":"4a243290ae17e7436378c65865d5c3c2862bb32dfaf58c479ab94f286a976de4"}