{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:HSKNHNGDRTIAXV2QBJMCM5TZ6F","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"771ca615e3f8482df0a1d10148fdccbb529b2a2224f1bd8bfc733029907ebb18","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T03:40:07Z","title_canon_sha256":"06535b4219a6c259385195a93165002000bc862a61b2554439977fd12f7563f0"},"schema_version":"1.0","source":{"id":"2605.15239","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15239","created_at":"2026-05-20T00:05:47Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15239v1","created_at":"2026-05-20T00:05:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15239","created_at":"2026-05-20T00:05:47Z"},{"alias_kind":"pith_short_12","alias_value":"HSKNHNGDRTIA","created_at":"2026-05-20T00:05:47Z"},{"alias_kind":"pith_short_16","alias_value":"HSKNHNGDRTIAXV2Q","created_at":"2026-05-20T00:05:47Z"},{"alias_kind":"pith_short_8","alias_value":"HSKNHNGD","created_at":"2026-05-20T00:05:47Z"}],"graph_snapshots":[{"event_id":"sha256:c770ccc43114f44fc27dc74875eb396943981d948fc9bc8bd6bd888627991f7e","target":"graph","created_at":"2026-05-20T00:05:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across two reasoning-model families and five model scales, OPSA achieves a stronger safety--reasoning tradeoff than off-policy self-distillation and external-teacher distillation under matched data and full-parameter fine-tuning, with the largest gains on smaller models (+8.85 points on R1-Distill-1.5B and +5.49 points on Qwen3-0.6B)."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The privileged safety context must make the frozen teacher reliably safer than the student trajectory, and the teacher flip rate must identify contexts that activate latent safety reasoning rather than simply producing safe-looking surface demonstrations."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"On-policy self-distillation with teacher flip rate yields better safety-reasoning tradeoffs than off-policy or external-teacher baselines across model scales."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"On-policy self-distillation with privileged safety contexts reduces the safety tax while preserving reasoning in LLMs."}],"snapshot_sha256":"54b3700049602d99631e327e3d445ee6804bf5ede0b1ffcd939933e6493c496b"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"4121001448f6c3abe3ae29939fb1e86f67ee5e2c8494614811151e8ef4e09951"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T17:01:18.454612Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T16:36:29.146560Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-19T16:01:54.914131Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.824260Z","status":"skipped","version":"1.0.0"}],"endpoint":"/pith/2605.15239/integrity.json","findings":[],"snapshot_sha256":"d3cca61b1bd0894d45f55da34d27e13ecbfbd5de65b6f7906e5bd71d0c9ae0d3","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Safety alignment often improves robustness to harmful queries at the cost of reasoning ability, a tradeoff known as the safety tax. A common cause is distributional mismatch: supervised fine-tuning trains the target model on safety demonstrations produced by humans, external models, or fixed self-generated traces, rather than on trajectories sampled from its own policy. We identify off-policy training mismatch as a second source of this tax and study on-policy self-distillation for safety alignment, which we call OPSA. The model generates its own rollouts and receives dense per-token KL superv","authors_text":"Haz Sameen Shahgir, Hui Liu, Longxuan Yu, N. Benjamin Erichson, Yue Dong, Yu Fu, Zhipeng Wei","cross_cats":[],"headline":"On-policy self-distillation with privileged safety contexts reduces the safety tax while preserving reasoning in LLMs.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T03:40:07Z","title":"Reducing the Safety Tax in LLM Safety Alignment with On-Policy Self-Distillation"},"references":{"count":42,"internal_anchors":18,"resolved_work":42,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Advances in neural information processing systems , volume=","work_id":"c25e8154-fab2-455c-8a26-56e40aed5d2b","year":null},{"cited_arxiv_id":"2204.05862","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","year":null},{"cited_arxiv_id":"2407.04295","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey","work_id":"0ee7fc45-ae61-432b-83ac-f1d93ccd88fb","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Findings of the Association for Computational Linguistics: ACL 2025 , pages=","work_id":"923339bc-bc78-421a-9e00-8feeec5339ad","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Proceedings of the AAAI Conference on Artificial Intelligence , volume=","work_id":"e125f746-28c3-4ec3-a3a8-dc26bdb4d96b","year":null}],"snapshot_sha256":"5b79a4cf4c3cfd6e9c0e3c8cd64f837cef4b31a5978cea6ec13dd41820ca784d"},"source":{"id":"2605.15239","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-19T16:29:40.784497Z","id":"b5f8726b-14c7-4c74-af5a-5f9d04bf12fe","model_set":{"reader":"grok-4.3"},"one_line_summary":"On-policy self-distillation with teacher flip rate yields better safety-reasoning tradeoffs than off-policy or external-teacher baselines across model scales.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"On-policy self-distillation with privileged safety contexts reduces the safety tax while preserving reasoning in LLMs.","strongest_claim":"Across two reasoning-model families and five model scales, OPSA achieves a stronger safety--reasoning tradeoff than off-policy self-distillation and external-teacher distillation under matched data and full-parameter fine-tuning, with the largest gains on smaller models (+8.85 points on R1-Distill-1.5B and +5.49 points on Qwen3-0.6B).","weakest_assumption":"The privileged safety context must make the frozen teacher reliably safer than the student trajectory, and the teacher flip rate must identify contexts that activate latent safety reasoning rather than simply producing safe-looking surface demonstrations."}},"verdict_id":"b5f8726b-14c7-4c74-af5a-5f9d04bf12fe"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:10f2faa37445bef347bfdf902cca7fd82fe75afb3534f2fd54bba4a55d047c1f","target":"record","created_at":"2026-05-20T00:05:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"771ca615e3f8482df0a1d10148fdccbb529b2a2224f1bd8bfc733029907ebb18","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T03:40:07Z","title_canon_sha256":"06535b4219a6c259385195a93165002000bc862a61b2554439977fd12f7563f0"},"schema_version":"1.0","source":{"id":"2605.15239","kind":"arxiv","version":1}},"canonical_sha256":"3c94d3b4c38cd00bd7500a58267679f17ea23e2cd804f740998aefc07e98207d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3c94d3b4c38cd00bd7500a58267679f17ea23e2cd804f740998aefc07e98207d","first_computed_at":"2026-05-20T00:05:47.726170Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:05:47.726170Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"6B4TAP5Q65lE9qpHWGnRsijuyaQDC+RI0IigAVv4tn5I65DoiixHcP0IRZI18Fa3VZdo23NDOywFFYkkkZnNCg==","signature_status":"signed_v1","signed_at":"2026-05-20T00:05:47.727021Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.15239","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:10f2faa37445bef347bfdf902cca7fd82fe75afb3534f2fd54bba4a55d047c1f","sha256:c770ccc43114f44fc27dc74875eb396943981d948fc9bc8bd6bd888627991f7e"],"state_sha256":"0b87381aa2128ef6ad894eb7d345750122d6c9aff00df5dd31533be1d575aad9"}