{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:3ZAK63EJXKY5D2UZPLCZM3PUMN","short_pith_number":"pith:3ZAK63EJ","schema_version":"1.0","canonical_sha256":"de40af6c89bab1d1ea997ac5966df46358ad9897dcee5fdb409fc775af3e9699","source":{"kind":"arxiv","id":"2605.13255","version":1},"attestation_state":"computed","paper":{"title":"Respecting Self-Uncertainty in On-Policy Self-Distillation for Efficient LLM Reasoning","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"An entropy confidence gate that down-weights uncertain tokens improves the accuracy-length trade-off in on-policy self-distillation for LLM reasoning.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Conghui He, Junlong Ke, Linfeng Zhang, Weijia Li, Zichen Wen","submitted_at":"2026-05-13T09:38:20Z","abstract_excerpt":"On-policy self-distillation trains a reasoning model on its own rollouts while a teacher, often the same model conditioned on privileged context, provides dense token-level supervision. Existing objectives typically weight the teacher's token-level signal uniformly across a chain-of-thought sequence, despite substantial variation in the entropy of the teacher's predictive distribution. We propose EGRSD (Entropy-Guided Reinforced Self-Distillation), which unifies token-level updates through three signals: a reward-grounded direction, a teacher-student likelihood-ratio magnitude, and the propose"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.13255","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T09:38:20Z","cross_cats_sorted":[],"title_canon_sha256":"3fd4635281c5f14f852c07633698fc5e65cf60e63605afa04e829ceeb7619bef","abstract_canon_sha256":"00baede65b540590aefdf9d8125b22901bb17af42eb49249664b3b492e4e4f1d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:49.396999Z","signature_b64":"3kAVrKlhHjS/Pmic2Pujhsol6UMsaoWIsGMma1TriBp+/7Q/VeD52i3tiD3/jZpV0R7dRdyb2VxHfawX+ci/Ag==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"de40af6c89bab1d1ea997ac5966df46358ad9897dcee5fdb409fc775af3e9699","last_reissued_at":"2026-05-18T02:44:49.396514Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:49.396514Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Respecting Self-Uncertainty in On-Policy Self-Distillation for Efficient LLM Reasoning","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"An entropy confidence gate that down-weights uncertain tokens improves the accuracy-length trade-off in on-policy self-distillation for LLM reasoning.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Conghui He, Junlong Ke, Linfeng Zhang, Weijia Li, Zichen Wen","submitted_at":"2026-05-13T09:38:20Z","abstract_excerpt":"On-policy self-distillation trains a reasoning model on its own rollouts while a teacher, often the same model conditioned on privileged context, provides dense token-level supervision. Existing objectives typically weight the teacher's token-level signal uniformly across a chain-of-thought sequence, despite substantial variation in the entropy of the teacher's predictive distribution. We propose EGRSD (Entropy-Guided Reinforced Self-Distillation), which unifies token-level updates through three signals: a reward-grounded direction, a teacher-student likelihood-ratio magnitude, and the propose"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments with Qwen3-4B and Qwen3-8B in thinking mode show that EGRSD and CL-EGRSD advance the accuracy-length frontier among the compared trainable methods.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That selectively down-weighting high-entropy tokens via the teacher-entropy confidence gate improves net reasoning quality without discarding critical information that only appears in uncertain positions.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"EGRSD and CL-EGRSD advance the accuracy-length frontier in LLM reasoning by entropy-guided weighting of token-level distillation signals from the teacher.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"An entropy confidence gate that down-weights uncertain tokens improves the accuracy-length trade-off in on-policy self-distillation for LLM reasoning.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a9dc58b3043a88cb564b735afd95a5ea212f26c7f48f043b575c2a4bccf23389"},"source":{"id":"2605.13255","kind":"arxiv","version":1},"verdict":{"id":"d9dc5cd5-62d7-400d-8e20-7d6319de3e6c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T19:47:16.821590Z","strongest_claim":"Experiments with Qwen3-4B and Qwen3-8B in thinking mode show that EGRSD and CL-EGRSD advance the accuracy-length frontier among the compared trainable methods.","one_line_summary":"EGRSD and CL-EGRSD advance the accuracy-length frontier in LLM reasoning by entropy-guided weighting of token-level distillation signals from the teacher.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That selectively down-weighting high-entropy tokens via the teacher-entropy confidence gate improves net reasoning quality without discarding critical information that only appears in uncertain positions.","pith_extraction_headline":"An entropy confidence gate that down-weights uncertain tokens improves the accuracy-length trade-off in on-policy self-distillation for LLM reasoning."},"references":{"count":27,"sample":[{"doi":"","year":null,"title":"arXiv preprint arXiv:2505.16400 , year=","work_id":"428ad314-c120-41da-9db7-b8bc1918fffb","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","ref_index":2,"cited_arxiv_id":"2110.14168","is_internal_anchor":true},{"doi":"","year":null,"title":"OpenThoughts: Data Recipes for Reasoning Models","work_id":"c7acbe41-27a0-4773-a7be-8f08d86cdf21","ref_index":3,"cited_arxiv_id":"2506.04178","is_internal_anchor":true},{"doi":"","year":null,"title":"Entropy-aware on-policy distillation of language models","work_id":"7dccbe12-e2aa-48d8-9b76-5521ccf02668","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Why Does Self-Distillation (Sometimes) Degrade the Reasoning Capability of LLMs?","work_id":"8df6a2d1-d890-48ae-af85-c11643a91097","ref_index":5,"cited_arxiv_id":"2603.24472","is_internal_anchor":true}],"resolved_work":27,"snapshot_sha256":"359ab656fd170e83e723876231863bf36e6850496e2ccddb02fb6eec773933be","internal_anchors":10},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.13255","created_at":"2026-05-18T02:44:49.396597+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.13255v1","created_at":"2026-05-18T02:44:49.396597+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13255","created_at":"2026-05-18T02:44:49.396597+00:00"},{"alias_kind":"pith_short_12","alias_value":"3ZAK63EJXKY5","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"3ZAK63EJXKY5D2UZ","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"3ZAK63EJ","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.18141","citing_title":"A Brief Overview: On-Policy Self-Distillation In Large Language Models","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18141","citing_title":"A Brief Overview: On-Policy Self-Distillation In Large Language Models","ref_index":21,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN","json":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN.json","graph_json":"https://pith.science/api/pith-number/3ZAK63EJXKY5D2UZPLCZM3PUMN/graph.json","events_json":"https://pith.science/api/pith-number/3ZAK63EJXKY5D2UZPLCZM3PUMN/events.json","paper":"https://pith.science/paper/3ZAK63EJ"},"agent_actions":{"view_html":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN","download_json":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN.json","view_paper":"https://pith.science/paper/3ZAK63EJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.13255&json=true","fetch_graph":"https://pith.science/api/pith-number/3ZAK63EJXKY5D2UZPLCZM3PUMN/graph.json","fetch_events":"https://pith.science/api/pith-number/3ZAK63EJXKY5D2UZPLCZM3PUMN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN/action/storage_attestation","attest_author":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN/action/author_attestation","sign_citation":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN/action/citation_signature","submit_replication":"https://pith.science/pith/3ZAK63EJXKY5D2UZPLCZM3PUMN/action/replication_record"}},"created_at":"2026-05-18T02:44:49.396597+00:00","updated_at":"2026-05-18T02:44:49.396597+00:00"}