{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:VLTZ5K6IQS5OWRR7W2GUJZ3CN7","short_pith_number":"pith:VLTZ5K6I","canonical_record":{"source":{"id":"2602.12125","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:14:29Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"c83c039db1d146c473e92bb6ffdfbf116c0371a560420bdf59d3b321fe791ae1","abstract_canon_sha256":"658b12c53d7facb45243c962bbda4e0f8181a5ddb534c7064a61c98d6b950c54"},"schema_version":"1.0"},"canonical_sha256":"aae79eabc884baeb463fb68d44e7626ff8035ae2ab35b88ce12b0bdd659e734e","source":{"kind":"arxiv","id":"2602.12125","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.12125","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2602.12125v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.12125","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"VLTZ5K6IQS5O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"VLTZ5K6IQS5OWRR7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"VLTZ5K6I","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:VLTZ5K6IQS5OWRR7W2GUJZ3CN7","target":"record","payload":{"canonical_record":{"source":{"id":"2602.12125","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:14:29Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"c83c039db1d146c473e92bb6ffdfbf116c0371a560420bdf59d3b321fe791ae1","abstract_canon_sha256":"658b12c53d7facb45243c962bbda4e0f8181a5ddb534c7064a61c98d6b950c54"},"schema_version":"1.0"},"canonical_sha256":"aae79eabc884baeb463fb68d44e7626ff8035ae2ab35b88ce12b0bdd659e734e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.007869Z","signature_b64":"bpSyUOQJBZkmultIe9wz/PnVnEfS9xni2lhnGBvqxlwV58xmDWvfYggsK6H9ngBxgpL6gJ9F1/7B7ubUpf90Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"aae79eabc884baeb463fb68d44e7626ff8035ae2ab35b88ce12b0bdd659e734e","last_reissued_at":"2026-05-17T23:38:49.007259Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.007259Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.12125","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"37YUz6kwZCmuCEgsY3Qq+5Nc1H+Jj31lgwp/cvO4dMcj/vFjrasF86xGrgrlMm6pujQIA5Z9JsPs2nYCMd2ZAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T11:13:41.710758Z"},"content_sha256":"cf66b04229ed8d5d16f67b62cc6c008ab18f33fbdb9237ed297d8b6c017f665e","schema_version":"1.0","event_id":"sha256:cf66b04229ed8d5d16f67b62cc6c008ab18f33fbdb9237ed297d8b6c017f665e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:VLTZ5K6IQS5OWRR7W2GUJZ3CN7","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Learning beyond Teacher: Generalized On-Policy Distillation with Reward Extrapolation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Generalized on-policy distillation using reward extrapolation enables students to surpass their teachers when merging domain experts.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Kai Yang, Ruobing Xie, Saiyong Yang, Weijie Liu, Wenkai Yang, Yankai Lin","submitted_at":"2026-02-12T16:14:29Z","abstract_excerpt":"On-policy distillation (OPD), which aligns the student with the teacher's logit distribution on student-generated trajectories, has demonstrated strong empirical gains in improving student performance and often outperforms off-policy distillation and reinforcement learning (RL) paradigms. In this work, we first theoretically show that OPD is a special case of dense KL-constrained RL where the reward function and the KL regularization are always weighted equally and the reference model can by any model. Then, we propose the Generalized On-Policy Distillation (G-OPD) framework, which extends the"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Setting the reward scaling factor to be greater than 1 (i.e., reward extrapolation), which we term ExOPD, consistently improves over standard OPD across a range of teacher-student size pairings. In particular, in the setting where we merge the knowledge from different domain experts... the student to even surpass the teacher's performance boundary and outperform the domain teachers.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The reward correction step in strong-to-weak distillation assumes access to the teacher's pre-RL base model and that this choice yields a more accurate reward signal than other references.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Generalized on-policy distillation with reward scaling above one (ExOPD) lets student models surpass teacher performance when merging domain experts on math and code tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Generalized on-policy distillation using reward extrapolation enables students to surpass their teachers when merging domain experts.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ad48f43f7112a41ea91d7a8bfdb495494e854528feb3b871d20afabecf052292"},"source":{"id":"2602.12125","kind":"arxiv","version":2},"verdict":{"id":"cbb9951b-f18b-4c77-95c7-b890a755b4f4","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T05:06:43.878258Z","strongest_claim":"Setting the reward scaling factor to be greater than 1 (i.e., reward extrapolation), which we term ExOPD, consistently improves over standard OPD across a range of teacher-student size pairings. In particular, in the setting where we merge the knowledge from different domain experts... the student to even surpass the teacher's performance boundary and outperform the domain teachers.","one_line_summary":"Generalized on-policy distillation with reward scaling above one (ExOPD) lets student models surpass teacher performance when merging domain experts on math and code tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The reward correction step in strong-to-weak distillation assumes access to the teacher's pre-RL base model and that this choice yields a more accurate reward signal than other references.","pith_extraction_headline":"Generalized on-policy distillation using reward extrapolation enables students to surpass their teachers when merging domain experts."},"references":{"count":26,"sample":[{"doi":"","year":2024,"title":"AI-MO. Aime 2024.https://huggingface.co/datasets/AI-MO/aimo-validation-aime,","work_id":"b9c5f7ac-b9f1-4a93-863c-5560d285fb45","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"InternLM2 Technical Report","work_id":"dfa13e0e-1c3c-4fb6-943d-a19945bacdbe","ref_index":2,"cited_arxiv_id":"2403.17297","is_internal_anchor":true},{"doi":"","year":null,"title":"Process Reinforcement through Implicit Rewards","work_id":"c31a2126-86f9-44f3-91f3-208d0fc1463a","ref_index":3,"cited_arxiv_id":"2502.01456","is_internal_anchor":true},{"doi":"","year":2023,"title":"Enhancing chat language models by scaling high-quality instructional conversations","work_id":"eeae75b3-2ae1-42b9-a8c0-08a6fcb40b06","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"RLHF Workflow: From Reward Modeling to Online RLHF","work_id":"c7ff0d97-6f12-4146-bf55-c37c42517893","ref_index":5,"cited_arxiv_id":"2405.07863","is_internal_anchor":true}],"resolved_work":26,"snapshot_sha256":"7ea7c17db8a469fc9d9a0ca814b75f77e8f144c55636fd00eb8d7b44b50d2c36","internal_anchors":18},"formal_canon":{"evidence_count":3,"snapshot_sha256":"afdeebbfa0fd23006cf147493dd318efa1ae85418a8cdde2f8066e5b2b1e7a02"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"cbb9951b-f18b-4c77-95c7-b890a755b4f4"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RfqzzuEnizDM7CxvnUMsGjs8y5YImupQFVK8MtZZmZTl+0Etk6p95jAR/sciGHCoBMTqIo2kq7qAG8x8DS3KCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T11:13:41.711834Z"},"content_sha256":"8388d31857f10d26ce020228f0992cb82d8f6a7127f771243a3eab1b57f46377","schema_version":"1.0","event_id":"sha256:8388d31857f10d26ce020228f0992cb82d8f6a7127f771243a3eab1b57f46377"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/VLTZ5K6IQS5OWRR7W2GUJZ3CN7/bundle.json","state_url":"https://pith.science/pith/VLTZ5K6IQS5OWRR7W2GUJZ3CN7/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/VLTZ5K6IQS5OWRR7W2GUJZ3CN7/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-10T11:13:41Z","links":{"resolver":"https://pith.science/pith/VLTZ5K6IQS5OWRR7W2GUJZ3CN7","bundle":"https://pith.science/pith/VLTZ5K6IQS5OWRR7W2GUJZ3CN7/bundle.json","state":"https://pith.science/pith/VLTZ5K6IQS5OWRR7W2GUJZ3CN7/state.json","well_known_bundle":"https://pith.science/.well-known/pith/VLTZ5K6IQS5OWRR7W2GUJZ3CN7/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:VLTZ5K6IQS5OWRR7W2GUJZ3CN7","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"658b12c53d7facb45243c962bbda4e0f8181a5ddb534c7064a61c98d6b950c54","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:14:29Z","title_canon_sha256":"c83c039db1d146c473e92bb6ffdfbf116c0371a560420bdf59d3b321fe791ae1"},"schema_version":"1.0","source":{"id":"2602.12125","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.12125","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2602.12125v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.12125","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"VLTZ5K6IQS5O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"VLTZ5K6IQS5OWRR7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"VLTZ5K6I","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8388d31857f10d26ce020228f0992cb82d8f6a7127f771243a3eab1b57f46377","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Setting the reward scaling factor to be greater than 1 (i.e., reward extrapolation), which we term ExOPD, consistently improves over standard OPD across a range of teacher-student size pairings. In particular, in the setting where we merge the knowledge from different domain experts... the student to even surpass the teacher's performance boundary and outperform the domain teachers."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The reward correction step in strong-to-weak distillation assumes access to the teacher's pre-RL base model and that this choice yields a more accurate reward signal than other references."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Generalized on-policy distillation with reward scaling above one (ExOPD) lets student models surpass teacher performance when merging domain experts on math and code tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Generalized on-policy distillation using reward extrapolation enables students to surpass their teachers when merging domain experts."}],"snapshot_sha256":"ad48f43f7112a41ea91d7a8bfdb495494e854528feb3b871d20afabecf052292"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"afdeebbfa0fd23006cf147493dd318efa1ae85418a8cdde2f8066e5b2b1e7a02"},"paper":{"abstract_excerpt":"On-policy distillation (OPD), which aligns the student with the teacher's logit distribution on student-generated trajectories, has demonstrated strong empirical gains in improving student performance and often outperforms off-policy distillation and reinforcement learning (RL) paradigms. In this work, we first theoretically show that OPD is a special case of dense KL-constrained RL where the reward function and the KL regularization are always weighted equally and the reference model can by any model. Then, we propose the Generalized On-Policy Distillation (G-OPD) framework, which extends the","authors_text":"Kai Yang, Ruobing Xie, Saiyong Yang, Weijie Liu, Wenkai Yang, Yankai Lin","cross_cats":["cs.AI","cs.CL"],"headline":"Generalized on-policy distillation using reward extrapolation enables students to surpass their teachers when merging domain experts.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:14:29Z","title":"Learning beyond Teacher: Generalized On-Policy Distillation with Reward Extrapolation"},"references":{"count":26,"internal_anchors":18,"resolved_work":26,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"AI-MO. Aime 2024.https://huggingface.co/datasets/AI-MO/aimo-validation-aime,","work_id":"b9c5f7ac-b9f1-4a93-863c-5560d285fb45","year":2024},{"cited_arxiv_id":"2403.17297","doi":"","is_internal_anchor":true,"ref_index":2,"title":"InternLM2 Technical Report","work_id":"dfa13e0e-1c3c-4fb6-943d-a19945bacdbe","year":null},{"cited_arxiv_id":"2502.01456","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Process Reinforcement through Implicit Rewards","work_id":"c31a2126-86f9-44f3-91f3-208d0fc1463a","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Enhancing chat language models by scaling high-quality instructional conversations","work_id":"eeae75b3-2ae1-42b9-a8c0-08a6fcb40b06","year":2023},{"cited_arxiv_id":"2405.07863","doi":"","is_internal_anchor":true,"ref_index":5,"title":"RLHF Workflow: From Reward Modeling to Online RLHF","work_id":"c7ff0d97-6f12-4146-bf55-c37c42517893","year":null}],"snapshot_sha256":"7ea7c17db8a469fc9d9a0ca814b75f77e8f144c55636fd00eb8d7b44b50d2c36"},"source":{"id":"2602.12125","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T05:06:43.878258Z","id":"cbb9951b-f18b-4c77-95c7-b890a755b4f4","model_set":{"reader":"grok-4.3"},"one_line_summary":"Generalized on-policy distillation with reward scaling above one (ExOPD) lets student models surpass teacher performance when merging domain experts on math and code tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Generalized on-policy distillation using reward extrapolation enables students to surpass their teachers when merging domain experts.","strongest_claim":"Setting the reward scaling factor to be greater than 1 (i.e., reward extrapolation), which we term ExOPD, consistently improves over standard OPD across a range of teacher-student size pairings. In particular, in the setting where we merge the knowledge from different domain experts... the student to even surpass the teacher's performance boundary and outperform the domain teachers.","weakest_assumption":"The reward correction step in strong-to-weak distillation assumes access to the teacher's pre-RL base model and that this choice yields a more accurate reward signal than other references."}},"verdict_id":"cbb9951b-f18b-4c77-95c7-b890a755b4f4"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:cf66b04229ed8d5d16f67b62cc6c008ab18f33fbdb9237ed297d8b6c017f665e","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"658b12c53d7facb45243c962bbda4e0f8181a5ddb534c7064a61c98d6b950c54","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:14:29Z","title_canon_sha256":"c83c039db1d146c473e92bb6ffdfbf116c0371a560420bdf59d3b321fe791ae1"},"schema_version":"1.0","source":{"id":"2602.12125","kind":"arxiv","version":2}},"canonical_sha256":"aae79eabc884baeb463fb68d44e7626ff8035ae2ab35b88ce12b0bdd659e734e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"aae79eabc884baeb463fb68d44e7626ff8035ae2ab35b88ce12b0bdd659e734e","first_computed_at":"2026-05-17T23:38:49.007259Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.007259Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"bpSyUOQJBZkmultIe9wz/PnVnEfS9xni2lhnGBvqxlwV58xmDWvfYggsK6H9ngBxgpL6gJ9F1/7B7ubUpf90Dw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.007869Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.12125","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:cf66b04229ed8d5d16f67b62cc6c008ab18f33fbdb9237ed297d8b6c017f665e","sha256:8388d31857f10d26ce020228f0992cb82d8f6a7127f771243a3eab1b57f46377"],"state_sha256":"1c3a38333372586fb7bab4ac84a6ab5b6624bde1b9330d614a4636f0dd4699c3"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nzTiwY3QerAvVlD5EbplH5z72d5HlpVBb+/23TuPFFLZflilGO3avi8LC+eQr62sDp6uZEFknOcEjVugJFTUCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-10T11:13:41.716563Z","bundle_sha256":"5c7a7f36b0e4e1463da02dde12063fdef97b5907b2ce53912d467e26044a9427"}}