{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:43MK434FGV2PB6DWQ25Y4V6ULU","short_pith_number":"pith:43MK434F","canonical_record":{"source":{"id":"2605.12652","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T18:57:44Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"e19006a81b64cfbc13bb3059452dcfc0320e92d0a44498802fc5f562d060bf42","abstract_canon_sha256":"593fa5067f258c404222fd96a88c5f1b645eb03f85ac6ce256a6dbdd8e7b3fcc"},"schema_version":"1.0"},"canonical_sha256":"e6d8ae6f853574f0f87686bb8e57d45d14eaa6d5928503e5034afa5da6273372","source":{"kind":"arxiv","id":"2605.12652","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12652","created_at":"2026-05-18T03:09:50Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12652v1","created_at":"2026-05-18T03:09:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12652","created_at":"2026-05-18T03:09:50Z"},{"alias_kind":"pith_short_12","alias_value":"43MK434FGV2P","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"43MK434FGV2PB6DW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"43MK434F","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:43MK434FGV2PB6DWQ25Y4V6ULU","target":"record","payload":{"canonical_record":{"source":{"id":"2605.12652","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T18:57:44Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"e19006a81b64cfbc13bb3059452dcfc0320e92d0a44498802fc5f562d060bf42","abstract_canon_sha256":"593fa5067f258c404222fd96a88c5f1b645eb03f85ac6ce256a6dbdd8e7b3fcc"},"schema_version":"1.0"},"canonical_sha256":"e6d8ae6f853574f0f87686bb8e57d45d14eaa6d5928503e5034afa5da6273372","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:50.760718Z","signature_b64":"xGUYLES4PBvu29hKK5j7u+XI6eeuZ4sk54uJw3CNR8r+2JiWJPeW7M9wmVivus0VUNqzxO5wMRM9Boba3SZCCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e6d8ae6f853574f0f87686bb8e57d45d14eaa6d5928503e5034afa5da6273372","last_reissued_at":"2026-05-18T03:09:50.759655Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:50.759655Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.12652","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Xgza4YKEUQ7dUMMCOTELo/N+7NEu0mIbIdNx1is/cliI7O0/cwOpQq7yio0YvOdh6p0aJWUBpQB4T+2xDvxwCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T15:27:49.873173Z"},"content_sha256":"dc1561d4de2f4fcbf917b7eb6580b900e895c028d08777159e473b552fd1084b","schema_version":"1.0","event_id":"sha256:dc1561d4de2f4fcbf917b7eb6580b900e895c028d08777159e473b552fd1084b"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:43MK434FGV2PB6DWQ25Y4V6ULU","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Multi-Rollout On-Policy Distillation via Peer Successes and Failures","license":"http://creativecommons.org/licenses/by/4.0/","headline":"By conditioning teacher signals on both successful and failed peer rollouts from the same prompt, multi-rollout on-policy distillation supplies denser and better-aligned supervision than single-rollout baselines.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Chen Henry Wu, Gaurav Mittal, Haixin Wang, Matt Fredrikson, Ruowang Zhang, Weichen Yu, Xiaomin Li, Xiaoze Liu, Yinyi Luo, Yizhou Zhao, Yu Hu","submitted_at":"2026-05-12T18:57:44Z","abstract_excerpt":"Large language models are often post-trained with sparse verifier rewards, which indicate whether a sampled trajectory succeeds but provide limited guidance about where reasoning succeeds or fails. On-policy distillation (OPD) offers denser token-level supervision by training on student-generated trajectories, yet existing methods typically distill each rollout independently and ignore the other attempts sampled for the same prompt. We introduce Multi-Rollout On-Policy Distillation (MOPD), a peer-conditioned distillation framework that uses the student's local rollout group to construct more i"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments on competitive programming, mathematical reasoning, scientific question answering, and tool-use benchmarks show that MOPD consistently improves over standard on-policy baselines. Further teacher-signal analysis shows that mixed success-failure contexts better align teacher scores with verifier rewards.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the student's local rollout group can be used to construct teacher signals that are both more informative and better aligned with external verifier rewards without introducing new biases from the peer selection process.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MOPD improves on-policy distillation for LLMs by using peer successes for positive patterns and failures for negative examples to create more informative teacher signals.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"By conditioning teacher signals on both successful and failed peer rollouts from the same prompt, multi-rollout on-policy distillation supplies denser and better-aligned supervision than single-rollout baselines.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"835acb4c23c54f65c071ff687f5c042b1cb909c0963cfc043d84b681c660cbe4"},"source":{"id":"2605.12652","kind":"arxiv","version":1},"verdict":{"id":"82901114-83a2-4a1e-ac96-196e2e5835aa","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T21:25:53.077522Z","strongest_claim":"Experiments on competitive programming, mathematical reasoning, scientific question answering, and tool-use benchmarks show that MOPD consistently improves over standard on-policy baselines. Further teacher-signal analysis shows that mixed success-failure contexts better align teacher scores with verifier rewards.","one_line_summary":"MOPD improves on-policy distillation for LLMs by using peer successes for positive patterns and failures for negative examples to create more informative teacher signals.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the student's local rollout group can be used to construct teacher signals that are both more informative and better aligned with external verifier rewards without introducing new biases from the peer selection process.","pith_extraction_headline":"By conditioning teacher signals on both successful and failed peer rollouts from the same prompt, multi-rollout on-policy distillation supplies denser and better-aligned supervision than single-rollout baselines."},"references":{"count":79,"sample":[{"doi":"","year":null,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","ref_index":1,"cited_arxiv_id":"2107.03374","is_internal_anchor":true},{"doi":"","year":null,"title":"Distilling the Knowledge in a Neural Network","work_id":"d927ab1f-17b8-4002-9d09-c3d55764fbad","ref_index":2,"cited_arxiv_id":"1503.02531","is_internal_anchor":true},{"doi":"","year":2016,"title":"Rush , title =","work_id":"713931d3-c34c-4aee-980b-05473efb3a27","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"A Survey on Knowledge Distillation of Large Language Models","work_id":"b1dcdadf-875e-4695-8fcf-0907c69302f3","ref_index":4,"cited_arxiv_id":"2402.13116","is_internal_anchor":true},{"doi":"","year":null,"title":"arXiv preprint arXiv:2305.15717 , year=","work_id":"f843d86a-7fd6-4b0d-bf8b-f1ad3fbc3f04","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":79,"snapshot_sha256":"7dbb31d7c912d229c9e781048f221011aa479c2021626c6b9b313266c048a976","internal_anchors":23},"formal_canon":{"evidence_count":1,"snapshot_sha256":"1587235cc0af9ff5c49dfac8cf111a9736b7616a5a9a92ffc4dcffb84f635e3d"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"82901114-83a2-4a1e-ac96-196e2e5835aa"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HmbM/DTZJn7XlzFWl/jtlqQ/vBLkeWIj3c/6QbiVm5lSxOgxuzAnroiXLrx+mg8CkGlVyrFJXh/hI+RpKu8wBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T15:27:49.873792Z"},"content_sha256":"141198e40ddd0655c8b424f96bf163e159dac2437dfb672000cf64b891418c3f","schema_version":"1.0","event_id":"sha256:141198e40ddd0655c8b424f96bf163e159dac2437dfb672000cf64b891418c3f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/43MK434FGV2PB6DWQ25Y4V6ULU/bundle.json","state_url":"https://pith.science/pith/43MK434FGV2PB6DWQ25Y4V6ULU/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/43MK434FGV2PB6DWQ25Y4V6ULU/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T15:27:49Z","links":{"resolver":"https://pith.science/pith/43MK434FGV2PB6DWQ25Y4V6ULU","bundle":"https://pith.science/pith/43MK434FGV2PB6DWQ25Y4V6ULU/bundle.json","state":"https://pith.science/pith/43MK434FGV2PB6DWQ25Y4V6ULU/state.json","well_known_bundle":"https://pith.science/.well-known/pith/43MK434FGV2PB6DWQ25Y4V6ULU/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:43MK434FGV2PB6DWQ25Y4V6ULU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"593fa5067f258c404222fd96a88c5f1b645eb03f85ac6ce256a6dbdd8e7b3fcc","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T18:57:44Z","title_canon_sha256":"e19006a81b64cfbc13bb3059452dcfc0320e92d0a44498802fc5f562d060bf42"},"schema_version":"1.0","source":{"id":"2605.12652","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12652","created_at":"2026-05-18T03:09:50Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12652v1","created_at":"2026-05-18T03:09:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12652","created_at":"2026-05-18T03:09:50Z"},{"alias_kind":"pith_short_12","alias_value":"43MK434FGV2P","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"43MK434FGV2PB6DW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"43MK434F","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:141198e40ddd0655c8b424f96bf163e159dac2437dfb672000cf64b891418c3f","target":"graph","created_at":"2026-05-18T03:09:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments on competitive programming, mathematical reasoning, scientific question answering, and tool-use benchmarks show that MOPD consistently improves over standard on-policy baselines. Further teacher-signal analysis shows that mixed success-failure contexts better align teacher scores with verifier rewards."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the student's local rollout group can be used to construct teacher signals that are both more informative and better aligned with external verifier rewards without introducing new biases from the peer selection process."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MOPD improves on-policy distillation for LLMs by using peer successes for positive patterns and failures for negative examples to create more informative teacher signals."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"By conditioning teacher signals on both successful and failed peer rollouts from the same prompt, multi-rollout on-policy distillation supplies denser and better-aligned supervision than single-rollout baselines."}],"snapshot_sha256":"835acb4c23c54f65c071ff687f5c042b1cb909c0963cfc043d84b681c660cbe4"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"1587235cc0af9ff5c49dfac8cf111a9736b7616a5a9a92ffc4dcffb84f635e3d"},"paper":{"abstract_excerpt":"Large language models are often post-trained with sparse verifier rewards, which indicate whether a sampled trajectory succeeds but provide limited guidance about where reasoning succeeds or fails. On-policy distillation (OPD) offers denser token-level supervision by training on student-generated trajectories, yet existing methods typically distill each rollout independently and ignore the other attempts sampled for the same prompt. We introduce Multi-Rollout On-Policy Distillation (MOPD), a peer-conditioned distillation framework that uses the student's local rollout group to construct more i","authors_text":"Chen Henry Wu, Gaurav Mittal, Haixin Wang, Matt Fredrikson, Ruowang Zhang, Weichen Yu, Xiaomin Li, Xiaoze Liu, Yinyi Luo, Yizhou Zhao, Yu Hu","cross_cats":["cs.AI"],"headline":"By conditioning teacher signals on both successful and failed peer rollouts from the same prompt, multi-rollout on-policy distillation supplies denser and better-aligned supervision than single-rollout baselines.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T18:57:44Z","title":"Multi-Rollout On-Policy Distillation via Peer Successes and Failures"},"references":{"count":79,"internal_anchors":23,"resolved_work":79,"sample":[{"cited_arxiv_id":"2107.03374","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","year":null},{"cited_arxiv_id":"1503.02531","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Distilling the Knowledge in a Neural Network","work_id":"d927ab1f-17b8-4002-9d09-c3d55764fbad","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Rush , title =","work_id":"713931d3-c34c-4aee-980b-05473efb3a27","year":2016},{"cited_arxiv_id":"2402.13116","doi":"","is_internal_anchor":true,"ref_index":4,"title":"A Survey on Knowledge Distillation of Large Language Models","work_id":"b1dcdadf-875e-4695-8fcf-0907c69302f3","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"arXiv preprint arXiv:2305.15717 , year=","work_id":"f843d86a-7fd6-4b0d-bf8b-f1ad3fbc3f04","year":null}],"snapshot_sha256":"7dbb31d7c912d229c9e781048f221011aa479c2021626c6b9b313266c048a976"},"source":{"id":"2605.12652","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T21:25:53.077522Z","id":"82901114-83a2-4a1e-ac96-196e2e5835aa","model_set":{"reader":"grok-4.3"},"one_line_summary":"MOPD improves on-policy distillation for LLMs by using peer successes for positive patterns and failures for negative examples to create more informative teacher signals.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"By conditioning teacher signals on both successful and failed peer rollouts from the same prompt, multi-rollout on-policy distillation supplies denser and better-aligned supervision than single-rollout baselines.","strongest_claim":"Experiments on competitive programming, mathematical reasoning, scientific question answering, and tool-use benchmarks show that MOPD consistently improves over standard on-policy baselines. Further teacher-signal analysis shows that mixed success-failure contexts better align teacher scores with verifier rewards.","weakest_assumption":"That the student's local rollout group can be used to construct teacher signals that are both more informative and better aligned with external verifier rewards without introducing new biases from the peer selection process."}},"verdict_id":"82901114-83a2-4a1e-ac96-196e2e5835aa"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:dc1561d4de2f4fcbf917b7eb6580b900e895c028d08777159e473b552fd1084b","target":"record","created_at":"2026-05-18T03:09:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"593fa5067f258c404222fd96a88c5f1b645eb03f85ac6ce256a6dbdd8e7b3fcc","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T18:57:44Z","title_canon_sha256":"e19006a81b64cfbc13bb3059452dcfc0320e92d0a44498802fc5f562d060bf42"},"schema_version":"1.0","source":{"id":"2605.12652","kind":"arxiv","version":1}},"canonical_sha256":"e6d8ae6f853574f0f87686bb8e57d45d14eaa6d5928503e5034afa5da6273372","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e6d8ae6f853574f0f87686bb8e57d45d14eaa6d5928503e5034afa5da6273372","first_computed_at":"2026-05-18T03:09:50.759655Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:50.759655Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"xGUYLES4PBvu29hKK5j7u+XI6eeuZ4sk54uJw3CNR8r+2JiWJPeW7M9wmVivus0VUNqzxO5wMRM9Boba3SZCCQ==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:50.760718Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.12652","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:dc1561d4de2f4fcbf917b7eb6580b900e895c028d08777159e473b552fd1084b","sha256:141198e40ddd0655c8b424f96bf163e159dac2437dfb672000cf64b891418c3f"],"state_sha256":"c9d3285e7ec3c4bb4d4639c4dac49859ac22c32e3755e0fabfe32aa3c77662cf"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Xnvwqy3Lf6v5J2NFUjBCIMqANAwFkuGxoKoC1NurgJUvFIKCpW1QBtY4FMax+fiCszuZYBp0eo+N8nowtuXeCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T15:27:49.878541Z","bundle_sha256":"832aa9c7dace38ef3044f9087e3ba36bc1ac0da98294cc8f7f237f14a42a2369"}}