{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:WWMAWD23U52EPIRYEKDIIC5LP4","short_pith_number":"pith:WWMAWD23","schema_version":"1.0","canonical_sha256":"b5980b0f5ba77447a2382286840bab7f379c483cb35359fa8793d79721a21e08","source":{"kind":"arxiv","id":"2606.18216","version":1},"attestation_state":"computed","paper":{"title":"Zone of Proximal Policy Optimization: Teacher in Prompts, Not Gradients","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Andrew Tao, Byung-Kwan Lee, Karan Sapra, Minki Kang, Pavlo Molchanov, Ryo Hachiuma, Saurav Muralidharan, Shizhe Diao, Ximing Lu, Yejin Choi, Yu-Chiang Frank Wang","submitted_at":"2026-06-16T17:46:02Z","abstract_excerpt":"Knowledge distillation transfers a teacher's competence to a small student but is brittle in the small-student regime: forcing the student to imitate logits from a much larger teacher concentrates it on the teacher's sharpest modes, hurting generalization on benchmark families beyond the training corpus. Reinforcement learning (RL) avoids logit imitation by training on the student's own rollouts. However, on questions where every rollout fails-yielding zero advantage and being silently discarded-injecting a stronger teacher's response into the policy gradient breaks the on-policy assumption an"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.18216","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-06-16T17:46:02Z","cross_cats_sorted":[],"title_canon_sha256":"ca6f94ac4a6da53b2e1443f3ac74595aff4413b866611c57768179ea9bad87f4","abstract_canon_sha256":"ec471f3dacc19413acec1b8eff467b63b739050d38e3556583d6e4a2e51f0ab5"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:10:51.346498Z","signature_b64":"up0mbcS4FMJPsX86x7hT3XjALSojJVQYsHvnBhv7rlWE9UdcebWUi3Kfmkj2aryCF/KxtoZfJ/4C2CFD8h5wAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b5980b0f5ba77447a2382286840bab7f379c483cb35359fa8793d79721a21e08","last_reissued_at":"2026-06-19T16:10:51.346080Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:10:51.346080Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Zone of Proximal Policy Optimization: Teacher in Prompts, Not Gradients","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Andrew Tao, Byung-Kwan Lee, Karan Sapra, Minki Kang, Pavlo Molchanov, Ryo Hachiuma, Saurav Muralidharan, Shizhe Diao, Ximing Lu, Yejin Choi, Yu-Chiang Frank Wang","submitted_at":"2026-06-16T17:46:02Z","abstract_excerpt":"Knowledge distillation transfers a teacher's competence to a small student but is brittle in the small-student regime: forcing the student to imitate logits from a much larger teacher concentrates it on the teacher's sharpest modes, hurting generalization on benchmark families beyond the training corpus. Reinforcement learning (RL) avoids logit imitation by training on the student's own rollouts. However, on questions where every rollout fails-yielding zero advantage and being silently discarded-injecting a stronger teacher's response into the policy gradient breaks the on-policy assumption an"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.18216","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.18216/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.18216","created_at":"2026-06-19T16:10:51.346141+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.18216v1","created_at":"2026-06-19T16:10:51.346141+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.18216","created_at":"2026-06-19T16:10:51.346141+00:00"},{"alias_kind":"pith_short_12","alias_value":"WWMAWD23U52E","created_at":"2026-06-19T16:10:51.346141+00:00"},{"alias_kind":"pith_short_16","alias_value":"WWMAWD23U52EPIRY","created_at":"2026-06-19T16:10:51.346141+00:00"},{"alias_kind":"pith_short_8","alias_value":"WWMAWD23","created_at":"2026-06-19T16:10:51.346141+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4","json":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4.json","graph_json":"https://pith.science/api/pith-number/WWMAWD23U52EPIRYEKDIIC5LP4/graph.json","events_json":"https://pith.science/api/pith-number/WWMAWD23U52EPIRYEKDIIC5LP4/events.json","paper":"https://pith.science/paper/WWMAWD23"},"agent_actions":{"view_html":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4","download_json":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4.json","view_paper":"https://pith.science/paper/WWMAWD23","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.18216&json=true","fetch_graph":"https://pith.science/api/pith-number/WWMAWD23U52EPIRYEKDIIC5LP4/graph.json","fetch_events":"https://pith.science/api/pith-number/WWMAWD23U52EPIRYEKDIIC5LP4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4/action/storage_attestation","attest_author":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4/action/author_attestation","sign_citation":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4/action/citation_signature","submit_replication":"https://pith.science/pith/WWMAWD23U52EPIRYEKDIIC5LP4/action/replication_record"}},"created_at":"2026-06-19T16:10:51.346141+00:00","updated_at":"2026-06-19T16:10:51.346141+00:00"}