{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:JP5WSH53CFZH4SWKCJ2HNJMQLZ","short_pith_number":"pith:JP5WSH53","canonical_record":{"source":{"id":"2606.00755","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:44:57Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"c1e8329d5c937f105e854c06a801599d9ab6c771d710fa625bd909ee721a331d","abstract_canon_sha256":"6b5a8ba971451034588fa7ac80cbce571fffe325e7d4b4b45c0bd76a0a061bb2"},"schema_version":"1.0"},"canonical_sha256":"4bfb691fbb11727e4aca127476a5905e4065ca609fe57102359e462c2ad10b21","source":{"kind":"arxiv","id":"2606.00755","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.00755","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"arxiv_version","alias_value":"2606.00755v1","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.00755","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"pith_short_12","alias_value":"JP5WSH53CFZH","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"pith_short_16","alias_value":"JP5WSH53CFZH4SWK","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"pith_short_8","alias_value":"JP5WSH53","created_at":"2026-06-02T01:04:04Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:JP5WSH53CFZH4SWKCJ2HNJMQLZ","target":"record","payload":{"canonical_record":{"source":{"id":"2606.00755","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:44:57Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"c1e8329d5c937f105e854c06a801599d9ab6c771d710fa625bd909ee721a331d","abstract_canon_sha256":"6b5a8ba971451034588fa7ac80cbce571fffe325e7d4b4b45c0bd76a0a061bb2"},"schema_version":"1.0"},"canonical_sha256":"4bfb691fbb11727e4aca127476a5905e4065ca609fe57102359e462c2ad10b21","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T01:04:04.862186Z","signature_b64":"Yo7b8u34EKDCqgQivozTlaGKi7u6n+K5alPXhk6zEEv9/5+AJxLcNnbRmbnZimRJ8aA7H/fZ9TX3vfKXLd++Cg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4bfb691fbb11727e4aca127476a5905e4065ca609fe57102359e462c2ad10b21","last_reissued_at":"2026-06-02T01:04:04.861822Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T01:04:04.861822Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2606.00755","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T01:04:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kIuECdKAM4kIzJ0N43uJSQ6v6qwBbseLtAfXYXFSEUGi4QgZIGR3DajboJncuQyl91LPW90dHyYZG5SapjVACg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-06T20:05:00.308191Z"},"content_sha256":"a02955e8181936bb5cd7d7e01963cc329b3a792e31faec965de8c5714631b38c","schema_version":"1.0","event_id":"sha256:a02955e8181936bb5cd7d7e01963cc329b3a792e31faec965de8c5714631b38c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:JP5WSH53CFZH4SWKCJ2HNJMQLZ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Internalize the Temperature: On-Policy Self-Distillation as Policy Reheater for Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Jiachen Yu, Jie Wu, Junjie Wang, Shaoning Sun, Xuewei Yang, Yujiu Yang","submitted_at":"2026-05-30T14:44:57Z","abstract_excerpt":"Reinforcement learning from verifiable rewards improves the reasoning ability of large language models, but often suffers from entropy collapse, in which increasingly concentrated policies reduce rollout diversity and useful learning signals. Existing remedies either constrain the RL objective (e.g., entropy regularization) or adjust sampling temperature during rollout collection, but these interventions remain external to the model parameters. We propose Temperature-Scaled On-Policy Self-Distillation (TS-OPSD), a lightweight policy reheating method that internalizes the exploratory effect of "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.00755","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.00755/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T01:04:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nouSRa5FepZl/pN0FIIty4rOV1FUupv+Pkcc0tFaxKa3dnKik5uvnjgg179VVez4JVjNePBMqgTUFzONGF/IBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-06T20:05:00.308771Z"},"content_sha256":"141b95574722671cb4c19a10c8bfe57b5c4e01038320f7afe127f379280a2791","schema_version":"1.0","event_id":"sha256:141b95574722671cb4c19a10c8bfe57b5c4e01038320f7afe127f379280a2791"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/JP5WSH53CFZH4SWKCJ2HNJMQLZ/bundle.json","state_url":"https://pith.science/pith/JP5WSH53CFZH4SWKCJ2HNJMQLZ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/JP5WSH53CFZH4SWKCJ2HNJMQLZ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-06T20:05:00Z","links":{"resolver":"https://pith.science/pith/JP5WSH53CFZH4SWKCJ2HNJMQLZ","bundle":"https://pith.science/pith/JP5WSH53CFZH4SWKCJ2HNJMQLZ/bundle.json","state":"https://pith.science/pith/JP5WSH53CFZH4SWKCJ2HNJMQLZ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/JP5WSH53CFZH4SWKCJ2HNJMQLZ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:JP5WSH53CFZH4SWKCJ2HNJMQLZ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6b5a8ba971451034588fa7ac80cbce571fffe325e7d4b4b45c0bd76a0a061bb2","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:44:57Z","title_canon_sha256":"c1e8329d5c937f105e854c06a801599d9ab6c771d710fa625bd909ee721a331d"},"schema_version":"1.0","source":{"id":"2606.00755","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.00755","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"arxiv_version","alias_value":"2606.00755v1","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.00755","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"pith_short_12","alias_value":"JP5WSH53CFZH","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"pith_short_16","alias_value":"JP5WSH53CFZH4SWK","created_at":"2026-06-02T01:04:04Z"},{"alias_kind":"pith_short_8","alias_value":"JP5WSH53","created_at":"2026-06-02T01:04:04Z"}],"graph_snapshots":[{"event_id":"sha256:141b95574722671cb4c19a10c8bfe57b5c4e01038320f7afe127f379280a2791","target":"graph","created_at":"2026-06-02T01:04:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.00755/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement learning from verifiable rewards improves the reasoning ability of large language models, but often suffers from entropy collapse, in which increasingly concentrated policies reduce rollout diversity and useful learning signals. Existing remedies either constrain the RL objective (e.g., entropy regularization) or adjust sampling temperature during rollout collection, but these interventions remain external to the model parameters. We propose Temperature-Scaled On-Policy Self-Distillation (TS-OPSD), a lightweight policy reheating method that internalizes the exploratory effect of ","authors_text":"Jiachen Yu, Jie Wu, Junjie Wang, Shaoning Sun, Xuewei Yang, Yujiu Yang","cross_cats":["cs.LG"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:44:57Z","title":"Internalize the Temperature: On-Policy Self-Distillation as Policy Reheater for Reinforcement Learning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.00755","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a02955e8181936bb5cd7d7e01963cc329b3a792e31faec965de8c5714631b38c","target":"record","created_at":"2026-06-02T01:04:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6b5a8ba971451034588fa7ac80cbce571fffe325e7d4b4b45c0bd76a0a061bb2","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:44:57Z","title_canon_sha256":"c1e8329d5c937f105e854c06a801599d9ab6c771d710fa625bd909ee721a331d"},"schema_version":"1.0","source":{"id":"2606.00755","kind":"arxiv","version":1}},"canonical_sha256":"4bfb691fbb11727e4aca127476a5905e4065ca609fe57102359e462c2ad10b21","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4bfb691fbb11727e4aca127476a5905e4065ca609fe57102359e462c2ad10b21","first_computed_at":"2026-06-02T01:04:04.861822Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T01:04:04.861822Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Yo7b8u34EKDCqgQivozTlaGKi7u6n+K5alPXhk6zEEv9/5+AJxLcNnbRmbnZimRJ8aA7H/fZ9TX3vfKXLd++Cg==","signature_status":"signed_v1","signed_at":"2026-06-02T01:04:04.862186Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.00755","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a02955e8181936bb5cd7d7e01963cc329b3a792e31faec965de8c5714631b38c","sha256:141b95574722671cb4c19a10c8bfe57b5c4e01038320f7afe127f379280a2791"],"state_sha256":"2f13f6f5687b8a42ec256fd6cf011f8f9a47c827e5f6be0bed159a4a073c7fdb"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"zelNR9vtkqfwprbQilTLelyOMDrKlVNMtb9Oct6lb3jVjmzhWMZ81jXzGVsWP4WhQO5xEoOB5RyRaqX0CXrUAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-06T20:05:00.311456Z","bundle_sha256":"207619c17b66a79ad8a0de359f9c7449804ba9f2574b6459c219a97cf7305a64"}}