{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:U4EW5DVJEM5WGBYQ4AURKGTS7S","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"dd19ed0bd33d4fc1b4756639e11976de26fc658fe5ec0acc7421397c757bb42d","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-08-05T04:05:15Z","title_canon_sha256":"a9ea9560578f27f9b87d0c921e54a5bb7f5d21c6673c449982c7603249463afb"},"schema_version":"1.0","source":{"id":"2508.03058","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2508.03058","created_at":"2026-06-23T02:12:32Z"},{"alias_kind":"arxiv_version","alias_value":"2508.03058v2","created_at":"2026-06-23T02:12:32Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2508.03058","created_at":"2026-06-23T02:12:32Z"},{"alias_kind":"pith_short_12","alias_value":"U4EW5DVJEM5W","created_at":"2026-06-23T02:12:32Z"},{"alias_kind":"pith_short_16","alias_value":"U4EW5DVJEM5WGBYQ","created_at":"2026-06-23T02:12:32Z"},{"alias_kind":"pith_short_8","alias_value":"U4EW5DVJ","created_at":"2026-06-23T02:12:32Z"}],"graph_snapshots":[{"event_id":"sha256:05acea70fa35b4e35adc18e442687574e2364d477bcfb32fd4091166e9165a89","target":"graph","created_at":"2026-06-23T02:12:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2508.03058/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement Learning (RL) in real-world environments often suffers from ambiguous or incomplete reward supervision, which undermines policy stability and generalization. Such noise may cause models to ignore key information or even collapse in advantage estimation. We find that a strong value model is essential for absorbing unstable signals and producing reliable advantages, offering denser and more robust supervision than the reward model. To better optimize noisy supervision, we propose VRPO, a framework that enhances value modeling for robust RL in LLM post-training. VRPO integrates (1) ","authors_text":"Caishuang Huang, Chenhao Huang, Dingwei Zhu, Enyu Zhou, Guoqiang Zhang, Jiazheng Zhang, Junjie Ye, Mingxu Chai, Ming Zhang, Qi Zhang, Senjie Jin, Shihan Dou, Tao Gui, Xipeng Qiu, Xuanjing Huang, Yuhui Wang, Yunke Zhang, Yuran Wang, Zhiheng Xi","cross_cats":["cs.AI","cs.CL"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-08-05T04:05:15Z","title":"VRPO: Rethinking Value Modeling for Robust RL under Noisy Supervision in LLM Post-Training"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2508.03058","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:deb60296ca8db9cb43450d535258cbca3726b1efc2b28de49af84563b397b8b7","target":"record","created_at":"2026-06-23T02:12:32Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"dd19ed0bd33d4fc1b4756639e11976de26fc658fe5ec0acc7421397c757bb42d","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-08-05T04:05:15Z","title_canon_sha256":"a9ea9560578f27f9b87d0c921e54a5bb7f5d21c6673c449982c7603249463afb"},"schema_version":"1.0","source":{"id":"2508.03058","kind":"arxiv","version":2}},"canonical_sha256":"a7096e8ea9233b630710e029151a72fc90d9ecb2c266432171f4916cb1be3657","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a7096e8ea9233b630710e029151a72fc90d9ecb2c266432171f4916cb1be3657","first_computed_at":"2026-06-23T02:12:32.632478Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-23T02:12:32.632478Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"1PUHhQkhpsmyi/LYfS+HAG18HJKNL8wF7GIXUS4jCVrzRxkZPDa42/i2iB6tu3wTGvCBcIUjcAiU2GeJ9wq2Aw==","signature_status":"signed_v1","signed_at":"2026-06-23T02:12:32.632956Z","signed_message":"canonical_sha256_bytes"},"source_id":"2508.03058","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:deb60296ca8db9cb43450d535258cbca3726b1efc2b28de49af84563b397b8b7","sha256:05acea70fa35b4e35adc18e442687574e2364d477bcfb32fd4091166e9165a89"],"state_sha256":"70e6011b7355ad976ca0baa7f250e055c1e02a567d714217cbbc28b17c87bbf3"}