{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:QGWRCRHCKKPRP6GCVGWNZZVYPO","short_pith_number":"pith:QGWRCRHC","schema_version":"1.0","canonical_sha256":"81ad1144e2529f17f8c2a9acdce6b87b80e06d6aa80317414561fd711392174c","source":{"kind":"arxiv","id":"2509.03403","version":2},"attestation_state":"computed","paper":{"title":"Beyond Correctness: Harmonizing Process and Outcome Rewards through RL Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Anurag Beniwal, Chenlu Ye, Hao Chen, Jing Huang, Narayanan Sadagopan, Tong Zhang, Zhou Yu, Ziji Zhang","submitted_at":"2025-09-03T15:28:51Z","abstract_excerpt":"Reinforcement Learning with Verifiable Rewards (RLVR) improves final-answer accuracy on reasoning tasks, but it does not reliably improve reasoning quality. Because outcome rewards only assess final answers, they also reward spurious successes: flawed reasoning can still receive maximal reward when it accidentally reaches the correct outcome. This outcome reward hacking creates biased gradients, making current RLVR insufficient for learning faithful reasoning. Process Reward Models (PRMs) provide step-wise supervision, but directly optimizing PRMs or naively combining them with outcome rewards"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2509.03403","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-03T15:28:51Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"79857f4a8e61316b6ce91350f90e3b5c94f1f56f94fa260e9593d0cca26d72b5","abstract_canon_sha256":"751377cef871023c7c86fa96e27b4294b5835e5fb19bc99fecd0e80c814ad9f6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:06.243218Z","signature_b64":"IX6XDlA811dhw+0FCU8cDbcizWgedNUT0tPa38K3ylbxZ0BfQywECevlcJ1PpH6C/b5rQHBRiRiWXfiItGzdDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"81ad1144e2529f17f8c2a9acdce6b87b80e06d6aa80317414561fd711392174c","last_reissued_at":"2026-05-20T00:02:06.242384Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:06.242384Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Beyond Correctness: Harmonizing Process and Outcome Rewards through RL Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Anurag Beniwal, Chenlu Ye, Hao Chen, Jing Huang, Narayanan Sadagopan, Tong Zhang, Zhou Yu, Ziji Zhang","submitted_at":"2025-09-03T15:28:51Z","abstract_excerpt":"Reinforcement Learning with Verifiable Rewards (RLVR) improves final-answer accuracy on reasoning tasks, but it does not reliably improve reasoning quality. Because outcome rewards only assess final answers, they also reward spurious successes: flawed reasoning can still receive maximal reward when it accidentally reaches the correct outcome. This outcome reward hacking creates biased gradients, making current RLVR insufficient for learning faithful reasoning. Process Reward Models (PRMs) provide step-wise supervision, but directly optimizing PRMs or naively combining them with outcome rewards"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.03403","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.03403/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.03403","created_at":"2026-05-20T00:02:06.242519+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.03403v2","created_at":"2026-05-20T00:02:06.242519+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.03403","created_at":"2026-05-20T00:02:06.242519+00:00"},{"alias_kind":"pith_short_12","alias_value":"QGWRCRHCKKPR","created_at":"2026-05-20T00:02:06.242519+00:00"},{"alias_kind":"pith_short_16","alias_value":"QGWRCRHCKKPRP6GC","created_at":"2026-05-20T00:02:06.242519+00:00"},{"alias_kind":"pith_short_8","alias_value":"QGWRCRHC","created_at":"2026-05-20T00:02:06.242519+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2510.07794","citing_title":"HiPRAG: Hierarchical Process Rewards for Efficient Agentic Retrieval Augmented Generation","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02341","citing_title":"LLM Reasoning with Process Rewards for Outcome-Guided Steps","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03327","citing_title":"DGPO: Distribution Guided Policy Optimization for Fine Grained Credit Assignment","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2604.12160","citing_title":"PubSwap: Public-Data Off-Policy Coordination for Federated RLVR","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03327","citing_title":"DGPO: Distribution Guided Policy Optimization for Fine Grained Credit Assignment","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13602","citing_title":"Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges","ref_index":140,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO","json":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO.json","graph_json":"https://pith.science/api/pith-number/QGWRCRHCKKPRP6GCVGWNZZVYPO/graph.json","events_json":"https://pith.science/api/pith-number/QGWRCRHCKKPRP6GCVGWNZZVYPO/events.json","paper":"https://pith.science/paper/QGWRCRHC"},"agent_actions":{"view_html":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO","download_json":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO.json","view_paper":"https://pith.science/paper/QGWRCRHC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.03403&json=true","fetch_graph":"https://pith.science/api/pith-number/QGWRCRHCKKPRP6GCVGWNZZVYPO/graph.json","fetch_events":"https://pith.science/api/pith-number/QGWRCRHCKKPRP6GCVGWNZZVYPO/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO/action/timestamp_anchor","attest_storage":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO/action/storage_attestation","attest_author":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO/action/author_attestation","sign_citation":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO/action/citation_signature","submit_replication":"https://pith.science/pith/QGWRCRHCKKPRP6GCVGWNZZVYPO/action/replication_record"}},"created_at":"2026-05-20T00:02:06.242519+00:00","updated_at":"2026-05-20T00:02:06.242519+00:00"}