{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:IVEMD2RMDPFM7CZY6Z2MNVVYDL","short_pith_number":"pith:IVEMD2RM","schema_version":"1.0","canonical_sha256":"4548c1ea2c1bcacf8b38f674c6d6b81ae7c1f5c95de198135a1baf56c09bf5a9","source":{"kind":"arxiv","id":"2509.21154","version":4},"attestation_state":"computed","paper":{"title":"GRPO is Secretly a Process Reward Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Alexander Koller, Michael Sullivan","submitted_at":"2025-09-25T13:40:36Z","abstract_excerpt":"Process reward models (PRMs) allow for fine-grained credit assignment in reinforcement learning (RL), and seemingly contrast with outcome reward models (ORMs), which assign a single reward to an entire trajectory. However, we provide theoretical proof in this work that the Group Relative Policy Optimization (GRPO) RL algorithm equipped with an ORM is in fact equivalent to a PRM-aware RL objective equipped with a non-trivial, Monte-Carlo-based PRM (given mild assumptions). Leveraging the framework of GRPO-as-a-PRM, we identify a flaw in the GRPO objective that interacts with imbalanced process "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2509.21154","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-25T13:40:36Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"8ccdd1feaf668166a1a097e72641217affdce7de6a4405524820638bb5abce00","abstract_canon_sha256":"568cb2708829e3bd83c602f2ca9d8ffb72a97e94dbe41b88593c72e057b730cb"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T02:05:36.366015Z","signature_b64":"YnnTrNTEW8IKS4i+5Z6E5w9JTyLo1GU4zUqRtZGcPCxqthrc5Mjvh26aLgLGnr2uY6v1DO3Cx1KgskX9y5oqAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4548c1ea2c1bcacf8b38f674c6d6b81ae7c1f5c95de198135a1baf56c09bf5a9","last_reissued_at":"2026-05-29T02:05:36.365588Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T02:05:36.365588Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"GRPO is Secretly a Process Reward Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Alexander Koller, Michael Sullivan","submitted_at":"2025-09-25T13:40:36Z","abstract_excerpt":"Process reward models (PRMs) allow for fine-grained credit assignment in reinforcement learning (RL), and seemingly contrast with outcome reward models (ORMs), which assign a single reward to an entire trajectory. However, we provide theoretical proof in this work that the Group Relative Policy Optimization (GRPO) RL algorithm equipped with an ORM is in fact equivalent to a PRM-aware RL objective equipped with a non-trivial, Monte-Carlo-based PRM (given mild assumptions). Leveraging the framework of GRPO-as-a-PRM, we identify a flaw in the GRPO objective that interacts with imbalanced process "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.21154","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.21154/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.21154","created_at":"2026-05-29T02:05:36.365645+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.21154v4","created_at":"2026-05-29T02:05:36.365645+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.21154","created_at":"2026-05-29T02:05:36.365645+00:00"},{"alias_kind":"pith_short_12","alias_value":"IVEMD2RMDPFM","created_at":"2026-05-29T02:05:36.365645+00:00"},{"alias_kind":"pith_short_16","alias_value":"IVEMD2RMDPFM7CZY","created_at":"2026-05-29T02:05:36.365645+00:00"},{"alias_kind":"pith_short_8","alias_value":"IVEMD2RM","created_at":"2026-05-29T02:05:36.365645+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL","json":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL.json","graph_json":"https://pith.science/api/pith-number/IVEMD2RMDPFM7CZY6Z2MNVVYDL/graph.json","events_json":"https://pith.science/api/pith-number/IVEMD2RMDPFM7CZY6Z2MNVVYDL/events.json","paper":"https://pith.science/paper/IVEMD2RM"},"agent_actions":{"view_html":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL","download_json":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL.json","view_paper":"https://pith.science/paper/IVEMD2RM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.21154&json=true","fetch_graph":"https://pith.science/api/pith-number/IVEMD2RMDPFM7CZY6Z2MNVVYDL/graph.json","fetch_events":"https://pith.science/api/pith-number/IVEMD2RMDPFM7CZY6Z2MNVVYDL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL/action/storage_attestation","attest_author":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL/action/author_attestation","sign_citation":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL/action/citation_signature","submit_replication":"https://pith.science/pith/IVEMD2RMDPFM7CZY6Z2MNVVYDL/action/replication_record"}},"created_at":"2026-05-29T02:05:36.365645+00:00","updated_at":"2026-05-29T02:05:36.365645+00:00"}