{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:MMV3S43NX2NZS5FKPHO33KGNPX","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"02836581b64f996450743dd106f6e4e88786e5671e9d6a5c5158087a906901cd","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2025-06-12T17:49:55Z","title_canon_sha256":"54ce252e6e4a74379d16b07187cb77f39b51b53b770d20a1668851db184f1cc0"},"schema_version":"1.0","source":{"id":"2506.10947","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2506.10947","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2506.10947v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.10947","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"MMV3S43NX2NZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MMV3S43NX2NZS5FK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MMV3S43N","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c3f4186091bef413a7ef20068d47e65ddb9a3baa98bf1170ec9a78e5dcfe00cd","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"RLVR training with GRPO improves MATH-500 performance for Qwen2.5-Math-7B by 21.4 percentage points using randomly assigned rewards, nearly matching the 29.1-point gain from ground-truth rewards."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that the performance gains with spurious rewards are primarily driven by the clipping bias in GRPO amplifying specific pretraining behaviors, rather than other unaccounted factors in the training process or model-specific quirks."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Spurious rewards in RLVR can produce large gains in mathematical reasoning for certain language models via GRPO's clipping bias amplifying pretraining behaviors like code reasoning."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Reinforcement learning with verifiable rewards improves math performance in some models even when rewards are random or spurious."}],"snapshot_sha256":"a1313d28b42a591a874dbe65e2af1e7ae3d9a574f6d16be6d26ee8f7585da365"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"f473b6a76c0669e9d4301fc756ee7bd8f3aa0d2e5b7e7562c63260df054b38eb"},"paper":{"abstract_excerpt":"We show that reinforcement learning with verifiable rewards (RLVR) can elicit strong mathematical reasoning in certain language models even with spurious rewards that have little, no, or even negative correlation with the correct answer. For example, RLVR training with GRPO improves MATH-500 performance for Qwen2.5-Math-7B by 21.4 percentage points using randomly assigned rewards, nearly matching the 29.1-point gain from ground-truth rewards. To explain this counterintuitive observation, we show that GRPO exhibits a clipping bias from the clip term, which can amplify high-prior behaviors learn","authors_text":"Hannaneh Hajishirzi, Luke Zettlemoyer, Nathan Lambert, Pang Wei Koh, Ranjay Krishna, Rui Xin, Rulin Shao, Scott Geng, Sewon Min, Sewoong Oh, Shuyue Stella Li, Simon Shaolei Du, Yiping Wang, Yulia Tsvetkov","cross_cats":["cs.LG"],"headline":"Reinforcement learning with verifiable rewards improves math performance in some models even when rewards are random or spurious.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2025-06-12T17:49:55Z","title":"Spurious Rewards: Rethinking Training Signals in RLVR"},"references":{"count":21,"internal_anchors":3,"resolved_work":21,"sample":[{"cited_arxiv_id":"","doi":"10.1038/s41586-025-09422-z","is_internal_anchor":false,"ref_index":1,"title":"Nature645(8081), 633–638 (2025) https://doi.org/10.1038/s41586-025-09422-z","work_id":"9835b482-5032-4135-93dd-82a066677569","year":2025},{"cited_arxiv_id":"2501.00656","doi":"","is_internal_anchor":true,"ref_index":2,"title":"2 OLMo 2 Furious","work_id":"9ef0dc2b-fdfe-4f14-b235-ef7556dc709a","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"CoRR , volume =","work_id":"471b6786-7627-4021-a6b3-7f5cd8c83643","year":2022},{"cited_arxiv_id":"","doi":"10.18653/v1/2025.acl-industry","is_internal_anchor":false,"ref_index":4,"title":"ISBN 9 Coverage, Not Averages Semantic Stratification for Trustworthy Retrieval Evaluation 979-8-89176-288-6","work_id":"74d89fb3-6798-4148-af6b-9764a4f36db8","year":2025},{"cited_arxiv_id":"2502.14768","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Logic-RL: Unleashing LLM Reasoning with Rule-Based Reinforcement Learning","work_id":"b1f42628-30b7-4593-80ff-813523035c23","year":2025}],"snapshot_sha256":"35f2c6e607b49f7b17ab9b655c305739f3402da7da0cdcf67275a3771ce4460e"},"source":{"id":"2506.10947","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T13:33:45.992784Z","id":"26070616-b572-4562-9a0b-ed7124ac98f8","model_set":{"reader":"grok-4.3"},"one_line_summary":"Spurious rewards in RLVR can produce large gains in mathematical reasoning for certain language models via GRPO's clipping bias amplifying pretraining behaviors like code reasoning.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Reinforcement learning with verifiable rewards improves math performance in some models even when rewards are random or spurious.","strongest_claim":"RLVR training with GRPO improves MATH-500 performance for Qwen2.5-Math-7B by 21.4 percentage points using randomly assigned rewards, nearly matching the 29.1-point gain from ground-truth rewards.","weakest_assumption":"The assumption that the performance gains with spurious rewards are primarily driven by the clipping bias in GRPO amplifying specific pretraining behaviors, rather than other unaccounted factors in the training process or model-specific quirks."}},"verdict_id":"26070616-b572-4562-9a0b-ed7124ac98f8"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:c4a51f12df841d1ac7ee338e21788e9ab3924b1c377610372730e16ba1eb46dc","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"02836581b64f996450743dd106f6e4e88786e5671e9d6a5c5158087a906901cd","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2025-06-12T17:49:55Z","title_canon_sha256":"54ce252e6e4a74379d16b07187cb77f39b51b53b770d20a1668851db184f1cc0"},"schema_version":"1.0","source":{"id":"2506.10947","kind":"arxiv","version":2}},"canonical_sha256":"632bb9736dbe9b9974aa79ddbda8cd7dc09d9c242dc0470ab304df9b8ecd32c5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"632bb9736dbe9b9974aa79ddbda8cd7dc09d9c242dc0470ab304df9b8ecd32c5","first_computed_at":"2026-05-17T23:38:47.721148Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.721148Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"QGnyQ9Ly3wzkf3lnsqOQKtRfGm7rUsWkmdvgyZDl+XkDjUGSjjC43g3+cbNkb+cEozXAGFcR13jhU/XVSpsBBw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.721631Z","signed_message":"canonical_sha256_bytes"},"source_id":"2506.10947","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:c4a51f12df841d1ac7ee338e21788e9ab3924b1c377610372730e16ba1eb46dc","sha256:c3f4186091bef413a7ef20068d47e65ddb9a3baa98bf1170ec9a78e5dcfe00cd"],"state_sha256":"3a37fce8304daf256afe3b22a9269f784d68787bcd2a7db24f8faa0d541a0938"}