{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:3L3U62UNUP5JUF672ALAUOS35I","short_pith_number":"pith:3L3U62UN","canonical_record":{"source":{"id":"2510.18924","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-21T10:14:49Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"bfc982ca64f0550efe86d2c8de8b13568cb804ae34495627529931e35cfdc532","abstract_canon_sha256":"f6384c670da0b486155d7c78f32a254eb6a1ed617f402ee2082f86134399b3e1"},"schema_version":"1.0"},"canonical_sha256":"daf74f6a8da3fa9a17dfd0160a3a5bea20d7788bdc74ff7eab53495b1e1f326b","source":{"kind":"arxiv","id":"2510.18924","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.18924","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"arxiv_version","alias_value":"2510.18924v3","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.18924","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"pith_short_12","alias_value":"3L3U62UNUP5J","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"pith_short_16","alias_value":"3L3U62UNUP5JUF67","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"pith_short_8","alias_value":"3L3U62UN","created_at":"2026-05-20T01:06:06Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:3L3U62UNUP5JUF672ALAUOS35I","target":"record","payload":{"canonical_record":{"source":{"id":"2510.18924","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-21T10:14:49Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"bfc982ca64f0550efe86d2c8de8b13568cb804ae34495627529931e35cfdc532","abstract_canon_sha256":"f6384c670da0b486155d7c78f32a254eb6a1ed617f402ee2082f86134399b3e1"},"schema_version":"1.0"},"canonical_sha256":"daf74f6a8da3fa9a17dfd0160a3a5bea20d7788bdc74ff7eab53495b1e1f326b","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:06:06.340432Z","signature_b64":"e9c3WnWn3BwXbCBCIGpuwtmfnYur6YVCs9k8BnKDJSOz2CrH8QUs8NtwxPzgu4kE8QQfDe9UHu0U3jxZTBn2Cg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"daf74f6a8da3fa9a17dfd0160a3a5bea20d7788bdc74ff7eab53495b1e1f326b","last_reissued_at":"2026-05-20T01:06:06.339334Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:06:06.339334Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2510.18924","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:06:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"4hDXxlBPCmw5/Lp4szNjda9tSIoAWMvSYFor225D3+wTw5RrTzB+sDa0jwCo4KAkvNQ8jrWRl5R47ZNPbNvfCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T15:00:35.888258Z"},"content_sha256":"1f49961d984780e36b38c18590e7db9aa9e5d75fa83c402f5f6e469f913a84b6","schema_version":"1.0","event_id":"sha256:1f49961d984780e36b38c18590e7db9aa9e5d75fa83c402f5f6e469f913a84b6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:3L3U62UNUP5JUF672ALAUOS35I","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Noise-corrected GRPO: From Noisy Rewards to Unbiased Gradients","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Fathinah Asma Izzati, Mohamed El Amine Seddik, Omar El Mansouri, Salem Lahlou","submitted_at":"2025-10-21T10:14:49Z","abstract_excerpt":"Reinforcement learning from human feedback (RLHF) or verifiable rewards (RLVR), the standard paradigm for aligning LLMs or building recent SOTA reasoning models, is highly sensitive to noise from inconsistent or erroneous rewards. Yet, the interaction between such noise and widely used group-based policy optimization methods remains underexplored. We introduce a noise-robust Group Relative Policy Optimization (GRPO) and Done Right GRPO (Dr.GRPO) framework that explicitly models reward corruption as Bernoulli noise. Our method applies noise correction after estimating reward flip probabilities "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.18924","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.18924/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:06:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"+8bABPsluAWrGpPtd5yPJUpToR+qNr73R+Vu6BmiTtFqg5a/WpUo4SyqRpGVg2MtvmmjI6et2ZA4vqzZAzpnAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T15:00:35.888660Z"},"content_sha256":"50b356900e46408bb9ec058d2975e41cc4342a7362d4f9a73f242ffd52ecdc91","schema_version":"1.0","event_id":"sha256:50b356900e46408bb9ec058d2975e41cc4342a7362d4f9a73f242ffd52ecdc91"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/3L3U62UNUP5JUF672ALAUOS35I/bundle.json","state_url":"https://pith.science/pith/3L3U62UNUP5JUF672ALAUOS35I/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/3L3U62UNUP5JUF672ALAUOS35I/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T15:00:35Z","links":{"resolver":"https://pith.science/pith/3L3U62UNUP5JUF672ALAUOS35I","bundle":"https://pith.science/pith/3L3U62UNUP5JUF672ALAUOS35I/bundle.json","state":"https://pith.science/pith/3L3U62UNUP5JUF672ALAUOS35I/state.json","well_known_bundle":"https://pith.science/.well-known/pith/3L3U62UNUP5JUF672ALAUOS35I/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:3L3U62UNUP5JUF672ALAUOS35I","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f6384c670da0b486155d7c78f32a254eb6a1ed617f402ee2082f86134399b3e1","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-21T10:14:49Z","title_canon_sha256":"bfc982ca64f0550efe86d2c8de8b13568cb804ae34495627529931e35cfdc532"},"schema_version":"1.0","source":{"id":"2510.18924","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.18924","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"arxiv_version","alias_value":"2510.18924v3","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.18924","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"pith_short_12","alias_value":"3L3U62UNUP5J","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"pith_short_16","alias_value":"3L3U62UNUP5JUF67","created_at":"2026-05-20T01:06:06Z"},{"alias_kind":"pith_short_8","alias_value":"3L3U62UN","created_at":"2026-05-20T01:06:06Z"}],"graph_snapshots":[{"event_id":"sha256:50b356900e46408bb9ec058d2975e41cc4342a7362d4f9a73f242ffd52ecdc91","target":"graph","created_at":"2026-05-20T01:06:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2510.18924/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Reinforcement learning from human feedback (RLHF) or verifiable rewards (RLVR), the standard paradigm for aligning LLMs or building recent SOTA reasoning models, is highly sensitive to noise from inconsistent or erroneous rewards. Yet, the interaction between such noise and widely used group-based policy optimization methods remains underexplored. We introduce a noise-robust Group Relative Policy Optimization (GRPO) and Done Right GRPO (Dr.GRPO) framework that explicitly models reward corruption as Bernoulli noise. Our method applies noise correction after estimating reward flip probabilities ","authors_text":"Fathinah Asma Izzati, Mohamed El Amine Seddik, Omar El Mansouri, Salem Lahlou","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-21T10:14:49Z","title":"Noise-corrected GRPO: From Noisy Rewards to Unbiased Gradients"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.18924","kind":"arxiv","version":3},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1f49961d984780e36b38c18590e7db9aa9e5d75fa83c402f5f6e469f913a84b6","target":"record","created_at":"2026-05-20T01:06:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f6384c670da0b486155d7c78f32a254eb6a1ed617f402ee2082f86134399b3e1","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-21T10:14:49Z","title_canon_sha256":"bfc982ca64f0550efe86d2c8de8b13568cb804ae34495627529931e35cfdc532"},"schema_version":"1.0","source":{"id":"2510.18924","kind":"arxiv","version":3}},"canonical_sha256":"daf74f6a8da3fa9a17dfd0160a3a5bea20d7788bdc74ff7eab53495b1e1f326b","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"daf74f6a8da3fa9a17dfd0160a3a5bea20d7788bdc74ff7eab53495b1e1f326b","first_computed_at":"2026-05-20T01:06:06.339334Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:06:06.339334Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"e9c3WnWn3BwXbCBCIGpuwtmfnYur6YVCs9k8BnKDJSOz2CrH8QUs8NtwxPzgu4kE8QQfDe9UHu0U3jxZTBn2Cg==","signature_status":"signed_v1","signed_at":"2026-05-20T01:06:06.340432Z","signed_message":"canonical_sha256_bytes"},"source_id":"2510.18924","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1f49961d984780e36b38c18590e7db9aa9e5d75fa83c402f5f6e469f913a84b6","sha256:50b356900e46408bb9ec058d2975e41cc4342a7362d4f9a73f242ffd52ecdc91"],"state_sha256":"3aaebdae653e583a8a7a3fc3879bd643bfb32f49d17273c5da1efff1310cfbd9"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"T4FJAQ4JnWpEvrh70YBtMq904tGkZYEIwj4UX/zdg7g7IWwWBNeTkkIupF8LTPcUz/Bkx/TBLo+1TDlqJGkFDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T15:00:35.890735Z","bundle_sha256":"9a342b98f9d62eb2b501571ddfa5b70c97f7447da38913682fadfa6ca585d4fd"}}