{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:EMHCOMD2XZCIYW3CLHAGC32DRL","short_pith_number":"pith:EMHCOMD2","canonical_record":{"source":{"id":"2601.22478","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-30T02:43:29Z","cross_cats_sorted":[],"title_canon_sha256":"97d11adf4cf025a2b6cffb03864adc93d7e01fb7365007886fd97144780b2129","abstract_canon_sha256":"629044055f32005f5e10844de48f60721ced72c81ec6a8d238e98998d4435d5a"},"schema_version":"1.0"},"canonical_sha256":"230e27307abe448c5b6259c0616f438ace50ec874fdb19ca78e4f48447a49b66","source":{"kind":"arxiv","id":"2601.22478","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.22478","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"arxiv_version","alias_value":"2601.22478v4","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.22478","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_12","alias_value":"EMHCOMD2XZCI","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_16","alias_value":"EMHCOMD2XZCIYW3C","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_8","alias_value":"EMHCOMD2","created_at":"2026-05-20T00:03:03Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:EMHCOMD2XZCIYW3CLHAGC32DRL","target":"record","payload":{"canonical_record":{"source":{"id":"2601.22478","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-30T02:43:29Z","cross_cats_sorted":[],"title_canon_sha256":"97d11adf4cf025a2b6cffb03864adc93d7e01fb7365007886fd97144780b2129","abstract_canon_sha256":"629044055f32005f5e10844de48f60721ced72c81ec6a8d238e98998d4435d5a"},"schema_version":"1.0"},"canonical_sha256":"230e27307abe448c5b6259c0616f438ace50ec874fdb19ca78e4f48447a49b66","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:03:03.642328Z","signature_b64":"fTGdap+3zlTsZ2zqrUlCDCmwG7D0ohzKKKkKIwcIE6cZCCoOsZ9IeSZVxAWYPcQn2L1hJahMs00JEl5lExVmCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"230e27307abe448c5b6259c0616f438ace50ec874fdb19ca78e4f48447a49b66","last_reissued_at":"2026-05-20T00:03:03.641485Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:03:03.641485Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2601.22478","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:03:03Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"DAiTMdIwpTKxspkW8rp7eND+M58MiV3a2wYy2V3NEebEzn34jCNI4e272EeA4XYMQ79zrOsqB/MYAxhEf6iJBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T01:36:55.621762Z"},"content_sha256":"536a3e756cd439365dec9401d4d4769940d40d4e67910929218b0831ab7806bf","schema_version":"1.0","event_id":"sha256:536a3e756cd439365dec9401d4d4769940d40d4e67910929218b0831ab7806bf"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:EMHCOMD2XZCIYW3CLHAGC32DRL","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Transformation-Augmented GRPO for Enhancing Exploration in Reasoning of Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Augmenting GRPO training with automatic rephrasings of each question improves pass rates on competition math and science benchmarks by enabling mixed rewards and diverse reasoning paths.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Chi-Heng Lin, Khiem Le, Nitesh V. Chawla, Phuc Nguyen, Shangqian Gao, Ting Hua, Youssef Mroueh","submitted_at":"2026-01-30T02:43:29Z","abstract_excerpt":"Group Relative Policy Optimization (GRPO) has become the dominant method for reinforcement learning with verifiable rewards in large language models, but it suffers from two critical limitations: gradient vanishing and diversity collapse. When training questions are too easy or too hard, all sampled responses receive identical rewards, yielding zero gradients. Meanwhile, the model tends to collapse its responses toward a single reasoning pattern rather than exploring diverse strategies. We propose Transformation-Augmented GRPO (TA-GRPO), a simple but effective method that addresses both issues"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"TA-GRPO consistently improves pass@k on competition-level benchmarks (AMC, OlympiadBench, AIME24, AIME25) and out-of-distribution benchmarks (Minerva, GPQA-Diamond). Notably, it improves the average pass@32 of Qwen3-1.7B and Qwen3-4B by 4.97 and 4.34 points, respectively, and matches the exploration quality of baselines trained on up to 2.5× more data.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The automatically generated rephrasings preserve semantic equivalence while meaningfully shifting the model's perceived difficulty, and that aligning importance ratios to the original question while computing advantages over the pooled set produces stable and beneficial policy updates without introducing bias or instability.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"TA-GRPO improves exploration in GRPO by rephrasing questions to mix rewards and reasoning paths, raising pass@32 scores by 4-5 points on math benchmarks while matching models trained on 2.5x more data.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Augmenting GRPO training with automatic rephrasings of each question improves pass rates on competition math and science benchmarks by enabling mixed rewards and diverse reasoning paths.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3ffb8fa9d7bb66134a9a6f0765ea5955adf48b82a0d3de57f7665dcf75a183fc"},"source":{"id":"2601.22478","kind":"arxiv","version":4},"verdict":{"id":"06326791-3016-4339-aaac-dcc9880fa27e","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T09:38:57.936853Z","strongest_claim":"TA-GRPO consistently improves pass@k on competition-level benchmarks (AMC, OlympiadBench, AIME24, AIME25) and out-of-distribution benchmarks (Minerva, GPQA-Diamond). Notably, it improves the average pass@32 of Qwen3-1.7B and Qwen3-4B by 4.97 and 4.34 points, respectively, and matches the exploration quality of baselines trained on up to 2.5× more data.","one_line_summary":"TA-GRPO improves exploration in GRPO by rephrasing questions to mix rewards and reasoning paths, raising pass@32 scores by 4-5 points on math benchmarks while matching models trained on 2.5x more data.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The automatically generated rephrasings preserve semantic equivalence while meaningfully shifting the model's perceived difficulty, and that aligning importance ratios to the original question while computing advantages over the pooled set produces stable and beneficial policy updates without introducing bias or instability.","pith_extraction_headline":"Augmenting GRPO training with automatic rephrasings of each question improves pass rates on competition math and science benchmarks by enabling mixed rewards and diverse reasoning paths."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.22478/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"06326791-3016-4339-aaac-dcc9880fa27e"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:03:03Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tE6P5HQW9G+mGRj1DFzRVA1okbw2iEvtKFIPwQjQxQ6s7GE6pDxcKSmFn2m58hwbG0V66wRMX6oJux8mCI2cBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T01:36:55.622741Z"},"content_sha256":"2de496f6ec0d30a97faa0f815539783661cd84221f31bf49061d984054d81039","schema_version":"1.0","event_id":"sha256:2de496f6ec0d30a97faa0f815539783661cd84221f31bf49061d984054d81039"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/EMHCOMD2XZCIYW3CLHAGC32DRL/bundle.json","state_url":"https://pith.science/pith/EMHCOMD2XZCIYW3CLHAGC32DRL/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/EMHCOMD2XZCIYW3CLHAGC32DRL/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T01:36:55Z","links":{"resolver":"https://pith.science/pith/EMHCOMD2XZCIYW3CLHAGC32DRL","bundle":"https://pith.science/pith/EMHCOMD2XZCIYW3CLHAGC32DRL/bundle.json","state":"https://pith.science/pith/EMHCOMD2XZCIYW3CLHAGC32DRL/state.json","well_known_bundle":"https://pith.science/.well-known/pith/EMHCOMD2XZCIYW3CLHAGC32DRL/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:EMHCOMD2XZCIYW3CLHAGC32DRL","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"629044055f32005f5e10844de48f60721ced72c81ec6a8d238e98998d4435d5a","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-30T02:43:29Z","title_canon_sha256":"97d11adf4cf025a2b6cffb03864adc93d7e01fb7365007886fd97144780b2129"},"schema_version":"1.0","source":{"id":"2601.22478","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.22478","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"arxiv_version","alias_value":"2601.22478v4","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.22478","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_12","alias_value":"EMHCOMD2XZCI","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_16","alias_value":"EMHCOMD2XZCIYW3C","created_at":"2026-05-20T00:03:03Z"},{"alias_kind":"pith_short_8","alias_value":"EMHCOMD2","created_at":"2026-05-20T00:03:03Z"}],"graph_snapshots":[{"event_id":"sha256:2de496f6ec0d30a97faa0f815539783661cd84221f31bf49061d984054d81039","target":"graph","created_at":"2026-05-20T00:03:03Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"TA-GRPO consistently improves pass@k on competition-level benchmarks (AMC, OlympiadBench, AIME24, AIME25) and out-of-distribution benchmarks (Minerva, GPQA-Diamond). Notably, it improves the average pass@32 of Qwen3-1.7B and Qwen3-4B by 4.97 and 4.34 points, respectively, and matches the exploration quality of baselines trained on up to 2.5× more data."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The automatically generated rephrasings preserve semantic equivalence while meaningfully shifting the model's perceived difficulty, and that aligning importance ratios to the original question while computing advantages over the pooled set produces stable and beneficial policy updates without introducing bias or instability."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"TA-GRPO improves exploration in GRPO by rephrasing questions to mix rewards and reasoning paths, raising pass@32 scores by 4-5 points on math benchmarks while matching models trained on 2.5x more data."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Augmenting GRPO training with automatic rephrasings of each question improves pass rates on competition math and science benchmarks by enabling mixed rewards and diverse reasoning paths."}],"snapshot_sha256":"3ffb8fa9d7bb66134a9a6f0765ea5955adf48b82a0d3de57f7665dcf75a183fc"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2601.22478/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Group Relative Policy Optimization (GRPO) has become the dominant method for reinforcement learning with verifiable rewards in large language models, but it suffers from two critical limitations: gradient vanishing and diversity collapse. When training questions are too easy or too hard, all sampled responses receive identical rewards, yielding zero gradients. Meanwhile, the model tends to collapse its responses toward a single reasoning pattern rather than exploring diverse strategies. We propose Transformation-Augmented GRPO (TA-GRPO), a simple but effective method that addresses both issues","authors_text":"Chi-Heng Lin, Khiem Le, Nitesh V. Chawla, Phuc Nguyen, Shangqian Gao, Ting Hua, Youssef Mroueh","cross_cats":[],"headline":"Augmenting GRPO training with automatic rephrasings of each question improves pass rates on competition math and science benchmarks by enabling mixed rewards and diverse reasoning paths.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-30T02:43:29Z","title":"Transformation-Augmented GRPO for Enhancing Exploration in Reasoning of Large Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.22478","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T09:38:57.936853Z","id":"06326791-3016-4339-aaac-dcc9880fa27e","model_set":{"reader":"grok-4.3"},"one_line_summary":"TA-GRPO improves exploration in GRPO by rephrasing questions to mix rewards and reasoning paths, raising pass@32 scores by 4-5 points on math benchmarks while matching models trained on 2.5x more data.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Augmenting GRPO training with automatic rephrasings of each question improves pass rates on competition math and science benchmarks by enabling mixed rewards and diverse reasoning paths.","strongest_claim":"TA-GRPO consistently improves pass@k on competition-level benchmarks (AMC, OlympiadBench, AIME24, AIME25) and out-of-distribution benchmarks (Minerva, GPQA-Diamond). Notably, it improves the average pass@32 of Qwen3-1.7B and Qwen3-4B by 4.97 and 4.34 points, respectively, and matches the exploration quality of baselines trained on up to 2.5× more data.","weakest_assumption":"The automatically generated rephrasings preserve semantic equivalence while meaningfully shifting the model's perceived difficulty, and that aligning importance ratios to the original question while computing advantages over the pooled set produces stable and beneficial policy updates without introducing bias or instability."}},"verdict_id":"06326791-3016-4339-aaac-dcc9880fa27e"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:536a3e756cd439365dec9401d4d4769940d40d4e67910929218b0831ab7806bf","target":"record","created_at":"2026-05-20T00:03:03Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"629044055f32005f5e10844de48f60721ced72c81ec6a8d238e98998d4435d5a","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-30T02:43:29Z","title_canon_sha256":"97d11adf4cf025a2b6cffb03864adc93d7e01fb7365007886fd97144780b2129"},"schema_version":"1.0","source":{"id":"2601.22478","kind":"arxiv","version":4}},"canonical_sha256":"230e27307abe448c5b6259c0616f438ace50ec874fdb19ca78e4f48447a49b66","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"230e27307abe448c5b6259c0616f438ace50ec874fdb19ca78e4f48447a49b66","first_computed_at":"2026-05-20T00:03:03.641485Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:03:03.641485Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"fTGdap+3zlTsZ2zqrUlCDCmwG7D0ohzKKKkKIwcIE6cZCCoOsZ9IeSZVxAWYPcQn2L1hJahMs00JEl5lExVmCA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:03:03.642328Z","signed_message":"canonical_sha256_bytes"},"source_id":"2601.22478","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:536a3e756cd439365dec9401d4d4769940d40d4e67910929218b0831ab7806bf","sha256:2de496f6ec0d30a97faa0f815539783661cd84221f31bf49061d984054d81039"],"state_sha256":"ee6c63382ec31c4fa1f06a7ff40b001f58544a25c2edd6bd6664b709ca7f717d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"K/RchPYiwHu/8R4IZSwWC1X60L3VKmA7NItLrN5vbNUO6YKux7VDPGxnDyu7SRlmHBDM12iselRSPuyTeQCCBA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T01:36:55.627003Z","bundle_sha256":"a0de7e2d6a686123e858d34ec68e749732b0b22bd354f56a4a11cc5e2aece7b2"}}