{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:BZ4MMIKSNS3Y2QU4GFWSFGHP7B","short_pith_number":"pith:BZ4MMIKS","schema_version":"1.0","canonical_sha256":"0e78c621526cb78d429c316d2298eff85c02c39ba301d1aa7a41fa2e45ffc6d1","source":{"kind":"arxiv","id":"2506.03106","version":7},"attestation_state":"computed","paper":{"title":"Critique-GRPO: Advancing LLM Reasoning with Natural Language and Numerical Feedback","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Chaochao Lu, Chao Yang, Hao Sun, Helen Meng, Kaituo Feng, Xiaoying Zhang, Yipeng Zhang","submitted_at":"2025-06-03T17:39:02Z","abstract_excerpt":"Recent advances in reinforcement learning (RL) using numerical rewards have significantly enhanced the complex reasoning capabilities of large language models (LLMs). However, we identify three fundamental limitations of purely numerical feedback: performance plateaus, ineffective spontaneous self-reflection, and persistent failures. We show that plateaued RL models can successfully refine failed solutions when given natural language critiques. Motivated by this, we propose Critique-GRPO, an online RL framework that integrates both natural language and numerical feedback for policy optimizatio"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2506.03106","kind":"arxiv","version":7},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-06-03T17:39:02Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a12936790398c6ef3d7016d900302e7c33ec652e8d9db26f46a16269c988ee43","abstract_canon_sha256":"8ad3cc14d6d5e64bf6199daf66dbcd63fc08045ff6f358bade462faea0fe7be6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T01:05:06.911709Z","signature_b64":"UEQrrNdw0Xi5qcXmPeXB119t7E3VyJtxDh6jyeFz08FeHAuw+mOt4jyMESjCTz9FUV6k/o9b1vKsuDXfuPkOAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0e78c621526cb78d429c316d2298eff85c02c39ba301d1aa7a41fa2e45ffc6d1","last_reissued_at":"2026-06-09T01:05:06.911024Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T01:05:06.911024Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Critique-GRPO: Advancing LLM Reasoning with Natural Language and Numerical Feedback","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Chaochao Lu, Chao Yang, Hao Sun, Helen Meng, Kaituo Feng, Xiaoying Zhang, Yipeng Zhang","submitted_at":"2025-06-03T17:39:02Z","abstract_excerpt":"Recent advances in reinforcement learning (RL) using numerical rewards have significantly enhanced the complex reasoning capabilities of large language models (LLMs). However, we identify three fundamental limitations of purely numerical feedback: performance plateaus, ineffective spontaneous self-reflection, and persistent failures. We show that plateaued RL models can successfully refine failed solutions when given natural language critiques. Motivated by this, we propose Critique-GRPO, an online RL framework that integrates both natural language and numerical feedback for policy optimizatio"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2506.03106","kind":"arxiv","version":7},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2506.03106/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2506.03106","created_at":"2026-06-09T01:05:06.911104+00:00"},{"alias_kind":"arxiv_version","alias_value":"2506.03106v7","created_at":"2026-06-09T01:05:06.911104+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.03106","created_at":"2026-06-09T01:05:06.911104+00:00"},{"alias_kind":"pith_short_12","alias_value":"BZ4MMIKSNS3Y","created_at":"2026-06-09T01:05:06.911104+00:00"},{"alias_kind":"pith_short_16","alias_value":"BZ4MMIKSNS3Y2QU4","created_at":"2026-06-09T01:05:06.911104+00:00"},{"alias_kind":"pith_short_8","alias_value":"BZ4MMIKS","created_at":"2026-06-09T01:05:06.911104+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":17,"sample":[{"citing_arxiv_id":"2603.28767","citing_title":"Gen-Searcher: Reinforcing Agentic Search for Image Generation","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20506","citing_title":"Reinforcing Human Behavior Simulation via Verbal Feedback","ref_index":79,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18799","citing_title":"ReCrit: Transition-Aware Reinforcement Learning for Scientific Critic Reasoning","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18851","citing_title":"STRIDE: Learnable Stepwise Language Feedback for LLM Reasoning","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17877","citing_title":"PAIR: Prefix-Aware Internal Reward Model for Multi-Turn Agent Optimization","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15224","citing_title":"ICRL: Learning to Internalize Self-Critique with Reinforcement Learning","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2512.03043","citing_title":"OneThinker: All-in-one Reasoning Model for Image and Video","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2512.16918","citing_title":"AdaTooler-V: Adaptive Tool-Use for Images and Videos","ref_index":80,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12652","citing_title":"Multi-Rollout On-Policy Distillation via Peer Successes and Failures","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28767","citing_title":"Gen-Searcher: Reinforcing Agentic Search for Image Generation","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06130","citing_title":"Skill1: Unified Evolution of Skill-Augmented Agents via Reinforcement Learning","ref_index":92,"is_internal_anchor":true},{"citing_arxiv_id":"2503.21776","citing_title":"Video-R1: Reinforcing Video Reasoning in MLLMs","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08666","citing_title":"The Cancellation Hypothesis in Critic-Free RL: From Outcome Rewards to Token Credits","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23318","citing_title":"Hidden States Know Where Reasoning Diverges: Credit Assignment via Span-Level Wasserstein Distance","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06130","citing_title":"Skill1: Unified Evolution of Skill-Augmented Agents via Reinforcement Learning","ref_index":92,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06130","citing_title":"Skill1: Unified Evolution of Skill-Augmented Agents via Reinforcement Learning","ref_index":92,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07941","citing_title":"Large Language Model Post-Training: A Unified View of Off-Policy and On-Policy Learning","ref_index":106,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B","json":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B.json","graph_json":"https://pith.science/api/pith-number/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/graph.json","events_json":"https://pith.science/api/pith-number/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/events.json","paper":"https://pith.science/paper/BZ4MMIKS"},"agent_actions":{"view_html":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B","download_json":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B.json","view_paper":"https://pith.science/paper/BZ4MMIKS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2506.03106&json=true","fetch_graph":"https://pith.science/api/pith-number/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/graph.json","fetch_events":"https://pith.science/api/pith-number/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/action/timestamp_anchor","attest_storage":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/action/storage_attestation","attest_author":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/action/author_attestation","sign_citation":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/action/citation_signature","submit_replication":"https://pith.science/pith/BZ4MMIKSNS3Y2QU4GFWSFGHP7B/action/replication_record"}},"created_at":"2026-06-09T01:05:06.911104+00:00","updated_at":"2026-06-09T01:05:06.911104+00:00"}