{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:CE5ZU6SWGIGY2PET4BES5OKHEF","short_pith_number":"pith:CE5ZU6SW","schema_version":"1.0","canonical_sha256":"113b9a7a56320d8d3c93e0492eb947215bbb33dc2317939fdece3e5c3e4780be","source":{"kind":"arxiv","id":"2602.04265","version":3},"attestation_state":"computed","paper":{"title":"Boosting LLM Reasoning via Human-Inspired Reward Shaping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Gao Huang, Wenze Lin, Xiaoteng Ma, Xitai Jiang, Zhen Yang","submitted_at":"2026-02-04T06:55:58Z","abstract_excerpt":"Reinforcement Learning with Verifiable Rewards (RLVR) has emerged as a promising paradigm for enhancing reasoning in Large Language Models (LLMs). However, existing reward formulations typically treat exploration and consolidation as a monolithic process, resulting in entangled stage-wise learning dynamics. This contradicts the natural learning behavior of human learners. In human learning, individuals adopt distinct behavioral patterns toward mastered versus unfamiliar problems. When confronting unmastered challenges, humans prioritize broad exploration to seek viable solutions. By contrast, "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":true},"canonical_record":{"source":{"id":"2602.04265","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-04T06:55:58Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c48d32d0cf21a054412cc2fd9d973c7771bf25b2909c0a018c5029eb86fbb10b","abstract_canon_sha256":"ea21a5de64b75dabca5c7e6a2f4f5b3acb3e56537ed3ceb5d164e8b737812ba7"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:16.377194Z","signature_b64":"nhWV2jHbosriPCUQs/0RVdbBky3bPGvwB6Jzs5e9Mb9gPWaPrad1thsjuCQ8s45wTcTpqC1bs19bcx4u+fDMAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"113b9a7a56320d8d3c93e0492eb947215bbb33dc2317939fdece3e5c3e4780be","last_reissued_at":"2026-05-17T23:39:16.376509Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:16.376509Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Boosting LLM Reasoning via Human-Inspired Reward Shaping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Gao Huang, Wenze Lin, Xiaoteng Ma, Xitai Jiang, Zhen Yang","submitted_at":"2026-02-04T06:55:58Z","abstract_excerpt":"Reinforcement Learning with Verifiable Rewards (RLVR) has emerged as a promising paradigm for enhancing reasoning in Large Language Models (LLMs). However, existing reward formulations typically treat exploration and consolidation as a monolithic process, resulting in entangled stage-wise learning dynamics. This contradicts the natural learning behavior of human learners. In human learning, individuals adopt distinct behavioral patterns toward mastered versus unfamiliar problems. When confronting unmastered challenges, humans prioritize broad exploration to seek viable solutions. By contrast, "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"T2T significantly outperforms standard GRPO and recent baselines, achieving superior performance on mathematical benchmarks (MATH-500, AIME, AMC) across 5 mainstream LLMs.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that the dual-phase thickening-to-thinning reward mechanism, motivated by human learning patterns, will reliably translate into improved LLM reasoning without introducing training instabilities or overfitting to the specific benchmarks.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"T2T reward framework boosts LLM math reasoning performance by shifting from exploration incentives on errors to length penalties on successes across multiple models and benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"beeea65cf273f36fc72c1c6c74dc36e8242fbf05d12243c35338c3b7f5e73825"},"source":{"id":"2602.04265","kind":"arxiv","version":3},"verdict":{"id":"ad428ae1-ab3f-448f-be8c-d6f2998809c2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T07:15:49.329239Z","strongest_claim":"T2T significantly outperforms standard GRPO and recent baselines, achieving superior performance on mathematical benchmarks (MATH-500, AIME, AMC) across 5 mainstream LLMs.","one_line_summary":"T2T reward framework boosts LLM math reasoning performance by shifting from exploration incentives on errors to length penalties on successes across multiple models and benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that the dual-phase thickening-to-thinning reward mechanism, motivated by human learning patterns, will reliably translate into improved LLM reasoning without introducing training instabilities or overfitting to the specific benchmarks.","pith_extraction_headline":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"72d6647dcffe1f4be8603120f2e9b7c15bd32f9ed444df64920a924459ec5c62"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.04265","created_at":"2026-05-17T23:39:16.376634+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.04265v3","created_at":"2026-05-17T23:39:16.376634+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.04265","created_at":"2026-05-17T23:39:16.376634+00:00"},{"alias_kind":"pith_short_12","alias_value":"CE5ZU6SWGIGY","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"CE5ZU6SWGIGY2PET","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"CE5ZU6SW","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2604.17328","citing_title":"Rethinking the Comparison Unit in Sequence-Level Reinforcement Learning: An Equal-Length Paired Training Framework from Loss Correction to Sample Construction","ref_index":23,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF","json":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF.json","graph_json":"https://pith.science/api/pith-number/CE5ZU6SWGIGY2PET4BES5OKHEF/graph.json","events_json":"https://pith.science/api/pith-number/CE5ZU6SWGIGY2PET4BES5OKHEF/events.json","paper":"https://pith.science/paper/CE5ZU6SW"},"agent_actions":{"view_html":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF","download_json":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF.json","view_paper":"https://pith.science/paper/CE5ZU6SW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.04265&json=true","fetch_graph":"https://pith.science/api/pith-number/CE5ZU6SWGIGY2PET4BES5OKHEF/graph.json","fetch_events":"https://pith.science/api/pith-number/CE5ZU6SWGIGY2PET4BES5OKHEF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/action/storage_attestation","attest_author":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/action/author_attestation","sign_citation":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/action/citation_signature","submit_replication":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/action/replication_record"}},"created_at":"2026-05-17T23:39:16.376634+00:00","updated_at":"2026-05-17T23:39:16.376634+00:00"}