{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:CE5ZU6SWGIGY2PET4BES5OKHEF","short_pith_number":"pith:CE5ZU6SW","canonical_record":{"source":{"id":"2602.04265","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-04T06:55:58Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c48d32d0cf21a054412cc2fd9d973c7771bf25b2909c0a018c5029eb86fbb10b","abstract_canon_sha256":"ea21a5de64b75dabca5c7e6a2f4f5b3acb3e56537ed3ceb5d164e8b737812ba7"},"schema_version":"1.0"},"canonical_sha256":"113b9a7a56320d8d3c93e0492eb947215bbb33dc2317939fdece3e5c3e4780be","source":{"kind":"arxiv","id":"2602.04265","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.04265","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"arxiv_version","alias_value":"2602.04265v3","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.04265","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"pith_short_12","alias_value":"CE5ZU6SWGIGY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"CE5ZU6SWGIGY2PET","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"CE5ZU6SW","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:CE5ZU6SWGIGY2PET4BES5OKHEF","target":"record","payload":{"canonical_record":{"source":{"id":"2602.04265","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-04T06:55:58Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c48d32d0cf21a054412cc2fd9d973c7771bf25b2909c0a018c5029eb86fbb10b","abstract_canon_sha256":"ea21a5de64b75dabca5c7e6a2f4f5b3acb3e56537ed3ceb5d164e8b737812ba7"},"schema_version":"1.0"},"canonical_sha256":"113b9a7a56320d8d3c93e0492eb947215bbb33dc2317939fdece3e5c3e4780be","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:16.377194Z","signature_b64":"nhWV2jHbosriPCUQs/0RVdbBky3bPGvwB6Jzs5e9Mb9gPWaPrad1thsjuCQ8s45wTcTpqC1bs19bcx4u+fDMAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"113b9a7a56320d8d3c93e0492eb947215bbb33dc2317939fdece3e5c3e4780be","last_reissued_at":"2026-05-17T23:39:16.376509Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:16.376509Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.04265","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OksW8kPXLCkoHjUgU0agdZTxqfiBeZ6ugNTsj31GyU7ULv2ctHnqJuJr64JTAgfBEXYV9nQWyrrCSzThsPh1BA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T01:15:41.448890Z"},"content_sha256":"8ba1351342143db6af7cf4dc04bea163c7ef15412494522db237a27c61c90d45","schema_version":"1.0","event_id":"sha256:8ba1351342143db6af7cf4dc04bea163c7ef15412494522db237a27c61c90d45"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:CE5ZU6SWGIGY2PET4BES5OKHEF","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Boosting LLM Reasoning via Human-Inspired Reward Shaping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Gao Huang, Wenze Lin, Xiaoteng Ma, Xitai Jiang, Zhen Yang","submitted_at":"2026-02-04T06:55:58Z","abstract_excerpt":"Reinforcement Learning with Verifiable Rewards (RLVR) has emerged as a promising paradigm for enhancing reasoning in Large Language Models (LLMs). However, existing reward formulations typically treat exploration and consolidation as a monolithic process, resulting in entangled stage-wise learning dynamics. This contradicts the natural learning behavior of human learners. In human learning, individuals adopt distinct behavioral patterns toward mastered versus unfamiliar problems. When confronting unmastered challenges, humans prioritize broad exploration to seek viable solutions. By contrast, "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"T2T significantly outperforms standard GRPO and recent baselines, achieving superior performance on mathematical benchmarks (MATH-500, AIME, AMC) across 5 mainstream LLMs.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that the dual-phase thickening-to-thinning reward mechanism, motivated by human learning patterns, will reliably translate into improved LLM reasoning without introducing training instabilities or overfitting to the specific benchmarks.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"T2T reward framework boosts LLM math reasoning performance by shifting from exploration incentives on errors to length penalties on successes across multiple models and benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"beeea65cf273f36fc72c1c6c74dc36e8242fbf05d12243c35338c3b7f5e73825"},"source":{"id":"2602.04265","kind":"arxiv","version":3},"verdict":{"id":"ad428ae1-ab3f-448f-be8c-d6f2998809c2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T07:15:49.329239Z","strongest_claim":"T2T significantly outperforms standard GRPO and recent baselines, achieving superior performance on mathematical benchmarks (MATH-500, AIME, AMC) across 5 mainstream LLMs.","one_line_summary":"T2T reward framework boosts LLM math reasoning performance by shifting from exploration incentives on errors to length penalties on successes across multiple models and benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that the dual-phase thickening-to-thinning reward mechanism, motivated by human learning patterns, will reliably translate into improved LLM reasoning without introducing training instabilities or overfitting to the specific benchmarks.","pith_extraction_headline":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"72d6647dcffe1f4be8603120f2e9b7c15bd32f9ed444df64920a924459ec5c62"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"ad428ae1-ab3f-448f-be8c-d6f2998809c2"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VjJQxi6TH1Go/aHaeUJh5r/UB6nYpZrRZx67tM36NQoxiye7xGcROumA6dYZnrINXPR5pZwraBGFDW78c6QDCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T01:15:41.449326Z"},"content_sha256":"e162494666bda02ae81ac60bcdcfbd1eb4fb7c068ec4643ce89bf8bb508558d9","schema_version":"1.0","event_id":"sha256:e162494666bda02ae81ac60bcdcfbd1eb4fb7c068ec4643ce89bf8bb508558d9"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/bundle.json","state_url":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T01:15:41Z","links":{"resolver":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF","bundle":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/bundle.json","state":"https://pith.science/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/state.json","well_known_bundle":"https://pith.science/.well-known/pith/CE5ZU6SWGIGY2PET4BES5OKHEF/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:CE5ZU6SWGIGY2PET4BES5OKHEF","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ea21a5de64b75dabca5c7e6a2f4f5b3acb3e56537ed3ceb5d164e8b737812ba7","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-04T06:55:58Z","title_canon_sha256":"c48d32d0cf21a054412cc2fd9d973c7771bf25b2909c0a018c5029eb86fbb10b"},"schema_version":"1.0","source":{"id":"2602.04265","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.04265","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"arxiv_version","alias_value":"2602.04265v3","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.04265","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"pith_short_12","alias_value":"CE5ZU6SWGIGY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"CE5ZU6SWGIGY2PET","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"CE5ZU6SW","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:e162494666bda02ae81ac60bcdcfbd1eb4fb7c068ec4643ce89bf8bb508558d9","target":"graph","created_at":"2026-05-17T23:39:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"T2T significantly outperforms standard GRPO and recent baselines, achieving superior performance on mathematical benchmarks (MATH-500, AIME, AMC) across 5 mainstream LLMs."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that the dual-phase thickening-to-thinning reward mechanism, motivated by human learning patterns, will reliably translate into improved LLM reasoning without introducing training instabilities or overfitting to the specific benchmarks."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"T2T reward framework boosts LLM math reasoning performance by shifting from exploration incentives on errors to length penalties on successes across multiple models and benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation."}],"snapshot_sha256":"beeea65cf273f36fc72c1c6c74dc36e8242fbf05d12243c35338c3b7f5e73825"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"72d6647dcffe1f4be8603120f2e9b7c15bd32f9ed444df64920a924459ec5c62"},"paper":{"abstract_excerpt":"Reinforcement Learning with Verifiable Rewards (RLVR) has emerged as a promising paradigm for enhancing reasoning in Large Language Models (LLMs). However, existing reward formulations typically treat exploration and consolidation as a monolithic process, resulting in entangled stage-wise learning dynamics. This contradicts the natural learning behavior of human learners. In human learning, individuals adopt distinct behavioral patterns toward mastered versus unfamiliar problems. When confronting unmastered challenges, humans prioritize broad exploration to seek viable solutions. By contrast, ","authors_text":"Gao Huang, Wenze Lin, Xiaoteng Ma, Xitai Jiang, Zhen Yang","cross_cats":["cs.AI"],"headline":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-04T06:55:58Z","title":"Boosting LLM Reasoning via Human-Inspired Reward Shaping"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.04265","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T07:15:49.329239Z","id":"ad428ae1-ab3f-448f-be8c-d6f2998809c2","model_set":{"reader":"grok-4.3"},"one_line_summary":"T2T reward framework boosts LLM math reasoning performance by shifting from exploration incentives on errors to length penalties on successes across multiple models and benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"T2T dual-phase rewards improve LLM math reasoning by shifting from broad exploration to concise condensation.","strongest_claim":"T2T significantly outperforms standard GRPO and recent baselines, achieving superior performance on mathematical benchmarks (MATH-500, AIME, AMC) across 5 mainstream LLMs.","weakest_assumption":"The assumption that the dual-phase thickening-to-thinning reward mechanism, motivated by human learning patterns, will reliably translate into improved LLM reasoning without introducing training instabilities or overfitting to the specific benchmarks."}},"verdict_id":"ad428ae1-ab3f-448f-be8c-d6f2998809c2"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:8ba1351342143db6af7cf4dc04bea163c7ef15412494522db237a27c61c90d45","target":"record","created_at":"2026-05-17T23:39:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ea21a5de64b75dabca5c7e6a2f4f5b3acb3e56537ed3ceb5d164e8b737812ba7","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-04T06:55:58Z","title_canon_sha256":"c48d32d0cf21a054412cc2fd9d973c7771bf25b2909c0a018c5029eb86fbb10b"},"schema_version":"1.0","source":{"id":"2602.04265","kind":"arxiv","version":3}},"canonical_sha256":"113b9a7a56320d8d3c93e0492eb947215bbb33dc2317939fdece3e5c3e4780be","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"113b9a7a56320d8d3c93e0492eb947215bbb33dc2317939fdece3e5c3e4780be","first_computed_at":"2026-05-17T23:39:16.376509Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:16.376509Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"nhWV2jHbosriPCUQs/0RVdbBky3bPGvwB6Jzs5e9Mb9gPWaPrad1thsjuCQ8s45wTcTpqC1bs19bcx4u+fDMAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:16.377194Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.04265","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:8ba1351342143db6af7cf4dc04bea163c7ef15412494522db237a27c61c90d45","sha256:e162494666bda02ae81ac60bcdcfbd1eb4fb7c068ec4643ce89bf8bb508558d9"],"state_sha256":"9e348db13af7ce927ae50e1c38a45c749867d679a5f065a3a81e4043809a6a1a"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"dt2j2yhw+Adk2d1HFTN1TNmSHrdvXobOOPFxyJCT8gMtVTCt2X3jPrR1wbfxQTkUEFOPFlySbfYs2BseXzJlCA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T01:15:41.451402Z","bundle_sha256":"c4b5a2bc9d1ecaca9b82f8b714ff5e6a5b9dbb67fb1012f6ed55fa76afc5a20e"}}