{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:DCUGBOVH4X7AOLTNYRNNTYVDSM","short_pith_number":"pith:DCUGBOVH","schema_version":"1.0","canonical_sha256":"18a860baa7e5fe072e6dc45ad9e2a3933b005082d791b4c1d229328c30a609cf","source":{"kind":"arxiv","id":"2504.16084","version":3},"attestation_state":"computed","paper":{"title":"TTRL: Test-Time Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Biqing Qi, Bowen Zhou, Ermo Hua, Ganqu Cui, Haozhan Li, Kaiyan Zhang, Lifan Yuan, Li Sheng, Ning Ding, Shang Qu, Xinwei Long, Xuekai Zhu, Youbang Sun, Yuchen Zhang, Yuxin Zuo, Zhiyuan Ma","submitted_at":"2025-04-22T17:59:56Z","abstract_excerpt":"This paper investigates Reinforcement Learning (RL) on data without explicit labels for reasoning tasks in Large Language Models (LLMs). The core challenge of the problem is reward estimation during inference while not having access to ground-truth information. While this setting appears elusive, we find that common practices in Test-Time Scaling (TTS), such as majority voting, yield surprisingly effective rewards suitable for driving RL training. In this work, we introduce Test-Time Reinforcement Learning (TTRL), a novel method for training LLMs using RL on unlabeled data. TTRL enables self-e"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":true},"canonical_record":{"source":{"id":"2504.16084","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-04-22T17:59:56Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"d4d589bc9a2a2b582c5f2586c59afb84a0b79a0a71a479fb110208a5754aaf83","abstract_canon_sha256":"a67ddc1e54252959d60b587e637eac83185a9cf4d0ef9e415b12b836eb2efb19"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:19.659443Z","signature_b64":"+w95xRQ2mbYJZTC8LIVkyxm7AUvSM52CTPqcjGRVJWx32uJe2dOtjGUfDG2B6FSRRFtfoJ80fVBmN69JYbj6Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"18a860baa7e5fe072e6dc45ad9e2a3933b005082d791b4c1d229328c30a609cf","last_reissued_at":"2026-05-17T23:39:19.658694Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:19.658694Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"TTRL: Test-Time Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Biqing Qi, Bowen Zhou, Ermo Hua, Ganqu Cui, Haozhan Li, Kaiyan Zhang, Lifan Yuan, Li Sheng, Ning Ding, Shang Qu, Xinwei Long, Xuekai Zhu, Youbang Sun, Yuchen Zhang, Yuxin Zuo, Zhiyuan Ma","submitted_at":"2025-04-22T17:59:56Z","abstract_excerpt":"This paper investigates Reinforcement Learning (RL) on data without explicit labels for reasoning tasks in Large Language Models (LLMs). The core challenge of the problem is reward estimation during inference while not having access to ground-truth information. While this setting appears elusive, we find that common practices in Test-Time Scaling (TTS), such as majority voting, yield surprisingly effective rewards suitable for driving RL training. In this work, we introduce Test-Time Reinforcement Learning (TTRL), a novel method for training LLMs using RL on unlabeled data. TTRL enables self-e"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"TTRL boosts the pass@1 performance of Qwen-2.5-Math-7B by approximately 211% on the AIME 2024 with only unlabeled test data. Furthermore, although TTRL is only supervised by the maj@n metric, TTRL has demonstrated performance to consistently surpass the upper limit of the initial model maj@n.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"Common practices in Test-Time Scaling, such as majority voting, yield surprisingly effective rewards suitable for driving RL training on data without explicit labels.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"TTRL lets LLMs self-improve on reasoning tasks via RL driven by majority-voting rewards from unlabeled test data, yielding large gains such as a 211% boost in pass@1 on AIME 2024.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b59312f85ef1d3eb8c91280bd8752f2a9c26992921476622b4181ca0b5851143"},"source":{"id":"2504.16084","kind":"arxiv","version":3},"verdict":{"id":"336da23a-1ce1-4b44-804b-e96fa496d0c5","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T00:42:42.815218Z","strongest_claim":"TTRL boosts the pass@1 performance of Qwen-2.5-Math-7B by approximately 211% on the AIME 2024 with only unlabeled test data. Furthermore, although TTRL is only supervised by the maj@n metric, TTRL has demonstrated performance to consistently surpass the upper limit of the initial model maj@n.","one_line_summary":"TTRL lets LLMs self-improve on reasoning tasks via RL driven by majority-voting rewards from unlabeled test data, yielding large gains such as a 211% boost in pass@1 on AIME 2024.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"Common practices in Test-Time Scaling, such as majority voting, yield surprisingly effective rewards suitable for driving RL training on data without explicit labels.","pith_extraction_headline":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e242438502ebe4e0f4fb40b59a2191b716814a2513a7855613be3d56201828e4"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2504.16084","created_at":"2026-05-17T23:39:19.658827+00:00"},{"alias_kind":"arxiv_version","alias_value":"2504.16084v3","created_at":"2026-05-17T23:39:19.658827+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.16084","created_at":"2026-05-17T23:39:19.658827+00:00"},{"alias_kind":"pith_short_12","alias_value":"DCUGBOVH4X7A","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"DCUGBOVH4X7AOLTN","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"DCUGBOVH","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":41,"internal_anchor_count":21,"sample":[{"citing_arxiv_id":"2602.12579","citing_title":"VI-CuRL: Stabilizing Verifier-Independent RL Reasoning via Confidence-Guided Variance Reduction","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2508.08636","citing_title":"InternBootcamp Technical Report: Boosting LLM Reasoning with Verifiable Task Scaling","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2509.23183","citing_title":"ZeroSiam: An Efficient Asymmetry for Test-Time Entropy Optimization without Collapse","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20189","citing_title":"SOLAR: A Self-Optimizing Open-Ended Autonomous Agent for Lifelong Learning and Continual Adaptation","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20914","citing_title":"RISE: Reliable Improvement in Self-Evolving Vision-Language Models","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19444","citing_title":"Detecting and Mitigating the Correct-Answer Extinction Window in Test-Time Reinforcement Learning with Majority Voting","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2506.13351","citing_title":"Direct Reasoning Optimization: Token-Level Reasoning Reflectivity Meets Rubric Gates for Unverifiable Tasks","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2507.18809","citing_title":"Test-time Offline Reinforcement Learning on Goal-related Experience","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2509.02547","citing_title":"The Landscape of Agentic Reinforcement Learning for LLMs: A Survey","ref_index":175,"is_internal_anchor":true},{"citing_arxiv_id":"2505.15134","citing_title":"The Unreasonable Effectiveness of Entropy Minimization in LLM Reasoning","ref_index":103,"is_internal_anchor":true},{"citing_arxiv_id":"2509.14234","citing_title":"Compute as Teacher: Turning Inference Compute Into Reference-Free Supervision","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2511.09907","citing_title":"Learning to Pose Problems: Reasoning-Driven and Solver-Adaptive Data Synthesis","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2505.22312","citing_title":"Skywork Open Reasoner 1 Technical Report","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2512.09756","citing_title":"MOA: Multi-Objective Alignment for Role-Playing Agents","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2601.15808","citing_title":"Inference-Time Scaling of Verification: Self-Evolving Deep Research Agents via Test-Time Rubric-Guided Verification","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2601.16175","citing_title":"Learning to Discover at Test Time","ref_index":88,"is_internal_anchor":true},{"citing_arxiv_id":"2504.14945","citing_title":"Learning to Reason under Off-Policy Guidance","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2504.20571","citing_title":"Reinforcement Learning for Reasoning in Large Language Models with One Training Example","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2603.00918","citing_title":"Improving Text-to-Image Generation with Intrinsic Self-Confidence Rewards","ref_index":93,"is_internal_anchor":true},{"citing_arxiv_id":"2603.03197","citing_title":"Specificity-aware reinforcement learning for fine-grained open-world classification","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22241","citing_title":"MemDLM: Memory-Enhanced DLM Training","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2603.27977","citing_title":"SARL: Label-Free Reinforcement Learning by Rewarding Reasoning Topology","ref_index":34,"is_internal_anchor":false},{"citing_arxiv_id":"2604.02721","citing_title":"GrandCode: Achieving Grandmaster Level in Competitive Programming via Agentic Reinforcement Learning","ref_index":2,"is_internal_anchor":false},{"citing_arxiv_id":"2505.03335","citing_title":"Absolute Zero: Reinforced Self-play Reasoning with Zero Data","ref_index":3,"is_internal_anchor":false},{"citing_arxiv_id":"2604.03993","citing_title":"Can LLMs Learn to Reason Robustly under Noisy Supervision?","ref_index":37,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM","json":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM.json","graph_json":"https://pith.science/api/pith-number/DCUGBOVH4X7AOLTNYRNNTYVDSM/graph.json","events_json":"https://pith.science/api/pith-number/DCUGBOVH4X7AOLTNYRNNTYVDSM/events.json","paper":"https://pith.science/paper/DCUGBOVH"},"agent_actions":{"view_html":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM","download_json":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM.json","view_paper":"https://pith.science/paper/DCUGBOVH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2504.16084&json=true","fetch_graph":"https://pith.science/api/pith-number/DCUGBOVH4X7AOLTNYRNNTYVDSM/graph.json","fetch_events":"https://pith.science/api/pith-number/DCUGBOVH4X7AOLTNYRNNTYVDSM/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/action/timestamp_anchor","attest_storage":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/action/storage_attestation","attest_author":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/action/author_attestation","sign_citation":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/action/citation_signature","submit_replication":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/action/replication_record"}},"created_at":"2026-05-17T23:39:19.658827+00:00","updated_at":"2026-05-17T23:39:19.658827+00:00"}