{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:DCUGBOVH4X7AOLTNYRNNTYVDSM","short_pith_number":"pith:DCUGBOVH","canonical_record":{"source":{"id":"2504.16084","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-04-22T17:59:56Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"d4d589bc9a2a2b582c5f2586c59afb84a0b79a0a71a479fb110208a5754aaf83","abstract_canon_sha256":"a67ddc1e54252959d60b587e637eac83185a9cf4d0ef9e415b12b836eb2efb19"},"schema_version":"1.0"},"canonical_sha256":"18a860baa7e5fe072e6dc45ad9e2a3933b005082d791b4c1d229328c30a609cf","source":{"kind":"arxiv","id":"2504.16084","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.16084","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"arxiv_version","alias_value":"2504.16084v3","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.16084","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"pith_short_12","alias_value":"DCUGBOVH4X7A","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"DCUGBOVH4X7AOLTN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"DCUGBOVH","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:DCUGBOVH4X7AOLTNYRNNTYVDSM","target":"record","payload":{"canonical_record":{"source":{"id":"2504.16084","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-04-22T17:59:56Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"d4d589bc9a2a2b582c5f2586c59afb84a0b79a0a71a479fb110208a5754aaf83","abstract_canon_sha256":"a67ddc1e54252959d60b587e637eac83185a9cf4d0ef9e415b12b836eb2efb19"},"schema_version":"1.0"},"canonical_sha256":"18a860baa7e5fe072e6dc45ad9e2a3933b005082d791b4c1d229328c30a609cf","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:19.659443Z","signature_b64":"+w95xRQ2mbYJZTC8LIVkyxm7AUvSM52CTPqcjGRVJWx32uJe2dOtjGUfDG2B6FSRRFtfoJ80fVBmN69JYbj6Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"18a860baa7e5fe072e6dc45ad9e2a3933b005082d791b4c1d229328c30a609cf","last_reissued_at":"2026-05-17T23:39:19.658694Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:19.658694Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2504.16084","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:19Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"PlLHXtqzE0NgM78van8GnZ19SANcNHT07FipUzw5UrX+8sWkdPVyiZ0WWjgNG3+yczHsXouGK6NLCaXkE7PRCg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T02:52:37.470549Z"},"content_sha256":"60ccfdecb1471c0eb905b1cbccf6bf73c39fc3cdae9f629237a7f0d7846ef49c","schema_version":"1.0","event_id":"sha256:60ccfdecb1471c0eb905b1cbccf6bf73c39fc3cdae9f629237a7f0d7846ef49c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:DCUGBOVH4X7AOLTNYRNNTYVDSM","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"TTRL: Test-Time Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Biqing Qi, Bowen Zhou, Ermo Hua, Ganqu Cui, Haozhan Li, Kaiyan Zhang, Lifan Yuan, Li Sheng, Ning Ding, Shang Qu, Xinwei Long, Xuekai Zhu, Youbang Sun, Yuchen Zhang, Yuxin Zuo, Zhiyuan Ma","submitted_at":"2025-04-22T17:59:56Z","abstract_excerpt":"This paper investigates Reinforcement Learning (RL) on data without explicit labels for reasoning tasks in Large Language Models (LLMs). The core challenge of the problem is reward estimation during inference while not having access to ground-truth information. While this setting appears elusive, we find that common practices in Test-Time Scaling (TTS), such as majority voting, yield surprisingly effective rewards suitable for driving RL training. In this work, we introduce Test-Time Reinforcement Learning (TTRL), a novel method for training LLMs using RL on unlabeled data. TTRL enables self-e"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"TTRL boosts the pass@1 performance of Qwen-2.5-Math-7B by approximately 211% on the AIME 2024 with only unlabeled test data. Furthermore, although TTRL is only supervised by the maj@n metric, TTRL has demonstrated performance to consistently surpass the upper limit of the initial model maj@n.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"Common practices in Test-Time Scaling, such as majority voting, yield surprisingly effective rewards suitable for driving RL training on data without explicit labels.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"TTRL lets LLMs self-improve on reasoning tasks via RL driven by majority-voting rewards from unlabeled test data, yielding large gains such as a 211% boost in pass@1 on AIME 2024.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b59312f85ef1d3eb8c91280bd8752f2a9c26992921476622b4181ca0b5851143"},"source":{"id":"2504.16084","kind":"arxiv","version":3},"verdict":{"id":"336da23a-1ce1-4b44-804b-e96fa496d0c5","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T00:42:42.815218Z","strongest_claim":"TTRL boosts the pass@1 performance of Qwen-2.5-Math-7B by approximately 211% on the AIME 2024 with only unlabeled test data. Furthermore, although TTRL is only supervised by the maj@n metric, TTRL has demonstrated performance to consistently surpass the upper limit of the initial model maj@n.","one_line_summary":"TTRL lets LLMs self-improve on reasoning tasks via RL driven by majority-voting rewards from unlabeled test data, yielding large gains such as a 211% boost in pass@1 on AIME 2024.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"Common practices in Test-Time Scaling, such as majority voting, yield surprisingly effective rewards suitable for driving RL training on data without explicit labels.","pith_extraction_headline":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e242438502ebe4e0f4fb40b59a2191b716814a2513a7855613be3d56201828e4"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"336da23a-1ce1-4b44-804b-e96fa496d0c5"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:19Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Uy1DkFRwtcRFbgG46ptdFkIG8uWXGngZKnmeuqQqH1SejmopwqH7WPn4BD5EBXJIJydwpochFZqfZ6feh8TjCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T02:52:37.470993Z"},"content_sha256":"b9d56fe675ecaa5fb79700294cc30f28b30c0346a87979cf1f6f6b5746aaed9f","schema_version":"1.0","event_id":"sha256:b9d56fe675ecaa5fb79700294cc30f28b30c0346a87979cf1f6f6b5746aaed9f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/bundle.json","state_url":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-01T02:52:37Z","links":{"resolver":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM","bundle":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/bundle.json","state":"https://pith.science/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/state.json","well_known_bundle":"https://pith.science/.well-known/pith/DCUGBOVH4X7AOLTNYRNNTYVDSM/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:DCUGBOVH4X7AOLTNYRNNTYVDSM","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a67ddc1e54252959d60b587e637eac83185a9cf4d0ef9e415b12b836eb2efb19","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-04-22T17:59:56Z","title_canon_sha256":"d4d589bc9a2a2b582c5f2586c59afb84a0b79a0a71a479fb110208a5754aaf83"},"schema_version":"1.0","source":{"id":"2504.16084","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.16084","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"arxiv_version","alias_value":"2504.16084v3","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.16084","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"pith_short_12","alias_value":"DCUGBOVH4X7A","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"DCUGBOVH4X7AOLTN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"DCUGBOVH","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:b9d56fe675ecaa5fb79700294cc30f28b30c0346a87979cf1f6f6b5746aaed9f","target":"graph","created_at":"2026-05-17T23:39:19Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"TTRL boosts the pass@1 performance of Qwen-2.5-Math-7B by approximately 211% on the AIME 2024 with only unlabeled test data. Furthermore, although TTRL is only supervised by the maj@n metric, TTRL has demonstrated performance to consistently surpass the upper limit of the initial model maj@n."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"Common practices in Test-Time Scaling, such as majority voting, yield surprisingly effective rewards suitable for driving RL training on data without explicit labels."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"TTRL lets LLMs self-improve on reasoning tasks via RL driven by majority-voting rewards from unlabeled test data, yielding large gains such as a 211% boost in pass@1 on AIME 2024."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward."}],"snapshot_sha256":"b59312f85ef1d3eb8c91280bd8752f2a9c26992921476622b4181ca0b5851143"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e242438502ebe4e0f4fb40b59a2191b716814a2513a7855613be3d56201828e4"},"paper":{"abstract_excerpt":"This paper investigates Reinforcement Learning (RL) on data without explicit labels for reasoning tasks in Large Language Models (LLMs). The core challenge of the problem is reward estimation during inference while not having access to ground-truth information. While this setting appears elusive, we find that common practices in Test-Time Scaling (TTS), such as majority voting, yield surprisingly effective rewards suitable for driving RL training. In this work, we introduce Test-Time Reinforcement Learning (TTRL), a novel method for training LLMs using RL on unlabeled data. TTRL enables self-e","authors_text":"Biqing Qi, Bowen Zhou, Ermo Hua, Ganqu Cui, Haozhan Li, Kaiyan Zhang, Lifan Yuan, Li Sheng, Ning Ding, Shang Qu, Xinwei Long, Xuekai Zhu, Youbang Sun, Yuchen Zhang, Yuxin Zuo, Zhiyuan Ma","cross_cats":["cs.LG"],"headline":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-04-22T17:59:56Z","title":"TTRL: Test-Time Reinforcement Learning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.16084","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T00:42:42.815218Z","id":"336da23a-1ce1-4b44-804b-e96fa496d0c5","model_set":{"reader":"grok-4.3"},"one_line_summary":"TTRL lets LLMs self-improve on reasoning tasks via RL driven by majority-voting rewards from unlabeled test data, yielding large gains such as a 211% boost in pass@1 on AIME 2024.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"TTRL lets LLMs improve reasoning on unlabeled test data by treating majority voting as an RL reward.","strongest_claim":"TTRL boosts the pass@1 performance of Qwen-2.5-Math-7B by approximately 211% on the AIME 2024 with only unlabeled test data. Furthermore, although TTRL is only supervised by the maj@n metric, TTRL has demonstrated performance to consistently surpass the upper limit of the initial model maj@n.","weakest_assumption":"Common practices in Test-Time Scaling, such as majority voting, yield surprisingly effective rewards suitable for driving RL training on data without explicit labels."}},"verdict_id":"336da23a-1ce1-4b44-804b-e96fa496d0c5"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:60ccfdecb1471c0eb905b1cbccf6bf73c39fc3cdae9f629237a7f0d7846ef49c","target":"record","created_at":"2026-05-17T23:39:19Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a67ddc1e54252959d60b587e637eac83185a9cf4d0ef9e415b12b836eb2efb19","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-04-22T17:59:56Z","title_canon_sha256":"d4d589bc9a2a2b582c5f2586c59afb84a0b79a0a71a479fb110208a5754aaf83"},"schema_version":"1.0","source":{"id":"2504.16084","kind":"arxiv","version":3}},"canonical_sha256":"18a860baa7e5fe072e6dc45ad9e2a3933b005082d791b4c1d229328c30a609cf","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"18a860baa7e5fe072e6dc45ad9e2a3933b005082d791b4c1d229328c30a609cf","first_computed_at":"2026-05-17T23:39:19.658694Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:19.658694Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"+w95xRQ2mbYJZTC8LIVkyxm7AUvSM52CTPqcjGRVJWx32uJe2dOtjGUfDG2B6FSRRFtfoJ80fVBmN69JYbj6Aw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:19.659443Z","signed_message":"canonical_sha256_bytes"},"source_id":"2504.16084","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:60ccfdecb1471c0eb905b1cbccf6bf73c39fc3cdae9f629237a7f0d7846ef49c","sha256:b9d56fe675ecaa5fb79700294cc30f28b30c0346a87979cf1f6f6b5746aaed9f"],"state_sha256":"9f3bc3a34d24cc3e8b2d73b834adc20b98c325a8ec39b1aefe1b29b54f0bbb75"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"JbSS7a+v9X6Ur6L0dqWMdq9LayRWt1ypxs8XXZySa9G4Do0io9+Y3g+pc2yH1zdpT+Wr9OKGTcbhc+D0ta8hAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-01T02:52:37.473310Z","bundle_sha256":"3496558cdddc98d51f85cbea6a9c34495ab650693278cd7bb387a722bf380aa8"}}