{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:Q7OXT2D7TV2U5BL2A5ITHAAZGE","short_pith_number":"pith:Q7OXT2D7","schema_version":"1.0","canonical_sha256":"87dd79e87f9d754e857a0751338019311f6aa80cf62ca2dad3f15188522b86b2","source":{"kind":"arxiv","id":"2502.14768","version":1},"attestation_state":"computed","paper":{"title":"Logic-RL: Unleashing LLM Reasoning with Rule-Based Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Rule-based RL on 5K logic puzzles induces reflection and verification in a 7B model that transfers to AIME and AMC.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Bryan Dai, Chong Luo, Haoming Luo, Joey Zhou, Kai Qiu, Qingnan Ren, Tian Xie, Yuqian Hong, Zhirong Wu, Zitian Gao","submitted_at":"2025-02-20T17:49:26Z","abstract_excerpt":"Inspired by the success of DeepSeek-R1, we explore the potential of rule-based reinforcement learning (RL) in large reasoning models. To analyze reasoning dynamics, we use synthetic logic puzzles as training data due to their controllable complexity and straightforward answer verification. We make some key technical contributions that lead to effective and stable RL training: a system prompt that emphasizes the thinking and answering process, a stringent format reward function that penalizes outputs for taking shortcuts, and a straightforward training recipe that achieves stable convergence. O"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2502.14768","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-02-20T17:49:26Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"58ca94316949335c21db4792a681216bb1a96c1f2781187232e95607a0904f69","abstract_canon_sha256":"99e9eb6e20d4e54ae62f41a827a7d314d315ad5ae79695f60a26a1bedff501b7"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:46.596356Z","signature_b64":"L267w2kGeCg1d97CPeeBq0m0kBwVJCuoCjEt7BiW87+1QsPUi2Dl2vDD0EkrRzzmFm2ucBtBODOKXgvkBL2MDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"87dd79e87f9d754e857a0751338019311f6aa80cf62ca2dad3f15188522b86b2","last_reissued_at":"2026-05-17T23:38:46.595890Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:46.595890Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Logic-RL: Unleashing LLM Reasoning with Rule-Based Reinforcement Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Rule-based RL on 5K logic puzzles induces reflection and verification in a 7B model that transfers to AIME and AMC.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Bryan Dai, Chong Luo, Haoming Luo, Joey Zhou, Kai Qiu, Qingnan Ren, Tian Xie, Yuqian Hong, Zhirong Wu, Zitian Gao","submitted_at":"2025-02-20T17:49:26Z","abstract_excerpt":"Inspired by the success of DeepSeek-R1, we explore the potential of rule-based reinforcement learning (RL) in large reasoning models. To analyze reasoning dynamics, we use synthetic logic puzzles as training data due to their controllable complexity and straightforward answer verification. We make some key technical contributions that lead to effective and stable RL training: a system prompt that emphasizes the thinking and answering process, a stringent format reward function that penalizes outputs for taking shortcuts, and a straightforward training recipe that achieves stable convergence. O"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"after training on just 5K logic problems, it demonstrates generalization abilities to the challenging math benchmarks AIME and AMC.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the advanced reasoning behaviors (reflection, verification, summarization) are induced by the RL process rather than already latent in the base 7B model or triggered by the system prompt alone.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Rule-based RL on 5K logic puzzles induces advanced reasoning in a 7B model that transfers to AIME and AMC.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Rule-based RL on 5K logic puzzles induces reflection and verification in a 7B model that transfers to AIME and AMC.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"0d6704e1857d5c09f9ba6ab79da66c16486e1a231d2bfd5d7fdedfa928e0cbc5"},"source":{"id":"2502.14768","kind":"arxiv","version":1},"verdict":{"id":"1f47f35f-6ebc-4a36-8323-0b024479609f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T20:54:31.737562Z","strongest_claim":"after training on just 5K logic problems, it demonstrates generalization abilities to the challenging math benchmarks AIME and AMC.","one_line_summary":"Rule-based RL on 5K logic puzzles induces advanced reasoning in a 7B model that transfers to AIME and AMC.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the advanced reasoning behaviors (reflection, verification, summarization) are induced by the RL process rather than already latent in the base 7B model or triggered by the system prompt alone.","pith_extraction_headline":"Rule-based RL on 5K logic puzzles induces reflection and verification in a 7B model that transfers to AIME and AMC."},"references":{"count":27,"sample":[{"doi":"","year":2025,"title":"Le, Sergey Levine, and Yi Ma","work_id":"ad2bcec1-178f-4281-a602-c683a5711e90","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Training verifiers to solve math word problems","work_id":"1f7ff91e-3d16-4cba-ad18-3c6ae7ec674b","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"DeepSeek-AI, Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, Xiaokang Zhang, Xingkai Yu, Yu Wu, Z. F. Wu, Zhibin Gou, Zhihong Sha","work_id":"2472dd86-1219-472e-af5b-4fba94816f78","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Alphazero-like tree-search can guide large language model decoding and training, 2024","work_id":"b23b3d7c-5200-48e4-9a0c-344903052311","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Omni-math: A universal olympiad level mathematic benchmark for large language models, 2024","work_id":"53e56ad1-de9e-460f-8183-689e5f4b23d6","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":27,"snapshot_sha256":"bf59d0bbcfb1fdd16bbf679f7ba25ea9e77e3bff305477f0e0b128e2384651b7","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"0e90cfa9e1b05ca97a7f7e4f53b66aa44d77108040460b51d770bceb21ae6745"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2502.14768","created_at":"2026-05-17T23:38:46.595973+00:00"},{"alias_kind":"arxiv_version","alias_value":"2502.14768v1","created_at":"2026-05-17T23:38:46.595973+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.14768","created_at":"2026-05-17T23:38:46.595973+00:00"},{"alias_kind":"pith_short_12","alias_value":"Q7OXT2D7TV2U","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"Q7OXT2D7TV2U5BL2","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"Q7OXT2D7","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":33,"internal_anchor_count":33,"sample":[{"citing_arxiv_id":"2504.02181","citing_title":"A Survey of Scaling in Large Language Model Reasoning","ref_index":230,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22567","citing_title":"LANG: Reinforcement Learning for Multilingual Reasoning with Language-Adaptive Hint Guidance","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2507.01679","citing_title":"Blending Supervised and Reinforcement Fine-Tuning with Prefix Sampling","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20863","citing_title":"PlexRL: Cluster-Level Orchestration of Serviceized LLM Execution for RLVR","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06638","citing_title":"Can RL Teach Long-Horizon Reasoning to LLMs? Expressiveness Is Key","ref_index":106,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19447","citing_title":"What and When to Distill: Selective Hindsight Distillation for Multi-Turn Agents","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2508.07809","citing_title":"EvoCoT: Overcoming the Exploration Bottleneck in Reinforcement Learning","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2508.11548","citing_title":"Copyright Protection for Large Language Models: A Survey of Methods, Challenges, and Trends","ref_index":155,"is_internal_anchor":true},{"citing_arxiv_id":"2509.23330","citing_title":"Structured In-context Environment Scaling for Large Language Model Reasoning","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2510.06499","citing_title":"Webscale-RL: Automated Data Pipeline for Scaling RL Data to Pretraining Levels","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2511.00066","citing_title":"Sharpness-Guided Group Relative Policy Optimization via Probability Shaping","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2511.20814","citing_title":"SPHINX: A Synthetic Environment for Visual Perception and Reasoning","ref_index":61,"is_internal_anchor":true},{"citing_arxiv_id":"2512.03043","citing_title":"OneThinker: All-in-one Reasoning Model for Image and Video","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2512.07461","citing_title":"Native Parallel Reasoner: Reasoning in Parallelism via Self-Distilled Reinforcement Learning","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2504.21776","citing_title":"WebThinker: Empowering Large Reasoning Models with Deep Research Capability","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2506.10947","citing_title":"Spurious Rewards: Rethinking Training Signals in RLVR","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12645","citing_title":"Training LLMs with Reinforcement Learning for Intent-Aware Personalized Question Answering","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2504.13958","citing_title":"ToolRL: Reward is All Tool Learning Needs","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11922","citing_title":"StepCodeReasoner: Aligning Code Reasoning with Stepwise Execution Traces via Reinforcement Learning","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2502.17419","citing_title":"From System 1 to System 2: A Survey of Reasoning Large Language Models","ref_index":278,"is_internal_anchor":true},{"citing_arxiv_id":"2503.21776","citing_title":"Video-R1: Reinforcing Video Reasoning in MLLMs","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06638","citing_title":"Can RL Teach Long-Horizon Reasoning to LLMs? Expressiveness Is Key","ref_index":106,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08905","citing_title":"Forge: Quality-Aware Reinforcement Learning for NP-Hard Optimization in LLMs","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06638","citing_title":"Can RL Teach Long-Horizon Reasoning to LLMs? Expressiveness Is Key","ref_index":101,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19260","citing_title":"Understanding the Mechanism of Altruism in Large Language Models","ref_index":155,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE","json":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE.json","graph_json":"https://pith.science/api/pith-number/Q7OXT2D7TV2U5BL2A5ITHAAZGE/graph.json","events_json":"https://pith.science/api/pith-number/Q7OXT2D7TV2U5BL2A5ITHAAZGE/events.json","paper":"https://pith.science/paper/Q7OXT2D7"},"agent_actions":{"view_html":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE","download_json":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE.json","view_paper":"https://pith.science/paper/Q7OXT2D7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2502.14768&json=true","fetch_graph":"https://pith.science/api/pith-number/Q7OXT2D7TV2U5BL2A5ITHAAZGE/graph.json","fetch_events":"https://pith.science/api/pith-number/Q7OXT2D7TV2U5BL2A5ITHAAZGE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE/action/storage_attestation","attest_author":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE/action/author_attestation","sign_citation":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE/action/citation_signature","submit_replication":"https://pith.science/pith/Q7OXT2D7TV2U5BL2A5ITHAAZGE/action/replication_record"}},"created_at":"2026-05-17T23:38:46.595973+00:00","updated_at":"2026-05-17T23:38:46.595973+00:00"}