{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:OXXTT4OHRETB3KWT7POHNLCMVB","short_pith_number":"pith:OXXTT4OH","schema_version":"1.0","canonical_sha256":"75ef39f1c789261daad3fbdc76ac4ca85ed7cc2c883c7920c4c44a61de79c7fc","source":{"kind":"arxiv","id":"2405.14838","version":1},"attestation_state":"computed","paper":{"title":"From Explicit CoT to Implicit CoT: Learning to Internalize CoT Step by Step","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A progressive fine-tuning method lets language models internalize chain-of-thought steps so they can solve harder reasoning tasks without producing explicit intermediate outputs.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Stuart Shieber, Yejin Choi, Yuntian Deng","submitted_at":"2024-05-23T17:54:14Z","abstract_excerpt":"When leveraging language models for reasoning tasks, generating explicit chain-of-thought (CoT) steps often proves essential for achieving high accuracy in final outputs. In this paper, we investigate if models can be taught to internalize these CoT steps. To this end, we propose a simple yet effective method for internalizing CoT steps: starting with a model trained for explicit CoT reasoning, we gradually remove the intermediate steps and finetune the model. This process allows the model to internalize the intermediate reasoning steps, thus simplifying the reasoning process while maintaining"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2405.14838","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-05-23T17:54:14Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"f6d0526c0652c41c5abf4ecf9cb7fa95457b913a3cf89cd91c770badcf7b6236","abstract_canon_sha256":"496ee826e470ff34a2bfafe7c7e02e36c6b83c54da6d742b158091e822c24d23"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.988754Z","signature_b64":"Y89TJmU7R+MzZpECfk+/qPVnWAWsk4yn25QLPjmJkrYday8EMrhga91RK6c6k0zd5VTfkcSJlm/ptK9dlP0mAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"75ef39f1c789261daad3fbdc76ac4ca85ed7cc2c883c7920c4c44a61de79c7fc","last_reissued_at":"2026-05-17T23:38:47.988012Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.988012Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"From Explicit CoT to Implicit CoT: Learning to Internalize CoT Step by Step","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A progressive fine-tuning method lets language models internalize chain-of-thought steps so they can solve harder reasoning tasks without producing explicit intermediate outputs.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Stuart Shieber, Yejin Choi, Yuntian Deng","submitted_at":"2024-05-23T17:54:14Z","abstract_excerpt":"When leveraging language models for reasoning tasks, generating explicit chain-of-thought (CoT) steps often proves essential for achieving high accuracy in final outputs. In this paper, we investigate if models can be taught to internalize these CoT steps. To this end, we propose a simple yet effective method for internalizing CoT steps: starting with a model trained for explicit CoT reasoning, we gradually remove the intermediate steps and finetune the model. This process allows the model to internalize the intermediate reasoning steps, thus simplifying the reasoning process while maintaining"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our approach enables a GPT-2 Small model to solve 9-by-9 multiplication with up to 99% accuracy, whereas standard training cannot solve beyond 4-by-4 multiplication.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That performance gains arise specifically from internalizing the removed reasoning steps rather than from increased task exposure, regularization, or other side effects of the progressive fine-tuning schedule.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Gradual fine-tuning that removes explicit CoT steps lets GPT-2 Small reach 99% accuracy on 9x9 multiplication and Mistral 7B exceed 50% on GSM8K with no intermediate outputs.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A progressive fine-tuning method lets language models internalize chain-of-thought steps so they can solve harder reasoning tasks without producing explicit intermediate outputs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b7d5f4a5ada675bf73c55cb6f4ce0cd7d11f397250614a63507247c366ca2028"},"source":{"id":"2405.14838","kind":"arxiv","version":1},"verdict":{"id":"0d67b4be-0518-440e-80ad-15843605694f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T11:40:00.849486Z","strongest_claim":"Our approach enables a GPT-2 Small model to solve 9-by-9 multiplication with up to 99% accuracy, whereas standard training cannot solve beyond 4-by-4 multiplication.","one_line_summary":"Gradual fine-tuning that removes explicit CoT steps lets GPT-2 Small reach 99% accuracy on 9x9 multiplication and Mistral 7B exceed 50% on GSM8K with no intermediate outputs.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That performance gains arise specifically from internalizing the removed reasoning steps rather than from increased task exposure, regularization, or other side effects of the progressive fine-tuning schedule.","pith_extraction_headline":"A progressive fine-tuning method lets language models internalize chain-of-thought steps so they can solve harder reasoning tasks without producing explicit intermediate outputs."},"references":{"count":20,"sample":[{"doi":"","year":2024,"title":"Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R","work_id":"50a8b109-5fb3-401e-8579-5d3b737fe859","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2018,"title":"On internal language representations in deep learning: An analysis of machine translation and speech recognition","work_id":"82043d1a-fbbe-4ba0-9d3f-7a4d2dc45c5f","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Beyond the imitation game: Quantifying and extrapolating the capabilities of language models","work_id":"7393fee7-41f4-49fd-8957-ecd752f4252b","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-V oss, Gretch","work_id":"5aea4c3d-a3a0-44c3-9914-69a2e9f05e7e","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Training verifiers to solve math word problems","work_id":"1f7ff91e-3d16-4cba-ad18-3c6ae7ec674b","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":20,"snapshot_sha256":"58414645b799b721be73ccbdf2f5441e24fffd1c58cd4c4a11bbec2061668e1b","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"af91505ee37bdbe7c73c8a09730edfd984e0b121465aede3129f24673493eebc"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2405.14838","created_at":"2026-05-17T23:38:47.988126+00:00"},{"alias_kind":"arxiv_version","alias_value":"2405.14838v1","created_at":"2026-05-17T23:38:47.988126+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2405.14838","created_at":"2026-05-17T23:38:47.988126+00:00"},{"alias_kind":"pith_short_12","alias_value":"OXXTT4OHRETB","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"OXXTT4OHRETB3KWT","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"OXXTT4OH","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":28,"internal_anchor_count":28,"sample":[{"citing_arxiv_id":"2605.05997","citing_title":"4DThinker: Thinking with 4D Imagery for Dynamic Spatial Understanding","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12374","citing_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2503.11926","citing_title":"Monitoring Reasoning Models for Misbehavior and the Risks of Promoting Obfuscation","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26283","citing_title":"MedSynapse-V: Bridging Visual Perception and Clinical Intuition via Latent Memory Evolution","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16638","citing_title":"TTE-Flash: Accelerating Reasoning-based Multimodal Representations via Think-Then-Embed Tokens","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12374","citing_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2509.25020","citing_title":"Deep Thinking by Markov Chain of Continuous Thoughts","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2502.21074","citing_title":"CODI: Compressing Chain-of-Thought into Continuous Space via Self-Distillation","ref_index":89,"is_internal_anchor":true},{"citing_arxiv_id":"2412.13171","citing_title":"Compressed Chain of Thought: Efficient Reasoning Through Dense Representations","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2512.10226","citing_title":"Latent Chain-of-Thought World Modeling for End-to-End Driving","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2503.16419","citing_title":"Stop Overthinking: A Survey on Efficient Reasoning for Large Language Models","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03144","citing_title":"InCoder-32B-Thinking: Industrial Code World Model for Thinking","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03679","citing_title":"LightThinker++: From Reasoning Compression to Memory Management","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12374","citing_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2502.05171","citing_title":"Scaling up Test-Time Compute with Latent Reasoning: A Recurrent Depth Approach","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26355","citing_title":"Shorthand for Thought: Compressing LLM Reasoning via Entropy-Guided Supertokens","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26283","citing_title":"MedSynapse-V: Bridging Visual Perception and Clinical Intuition via Latent Memory Evolution","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2503.09567","citing_title":"Towards Reasoning Era: A Survey of Long Chain-of-Thought for Reasoning Large Language Models","ref_index":154,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18486","citing_title":"Xiaomi OneVL: One-Step Latent Reasoning and Planning with Vision-Language Explanation","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09346","citing_title":"RuPLaR : Efficient Latent Compression of LLM Reasoning Chains with Rule-Based Priors From Multi-Step to One-Step","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06165","citing_title":"Post Reasoning: Improving the Performance of Non-Thinking Models at No Cost","ref_index":75,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22951","citing_title":"The Power of Power Law: Asymmetry Enables Compositional Reasoning","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22709","citing_title":"Thinking Without Words: Efficient Latent Reasoning with Abstract Chain-of-Thought","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05997","citing_title":"4DThinker: Thinking with 4D Imagery for Dynamic Spatial Understanding","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2412.06769","citing_title":"Training Large Language Models to Reason in a Continuous Latent Space","ref_index":6,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB","json":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB.json","graph_json":"https://pith.science/api/pith-number/OXXTT4OHRETB3KWT7POHNLCMVB/graph.json","events_json":"https://pith.science/api/pith-number/OXXTT4OHRETB3KWT7POHNLCMVB/events.json","paper":"https://pith.science/paper/OXXTT4OH"},"agent_actions":{"view_html":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB","download_json":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB.json","view_paper":"https://pith.science/paper/OXXTT4OH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2405.14838&json=true","fetch_graph":"https://pith.science/api/pith-number/OXXTT4OHRETB3KWT7POHNLCMVB/graph.json","fetch_events":"https://pith.science/api/pith-number/OXXTT4OHRETB3KWT7POHNLCMVB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB/action/storage_attestation","attest_author":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB/action/author_attestation","sign_citation":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB/action/citation_signature","submit_replication":"https://pith.science/pith/OXXTT4OHRETB3KWT7POHNLCMVB/action/replication_record"}},"created_at":"2026-05-17T23:38:47.988126+00:00","updated_at":"2026-05-17T23:38:47.988126+00:00"}