{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:OLBY5FDUE5HIQNCKA3E2AWZIIR","short_pith_number":"pith:OLBY5FDU","schema_version":"1.0","canonical_sha256":"72c38e9474274e88344a06c9a05b28446923da29442b1a19d42667b405a9396f","source":{"kind":"arxiv","id":"2502.21074","version":3},"attestation_state":"computed","paper":{"title":"CODI: Compressing Chain-of-Thought into Continuous Space via Self-Distillation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Self-distillation aligns one token's hidden state to transfer chain-of-thought reasoning into continuous space without accuracy loss.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Hanqi Yan, Linhai Zhang, Yali Du, Yulan He, Zhanghao Hu, Zhenyi Shen","submitted_at":"2025-02-28T14:07:48Z","abstract_excerpt":"Chain-of-Thought (CoT) reasoning enhances Large Language Models (LLMs) by encouraging step-by-step reasoning in natural language. However, leveraging a latent continuous space for reasoning may offer benefits in terms of both efficiency and robustness. Prior implicit CoT methods attempt to bypass language completely by reasoning in continuous space but have consistently underperformed compared to the standard explicit CoT approach. We introduce CODI (Continuous Chain-of-Thought via Self-Distillation), a novel training framework that effectively compresses natural language CoT into continuous s"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2502.21074","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2025-02-28T14:07:48Z","cross_cats_sorted":[],"title_canon_sha256":"fec4a20ac60bf55293b6e3d5812e45d9bb1ad1127781d285889ad7f64202510a","abstract_canon_sha256":"b439973371a7088a8c826d686f23ee531eed1627abe2f31744fa6508a976f03b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:12.665240Z","signature_b64":"oKGT2wVMLnwMThBdKAtaRxTTzgLhTAdhD/4eZ5wqanW0vBXmg/VkIHk8kqWk54cVGJmvqEJFwS/7o8Ki9qX4AA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"72c38e9474274e88344a06c9a05b28446923da29442b1a19d42667b405a9396f","last_reissued_at":"2026-05-17T23:38:12.664428Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:12.664428Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"CODI: Compressing Chain-of-Thought into Continuous Space via Self-Distillation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Self-distillation aligns one token's hidden state to transfer chain-of-thought reasoning into continuous space without accuracy loss.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Hanqi Yan, Linhai Zhang, Yali Du, Yulan He, Zhanghao Hu, Zhenyi Shen","submitted_at":"2025-02-28T14:07:48Z","abstract_excerpt":"Chain-of-Thought (CoT) reasoning enhances Large Language Models (LLMs) by encouraging step-by-step reasoning in natural language. However, leveraging a latent continuous space for reasoning may offer benefits in terms of both efficiency and robustness. Prior implicit CoT methods attempt to bypass language completely by reasoning in continuous space but have consistently underperformed compared to the standard explicit CoT approach. We introduce CODI (Continuous Chain-of-Thought via Self-Distillation), a novel training framework that effectively compresses natural language CoT into continuous s"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"CODI is the first implicit CoT approach to match the performance of explicit CoT on GSM8k at the GPT-2 scale, achieving a 3.1x compression rate and outperforming the previous state-of-the-art by 28.2% in accuracy.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That aligning the hidden states of a single designated token between the explicit teacher and implicit student is sufficient to transfer full reasoning capability without loss or distortion.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"CODI compresses explicit CoT into continuous space via self-distillation and is the first implicit method to match explicit CoT performance on GSM8k at GPT-2 scale with 3.1x compression and 28.2% higher accuracy than prior implicit approaches.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Self-distillation aligns one token's hidden state to transfer chain-of-thought reasoning into continuous space without accuracy loss.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"c151f3739eb36f6518d67f76896ef165a11c14218227899eb7a34ea3c42e936b"},"source":{"id":"2502.21074","kind":"arxiv","version":3},"verdict":{"id":"2a28c913-7259-4a74-ac35-712fd648cffd","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T23:11:08.279207Z","strongest_claim":"CODI is the first implicit CoT approach to match the performance of explicit CoT on GSM8k at the GPT-2 scale, achieving a 3.1x compression rate and outperforming the previous state-of-the-art by 28.2% in accuracy.","one_line_summary":"CODI compresses explicit CoT into continuous space via self-distillation and is the first implicit method to match explicit CoT performance on GSM8k at GPT-2 scale with 3.1x compression and 28.2% higher accuracy than prior implicit approaches.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That aligning the hidden states of a single designated token between the explicit teacher and implicit student is sufficient to transfer full reasoning capability without loss or distortion.","pith_extraction_headline":"Self-distillation aligns one token's hidden state to transfer chain-of-thought reasoning into continuous space without accuracy loss."},"references":{"count":118,"sample":[{"doi":"","year":null,"title":"Training language models to follow instructions with human feedback , url =","work_id":"04f44026-8d59-44c7-9381-031151587087","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1972,"title":"Aho and Jeffrey D","work_id":"b1f5cb43-a3c7-4ea0-85e7-9ccc9dfe1588","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1983,"title":"Publications Manual , year = \"1983\", publisher =","work_id":"aca2b566-99e0-4ebb-9c7a-a81219531259","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1145/322234.322243","year":1981,"title":"Chandra and Dexter C","work_id":"c3270592-bd69-4213-95e1-4aaf8312be9b","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Scalable training of","work_id":"aef70eae-f816-4598-84ec-429a2c09f5fc","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":118,"snapshot_sha256":"61509a0e5398a54ba1b20dd6a610a19ad1c485f388ec00878d332ff503f9f93e","internal_anchors":7},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2502.21074","created_at":"2026-05-17T23:38:12.664537+00:00"},{"alias_kind":"arxiv_version","alias_value":"2502.21074v3","created_at":"2026-05-17T23:38:12.664537+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.21074","created_at":"2026-05-17T23:38:12.664537+00:00"},{"alias_kind":"pith_short_12","alias_value":"OLBY5FDUE5HI","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"OLBY5FDUE5HIQNCK","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"OLBY5FDU","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":22,"internal_anchor_count":22,"sample":[{"citing_arxiv_id":"2508.01191","citing_title":"Is Chain-of-Thought Reasoning of LLMs a Mirage? A Data Distribution Lens","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2509.25020","citing_title":"Deep Thinking by Markov Chain of Continuous Thoughts","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2510.03206","citing_title":"Coevolutionary Continuous Discrete Diffusion: Make Your Diffusion Language Model a Latent Reasoner","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2511.08983","citing_title":"SpiralThinker: Latent Reasoning through an Iterative Process with Text-Latent Interleaving","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2512.10941","citing_title":"Mull-Tokens: Modality-Agnostic Latent Thinking","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2603.08899","citing_title":"ConFu: Contemplate the Future for Better Speculative Sampling","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2603.17837","citing_title":"The Silent Thought: Modeling Internal Cognition in Full-Duplex Spoken Dialogue Models via Latent Reasoning","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2603.24422","citing_title":"OneSearch-V2: The Latent Reasoning Enhanced Self-distillation Generative Search Framework","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13165","citing_title":"STOP: Structured On-Policy Pruning of Long-Form Reasoning in Low-Data Regimes","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2503.16419","citing_title":"Stop Overthinking: A Survey on Efficient Reasoning for Large Language Models","ref_index":155,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02073","citing_title":"PLUME: Latent Reasoning Based Universal Multimodal Embedding","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03809","citing_title":"Representational Collapse in Multi-Agent LLM Committees: Measurement and Diversity-Aware Consensus","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10500","citing_title":"Visual Enhanced Depth Scaling for Multimodal Latent Reasoning","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10500","citing_title":"Visual Enhanced Depth Scaling for Multimodal Latent Reasoning","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06165","citing_title":"Post Reasoning: Improving the Performance of Non-Thinking Models at No Cost","ref_index":92,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01111","citing_title":"When Less is Enough: Efficient Inference via Collaborative Reasoning","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10500","citing_title":"Visual Enhanced Depth Scaling for Multimodal Latent Reasoning","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09757","citing_title":"MedLVR: Latent Visual Reasoning for Reliable Medical Visual Question Answering","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06377","citing_title":"The Master Key Hypothesis: Unlocking Cross-Model Capability Transfer via Linear Subspace Alignment","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14889","citing_title":"MemoSight: Unifying Context Compression and Multi Token Prediction for Reasoning Acceleration","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17892","citing_title":"LEPO: Latent Reasoning Policy Optimization for Large Language Models","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21027","citing_title":"HypEHR: Hyperbolic Modeling of Electronic Health Records for Efficient Question Answering","ref_index":107,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR","json":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR.json","graph_json":"https://pith.science/api/pith-number/OLBY5FDUE5HIQNCKA3E2AWZIIR/graph.json","events_json":"https://pith.science/api/pith-number/OLBY5FDUE5HIQNCKA3E2AWZIIR/events.json","paper":"https://pith.science/paper/OLBY5FDU"},"agent_actions":{"view_html":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR","download_json":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR.json","view_paper":"https://pith.science/paper/OLBY5FDU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2502.21074&json=true","fetch_graph":"https://pith.science/api/pith-number/OLBY5FDUE5HIQNCKA3E2AWZIIR/graph.json","fetch_events":"https://pith.science/api/pith-number/OLBY5FDUE5HIQNCKA3E2AWZIIR/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR/action/timestamp_anchor","attest_storage":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR/action/storage_attestation","attest_author":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR/action/author_attestation","sign_citation":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR/action/citation_signature","submit_replication":"https://pith.science/pith/OLBY5FDUE5HIQNCKA3E2AWZIIR/action/replication_record"}},"created_at":"2026-05-17T23:38:12.664537+00:00","updated_at":"2026-05-17T23:38:12.664537+00:00"}