{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:VOBZ6XVMPLQ5NHVQETLPTITE5J","short_pith_number":"pith:VOBZ6XVM","schema_version":"1.0","canonical_sha256":"ab839f5eac7ae1d69eb024d6f9a264ea4db9f6e1834fb0df4691967aa3e1849d","source":{"kind":"arxiv","id":"2510.08008","version":2},"attestation_state":"computed","paper":{"title":"Beyond Sunk Costs: Boosting LLM Pre-training Efficiency via Orthogonal Growth of Mixture-of-Experts","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Baining Guo, Peng Cheng, Ruizhe Wang, Xiao Liu, Yaoxiang Wang, Yeyun Gong, Yucheng Ding, Zhengjun Zha","submitted_at":"2025-10-09T09:45:45Z","abstract_excerpt":"As the computational demands for pre-training Large Language Models (LLMs) continue to surge, the need for efficient training paradigms becomes critical. Despite the vast resources already invested in existing pre-trained checkpoints, these assets often remain under-leveraged due to architectural limitations. We introduce an \"orthogonal growth\" strategy designed to \"recycle\" these checkpoints by strategically expanding their parameters prior to continued training. Our method focuses on optimizing converged Mixture-of-Experts (MoE) models through two dimensions: interpositional layer copying fo"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2510.08008","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-09T09:45:45Z","cross_cats_sorted":[],"title_canon_sha256":"78b6ce7041af55b01f15944905a04a526c2a853e409b2dd55bf172e10cc6e550","abstract_canon_sha256":"91a1b17aef4a845bf6f591a749311aed61eede1d8b2f756e925c2caf8b37905f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:25.660718Z","signature_b64":"lWA3VLmlJu6Pbv2atobf3ZuUT4QksZxxcoDwWmbbBGybpP/s2pQ7VovIUM+utgrZPi5wluEfDkvsAF4IgpEzAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ab839f5eac7ae1d69eb024d6f9a264ea4db9f6e1834fb0df4691967aa3e1849d","last_reissued_at":"2026-05-20T00:00:25.659817Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:25.659817Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Beyond Sunk Costs: Boosting LLM Pre-training Efficiency via Orthogonal Growth of Mixture-of-Experts","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Baining Guo, Peng Cheng, Ruizhe Wang, Xiao Liu, Yaoxiang Wang, Yeyun Gong, Yucheng Ding, Zhengjun Zha","submitted_at":"2025-10-09T09:45:45Z","abstract_excerpt":"As the computational demands for pre-training Large Language Models (LLMs) continue to surge, the need for efficient training paradigms becomes critical. Despite the vast resources already invested in existing pre-trained checkpoints, these assets often remain under-leveraged due to architectural limitations. We introduce an \"orthogonal growth\" strategy designed to \"recycle\" these checkpoints by strategically expanding their parameters prior to continued training. Our method focuses on optimizing converged Mixture-of-Experts (MoE) models through two dimensions: interpositional layer copying fo"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.08008","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.08008/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.08008","created_at":"2026-05-20T00:00:25.659978+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.08008v2","created_at":"2026-05-20T00:00:25.659978+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.08008","created_at":"2026-05-20T00:00:25.659978+00:00"},{"alias_kind":"pith_short_12","alias_value":"VOBZ6XVMPLQ5","created_at":"2026-05-20T00:00:25.659978+00:00"},{"alias_kind":"pith_short_16","alias_value":"VOBZ6XVMPLQ5NHVQ","created_at":"2026-05-20T00:00:25.659978+00:00"},{"alias_kind":"pith_short_8","alias_value":"VOBZ6XVM","created_at":"2026-05-20T00:00:25.659978+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J","json":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J.json","graph_json":"https://pith.science/api/pith-number/VOBZ6XVMPLQ5NHVQETLPTITE5J/graph.json","events_json":"https://pith.science/api/pith-number/VOBZ6XVMPLQ5NHVQETLPTITE5J/events.json","paper":"https://pith.science/paper/VOBZ6XVM"},"agent_actions":{"view_html":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J","download_json":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J.json","view_paper":"https://pith.science/paper/VOBZ6XVM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.08008&json=true","fetch_graph":"https://pith.science/api/pith-number/VOBZ6XVMPLQ5NHVQETLPTITE5J/graph.json","fetch_events":"https://pith.science/api/pith-number/VOBZ6XVMPLQ5NHVQETLPTITE5J/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J/action/storage_attestation","attest_author":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J/action/author_attestation","sign_citation":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J/action/citation_signature","submit_replication":"https://pith.science/pith/VOBZ6XVMPLQ5NHVQETLPTITE5J/action/replication_record"}},"created_at":"2026-05-20T00:00:25.659978+00:00","updated_at":"2026-05-20T00:00:25.659978+00:00"}