{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:IV7VH3NCTL2M6JAD44FIMISHCY","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7f830fc5909f93eb47fc2eb578ac012f396b781c316285db184f08906136ac87","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T09:31:09Z","title_canon_sha256":"d6c6ab4c1005de22696dae73565f952d6f7bc1c26319ef73905320fe74a436d3"},"schema_version":"1.0","source":{"id":"2605.13247","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13247","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13247v2","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13247","created_at":"2026-05-18T02:44:49Z"},{"alias_kind":"pith_short_12","alias_value":"IV7VH3NCTL2M","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"IV7VH3NCTL2M6JAD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"IV7VH3NC","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8fb65b7ebe57637af57b0048bc1183cbb8f57fa10e66bffbd36875d24b1c96bb","target":"graph","created_at":"2026-05-18T02:44:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"EMO matches the performance of a fixed-expert setup in large-scale experiments while improving wall-clock efficiency."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"Early-stage data may not fully utilize large expert capacity, and progressive expansion can be performed without performance loss by deriving stage-wise token budgets from sparsity in scaling laws."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"EMO progressively expands the expert pool in MoE models during training to match fixed-expert performance with improved wall-clock efficiency."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Progressive expansion of MoE expert pools matches fixed-expert performance while cutting training time."}],"snapshot_sha256":"468e8a05d061ded3690c82c6907a2d51f4c22c8a4c4f48702be6e390b47e2762"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"b77cfc1d34392b9c0af49d5a4abd06c8410648cd256b8fd8af8fb20f1029ca37"},"paper":{"abstract_excerpt":"Sparse Mixture-of-Experts (MoE) models offer a powerful way to scale model size without increasing compute, as per-token FLOPs depend only on k active experts rather than the total pool of E experts. Yet, this asymmetry creates an MoE efficiency paradox in practice: adding more experts balloons memory and communication costs, making actual training inefficient. We argue that this bottleneck arises in part because current MoE training allocates too many experts from the beginning, even though early-stage data may not fully utilize such capacity. Motivated by this, we propose EMO, a simple progr","authors_text":"Chufan Shi, Eric Xing, Huijuan Wang, Linghao Jin, Nuan Wen, Xuezhe Ma, Zhengzhong Liu","cross_cats":[],"headline":"Progressive expansion of MoE expert pools matches fixed-expert performance while cutting training time.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T09:31:09Z","title":"EMO: Frustratingly Easy Progressive Training of Extendable MoE"},"references":{"count":15,"internal_anchors":8,"resolved_work":15,"sample":[{"cited_arxiv_id":"1308.3432","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation","work_id":"1fe8c7c8-aff7-4b94-9096-e549d7e60789","year":null},{"cited_arxiv_id":"2110.14168","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","year":null},{"cited_arxiv_id":"2412.19437","doi":"","is_internal_anchor":true,"ref_index":3,"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"The Language Model Evaluation Harness,","work_id":"e7e9d443-273d-4722-8ec7-59adb893de0e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Fast- moe: A fast mixture-of-expert training system","work_id":"6f8a9c7a-6412-48c3-ae67-7c51432b458a","year":null}],"snapshot_sha256":"72cab19f095400f1aff0abf81ac5342c50b7206b05027544d8eda63b17764a02"},"source":{"id":"2605.13247","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T05:27:29.556590Z","id":"209e5fcb-619f-455b-80b5-04d342fdba8e","model_set":{"reader":"grok-4.3"},"one_line_summary":"EMO progressively expands the expert pool in MoE models during training to match fixed-expert performance with improved wall-clock efficiency.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Progressive expansion of MoE expert pools matches fixed-expert performance while cutting training time.","strongest_claim":"EMO matches the performance of a fixed-expert setup in large-scale experiments while improving wall-clock efficiency.","weakest_assumption":"Early-stage data may not fully utilize large expert capacity, and progressive expansion can be performed without performance loss by deriving stage-wise token budgets from sparsity in scaling laws."}},"verdict_id":"209e5fcb-619f-455b-80b5-04d342fdba8e"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:23cf76fa60d4d1ce7b359acbbcf39d6de492ba712754b25f8dfcf9d43bd52861","target":"record","created_at":"2026-05-18T02:44:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7f830fc5909f93eb47fc2eb578ac012f396b781c316285db184f08906136ac87","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T09:31:09Z","title_canon_sha256":"d6c6ab4c1005de22696dae73565f952d6f7bc1c26319ef73905320fe74a436d3"},"schema_version":"1.0","source":{"id":"2605.13247","kind":"arxiv","version":2}},"canonical_sha256":"457f53eda29af4cf2403e70a86224716237debbe07672b45d6b1ba5d65ccea01","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"457f53eda29af4cf2403e70a86224716237debbe07672b45d6b1ba5d65ccea01","first_computed_at":"2026-05-18T02:44:49.458772Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:49.458772Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/mPcjUO28Ham5jFTDxI7nNj5TfN0lWCx8Z8+zmqdgo7gkQvNL/n1luB3AHFT3kBPgSv5eg9GEDSveya/IcXxCw==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:49.459248Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13247","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:23cf76fa60d4d1ce7b359acbbcf39d6de492ba712754b25f8dfcf9d43bd52861","sha256:8fb65b7ebe57637af57b0048bc1183cbb8f57fa10e66bffbd36875d24b1c96bb"],"state_sha256":"9a675b3ee3dadb9be8a6b0c6dd9d8042ef85c927da07a82820d6941e750b862d"}