{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:RS6XGWCWJNK7PNWCHDEWVB4F5E","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5eb369bff6059693137a7555603f25b25cb1343bbf17526f994424c320d540f6","cross_cats_sorted":["stat.ML"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T23:32:00Z","title_canon_sha256":"ee0649dd4644fe53526dbe57ada7cc84065acbc9ac94140d0bbeea51224cadc8"},"schema_version":"1.0","source":{"id":"2605.14200","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14200","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14200v1","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14200","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"pith_short_12","alias_value":"RS6XGWCWJNK7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"RS6XGWCWJNK7PNWC","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"RS6XGWCW","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:38837ada67f25fb5fb9d41c079ef0bc5755d7fe7285965b0f728a8507b5982b4","target":"graph","created_at":"2026-05-17T23:39:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments verify that MSSP robustly recovers learning rate transfer and monotonic improvement with scale across regimes. Combined with existing depth-scaling theory, these results provide a complete scaling prescription for MoE architectures as a function of width, depth, expert width, and number of experts."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The DMFT description of limiting training dynamics accurately captures the scale-dependent observables in the aggregation dynamics of MoE models in all three regimes, and that the maximal scale stability desiderata are the right refinement of muP."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"The authors derive a Maximally Scale-Stable Parameterization (MSSP) for MoE models that achieves robust learning-rate transfer and monotonic performance gains with scale across co-scaling regimes of width, experts, and sparsity."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Mixture-of-Experts models require a Maximally Scale-Stable Parameterization to restore learning-rate transfer and monotonic gains at scale."}],"snapshot_sha256":"df1ed8f1f7b36e9ce117ae9f7fd9c175ebc98dea20197fd95b39e369892f9100"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e6bb6c8203a02a28edbd308837a7eb343dbcfd2de8b3452e62a25eebdf5d3d0c"},"paper":{"abstract_excerpt":"Recent frontier large language models predominantly rely on Mixture-of-Experts (MoE) architectures. Despite empirical progress, there is still no principled understanding of how hyperparameters should scale with network width $N$, expert width $N_e$, number of experts $M$, sparsity $K$, and depth $L$ to ensure both stability and optimal performance at scale. We take a principled step toward resolving this gap by analyzing three different scaling regimes: (I) co-scaling $N\\asymp N_e$, (II) co-scaling $N\\asymp M\\asymp K$, and (III) full proportional scaling of $N, N_e, M$, and $K$. For each regi","authors_text":"Alessandro Breccia, Leena Chennuru Vankadara, Luke Hayward, Moritz Haas, Sebastian Bordt","cross_cats":["stat.ML"],"headline":"Mixture-of-Experts models require a Maximally Scale-Stable Parameterization to restore learning-rate transfer and monotonic gains at scale.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T23:32:00Z","title":"How to Scale Mixture-of-Experts: From muP to the Maximally Scale-Stable Parameterization"},"references":{"count":300,"internal_anchors":13,"resolved_work":300,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"arXiv preprint arXiv:2512.22768 , year=","work_id":"8057482a-2cb6-42e6-a46f-e6542c181011","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"CS 231N , volume=","work_id":"dd43a69f-c43e-4948-8ec1-87068c776a10","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Generalization and Scaling Laws for Mixture-of-Experts Transformers , author=. 2026 , note=","work_id":"6f68a207-1758-4f32-84da-3b1817876228","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"arXiv preprint arXiv:2407.04153 , year=","work_id":"edf386eb-cf36-47f2-b15f-fad92344ed8a","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"arXiv preprint arXiv:2402.07871 , year=","work_id":"e67733fa-7550-4e7a-b1e0-d65341a18264","year":null}],"snapshot_sha256":"a9c5da219652f2822c95c9c4ef54db647fa1c5c3d0b13e2b1be9e00dedeea2e5"},"source":{"id":"2605.14200","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T04:42:00.533798Z","id":"645bc396-a8cb-41b0-b98b-00ebd4de8ae5","model_set":{"reader":"grok-4.3"},"one_line_summary":"The authors derive a Maximally Scale-Stable Parameterization (MSSP) for MoE models that achieves robust learning-rate transfer and monotonic performance gains with scale across co-scaling regimes of width, experts, and sparsity.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Mixture-of-Experts models require a Maximally Scale-Stable Parameterization to restore learning-rate transfer and monotonic gains at scale.","strongest_claim":"Experiments verify that MSSP robustly recovers learning rate transfer and monotonic improvement with scale across regimes. Combined with existing depth-scaling theory, these results provide a complete scaling prescription for MoE architectures as a function of width, depth, expert width, and number of experts.","weakest_assumption":"The DMFT description of limiting training dynamics accurately captures the scale-dependent observables in the aggregation dynamics of MoE models in all three regimes, and that the maximal scale stability desiderata are the right refinement of muP."}},"verdict_id":"645bc396-a8cb-41b0-b98b-00ebd4de8ae5"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2e64c917f45acd43bee87ba1e5d9c7ea2b84f3f52c5bfb8812c902e36be5bbb7","target":"record","created_at":"2026-05-17T23:39:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5eb369bff6059693137a7555603f25b25cb1343bbf17526f994424c320d540f6","cross_cats_sorted":["stat.ML"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T23:32:00Z","title_canon_sha256":"ee0649dd4644fe53526dbe57ada7cc84065acbc9ac94140d0bbeea51224cadc8"},"schema_version":"1.0","source":{"id":"2605.14200","kind":"arxiv","version":1}},"canonical_sha256":"8cbd7358564b55f7b6c238c96a8785e91f7f88c99329040b80b5cd2156f0acde","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"8cbd7358564b55f7b6c238c96a8785e91f7f88c99329040b80b5cd2156f0acde","first_computed_at":"2026-05-17T23:39:11.056526Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:11.056526Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"uwUz6YtA+A5FYX84XpLw8PDP46JRGosnTRuCsKQiO27ts8RxTT6ld7dpmkGHxmAeK72ftlzd+CNZ7iyT/XH7Cw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:11.057074Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14200","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2e64c917f45acd43bee87ba1e5d9c7ea2b84f3f52c5bfb8812c902e36be5bbb7","sha256:38837ada67f25fb5fb9d41c079ef0bc5755d7fe7285965b0f728a8507b5982b4"],"state_sha256":"dcb15e34ff1c1b4e047f10e94c5382827568bc06de05eb94efa3e83ca7201472"}