{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:DV3RCMSGJPFXENWOSOFM5K7UEV","short_pith_number":"pith:DV3RCMSG","schema_version":"1.0","canonical_sha256":"1d771132464bcb7236ce938aceabf425740033f551ee3820ccbd0bc4cc2918e9","source":{"kind":"arxiv","id":"2601.20205","version":3},"attestation_state":"computed","paper":{"title":"Hyperparameter Transfer with Mixture-of-Expert Layers","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Blake Bordelon, Boris Hanin, Cengiz Pehlevan, Tianze Jiang","submitted_at":"2026-01-28T03:02:30Z","abstract_excerpt":"Mixture-of-Experts (MoE) layers have emerged as an important tool in scaling up modern neural networks by decoupling total trainable parameters from activated parameters in the forward pass for each token. However, sparse MoEs add complexity to training due to (i) new trainable parameters (router weights) that, like all other parameter groups, require hyperparameter (HP) tuning; (ii) new architecture scale dimensions (number of and size of experts) that must be chosen and potentially taken large. To make HP selection cheap and reliable, we propose a new parameterization for transformer models "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.20205","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-28T03:02:30Z","cross_cats_sorted":[],"title_canon_sha256":"03516cb2a06e1efcf98f2755ddb6a1be3492a907bd02395ab50aabca998a1df4","abstract_canon_sha256":"dd4e3931644a53159d8803c5c0821ad39c48fd24b49896b0a3680f09928cc352"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T02:04:39.100897Z","signature_b64":"ndWKhqLTTuMpjVsqb0IXdL4PIWmXEuYPhNfBx9X7MSUsTpLjQwlTGTtBCzgYLHzvcUzjzLoYQoJeKtI8AmvIDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1d771132464bcb7236ce938aceabf425740033f551ee3820ccbd0bc4cc2918e9","last_reissued_at":"2026-05-22T02:04:39.100008Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T02:04:39.100008Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Hyperparameter Transfer with Mixture-of-Expert Layers","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Blake Bordelon, Boris Hanin, Cengiz Pehlevan, Tianze Jiang","submitted_at":"2026-01-28T03:02:30Z","abstract_excerpt":"Mixture-of-Experts (MoE) layers have emerged as an important tool in scaling up modern neural networks by decoupling total trainable parameters from activated parameters in the forward pass for each token. However, sparse MoEs add complexity to training due to (i) new trainable parameters (router weights) that, like all other parameter groups, require hyperparameter (HP) tuning; (ii) new architecture scale dimensions (number of and size of experts) that must be chosen and potentially taken large. To make HP selection cheap and reliable, we propose a new parameterization for transformer models "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.20205","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.20205/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.20205","created_at":"2026-05-22T02:04:39.100161+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.20205v3","created_at":"2026-05-22T02:04:39.100161+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.20205","created_at":"2026-05-22T02:04:39.100161+00:00"},{"alias_kind":"pith_short_12","alias_value":"DV3RCMSGJPFX","created_at":"2026-05-22T02:04:39.100161+00:00"},{"alias_kind":"pith_short_16","alias_value":"DV3RCMSGJPFXENWO","created_at":"2026-05-22T02:04:39.100161+00:00"},{"alias_kind":"pith_short_8","alias_value":"DV3RCMSG","created_at":"2026-05-22T02:04:39.100161+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2605.07870","citing_title":"Spectral Dynamics in Deep Networks: Feature Learning, Outlier Escape, and Learning Rate Transfer","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21486","citing_title":"Quantifying Hyperparameter Transfer and the Importance of Embedding Layer Learning Rate","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14200","citing_title":"How to Scale Mixture-of-Experts: From muP to the Maximally Scale-Stable Parameterization","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10164","citing_title":"Hyperparameter Transfer for Dense Associative Memories","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21691","citing_title":"There Will Be a Scientific Theory of Deep Learning","ref_index":232,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07870","citing_title":"Spectral Dynamics in Deep Networks: Feature Learning, Outlier Escape, and Learning Rate Transfer","ref_index":50,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV","json":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV.json","graph_json":"https://pith.science/api/pith-number/DV3RCMSGJPFXENWOSOFM5K7UEV/graph.json","events_json":"https://pith.science/api/pith-number/DV3RCMSGJPFXENWOSOFM5K7UEV/events.json","paper":"https://pith.science/paper/DV3RCMSG"},"agent_actions":{"view_html":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV","download_json":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV.json","view_paper":"https://pith.science/paper/DV3RCMSG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.20205&json=true","fetch_graph":"https://pith.science/api/pith-number/DV3RCMSGJPFXENWOSOFM5K7UEV/graph.json","fetch_events":"https://pith.science/api/pith-number/DV3RCMSGJPFXENWOSOFM5K7UEV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV/action/storage_attestation","attest_author":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV/action/author_attestation","sign_citation":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV/action/citation_signature","submit_replication":"https://pith.science/pith/DV3RCMSGJPFXENWOSOFM5K7UEV/action/replication_record"}},"created_at":"2026-05-22T02:04:39.100161+00:00","updated_at":"2026-05-22T02:04:39.100161+00:00"}