{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2021:NMB7QKC2NJYNHLPG6XMF2N35IK","short_pith_number":"pith:NMB7QKC2","schema_version":"1.0","canonical_sha256":"6b03f8285a6a70d3ade6f5d85d377d42a69898557b3236c98206a6751a356b42","source":{"kind":"arxiv","id":"2109.10465","version":1},"attestation_state":"computed","paper":{"title":"Scalable and Efficient MoE Training for Multitask Multilingual Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Alexandre Muzio, Ammar Ahmad Awan, Amr Hendy, Andres Felipe Cruz Salinas, Hany Hassan Awadalla, Liyang Lu, Samyam Rajbhandari, Young Jin Kim, Yuxiong He","submitted_at":"2021-09-22T00:57:46Z","abstract_excerpt":"The Mixture of Experts (MoE) models are an emerging class of sparsely activated deep learning models that have sublinear compute costs with respect to their parameters. In contrast with dense models, the sparse architecture of MoE offers opportunities for drastically growing model size with significant accuracy gain while consuming much lower compute budget. However, supporting large scale MoE training also has its own set of system and modeling challenges. To overcome the challenges and embrace the opportunities of MoE, we first develop a system capable of scaling MoE models efficiently to tr"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2109.10465","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-09-22T00:57:46Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"1081b2af858259c4909b86775021934b23d9342c1faf94b76a10c63484a84730","abstract_canon_sha256":"dcac02ae0b4559f7f4f48d8f5fa767bd4fe97a09a813604072a4c3e888cdbdad"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T03:16:36.497693Z","signature_b64":"9laqC697yEiudQfsgOXGMS9JdQ6p64rv25pgJF7NefPSl/e8AZRzyfN+UYwW705+vFIN3LktzSQEJXvfMrWlDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6b03f8285a6a70d3ade6f5d85d377d42a69898557b3236c98206a6751a356b42","last_reissued_at":"2026-07-05T03:16:36.497283Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T03:16:36.497283Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Scalable and Efficient MoE Training for Multitask Multilingual Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Alexandre Muzio, Ammar Ahmad Awan, Amr Hendy, Andres Felipe Cruz Salinas, Hany Hassan Awadalla, Liyang Lu, Samyam Rajbhandari, Young Jin Kim, Yuxiong He","submitted_at":"2021-09-22T00:57:46Z","abstract_excerpt":"The Mixture of Experts (MoE) models are an emerging class of sparsely activated deep learning models that have sublinear compute costs with respect to their parameters. In contrast with dense models, the sparse architecture of MoE offers opportunities for drastically growing model size with significant accuracy gain while consuming much lower compute budget. However, supporting large scale MoE training also has its own set of system and modeling challenges. To overcome the challenges and embrace the opportunities of MoE, we first develop a system capable of scaling MoE models efficiently to tr"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2109.10465","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2109.10465/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2109.10465","created_at":"2026-07-05T03:16:36.497342+00:00"},{"alias_kind":"arxiv_version","alias_value":"2109.10465v1","created_at":"2026-07-05T03:16:36.497342+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2109.10465","created_at":"2026-07-05T03:16:36.497342+00:00"},{"alias_kind":"pith_short_12","alias_value":"NMB7QKC2NJYN","created_at":"2026-07-05T03:16:36.497342+00:00"},{"alias_kind":"pith_short_16","alias_value":"NMB7QKC2NJYNHLPG","created_at":"2026-07-05T03:16:36.497342+00:00"},{"alias_kind":"pith_short_8","alias_value":"NMB7QKC2","created_at":"2026-07-05T03:16:36.497342+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2606.23546","citing_title":"The Energy Consumption of Transformer Fine-Tuning: A Roofline-Inspired Scaling Model","ref_index":18,"is_internal_anchor":false},{"citing_arxiv_id":"2607.01444","citing_title":"On the Utility and Factual Reliability of Pruned Mixture-of-Experts Models in the Biomedical Domain","ref_index":10,"is_internal_anchor":false},{"citing_arxiv_id":"2607.01789","citing_title":"EPnG: Adaptive Expert Prune-and-Grow for Parameter-Efficient MoE Fine-tuning","ref_index":13,"is_internal_anchor":false},{"citing_arxiv_id":"2309.14509","citing_title":"DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models","ref_index":171,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK","json":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK.json","graph_json":"https://pith.science/api/pith-number/NMB7QKC2NJYNHLPG6XMF2N35IK/graph.json","events_json":"https://pith.science/api/pith-number/NMB7QKC2NJYNHLPG6XMF2N35IK/events.json","paper":"https://pith.science/paper/NMB7QKC2"},"agent_actions":{"view_html":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK","download_json":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK.json","view_paper":"https://pith.science/paper/NMB7QKC2","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2109.10465&json=true","fetch_graph":"https://pith.science/api/pith-number/NMB7QKC2NJYNHLPG6XMF2N35IK/graph.json","fetch_events":"https://pith.science/api/pith-number/NMB7QKC2NJYNHLPG6XMF2N35IK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK/action/storage_attestation","attest_author":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK/action/author_attestation","sign_citation":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK/action/citation_signature","submit_replication":"https://pith.science/pith/NMB7QKC2NJYNHLPG6XMF2N35IK/action/replication_record"}},"created_at":"2026-07-05T03:16:36.497342+00:00","updated_at":"2026-07-05T03:16:36.497342+00:00"}