{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2022:OVM55B4OOULKBSVVQ6PUHVEWGS","short_pith_number":"pith:OVM55B4O","schema_version":"1.0","canonical_sha256":"7559de878e7516a0cab5879f43d4963484951c89e0c5e69edc2dc02b28beb2f8","source":{"kind":"arxiv","id":"2201.11990","version":3},"attestation_state":"computed","paper":{"title":"Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Brandon Norick, Bryan Catanzaro, Elton Zhang, George Zerveas, Jared Casper, Julie Bernauer, Michael Houston, Mohammad Shoeybi, Mostofa Patwary, Patrick LeGresley, Rewon Child, Reza Yazdani Aminabadi, Samyam Rajbhandari, Saurabh Tiwary, Shaden Smith, Shrimai Prabhumoye, Vijay Korthikanti, Xia Song, Yuxiong He, Zhun Liu","submitted_at":"2022-01-28T08:59:57Z","abstract_excerpt":"Pretrained general-purpose language models can achieve state-of-the-art accuracies in various natural language processing domains by adapting to downstream tasks via zero-shot, few-shot and fine-tuning techniques. Because of their success, the size of these models has increased rapidly, requiring high-performance hardware, software, and algorithmic techniques to enable training such large models. As the result of a joint effort between Microsoft and NVIDIA, we present details on the training of the largest monolithic transformer based language model, Megatron-Turing NLG 530B (MT-NLG), with 530"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2201.11990","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-01-28T08:59:57Z","cross_cats_sorted":[],"title_canon_sha256":"a8821981d6c4bdeb44cb3a21db1640b9d268b26ece0652e89e6f79292718e7ea","abstract_canon_sha256":"86a01b263314585c601bafccee12ae6cc73f8e9eb8f3276c5c82db667542b812"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:22.070115Z","signature_b64":"a5b4qc+Rck0Q+jV2V0gXynHFDdKqpw8JI/u1qJ6KnGuxytQfbB8GOlF9DHa5jhHaanbz4xLGp8Vj76/LJYFFBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7559de878e7516a0cab5879f43d4963484951c89e0c5e69edc2dc02b28beb2f8","last_reissued_at":"2026-05-17T23:39:22.069555Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:22.069555Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Brandon Norick, Bryan Catanzaro, Elton Zhang, George Zerveas, Jared Casper, Julie Bernauer, Michael Houston, Mohammad Shoeybi, Mostofa Patwary, Patrick LeGresley, Rewon Child, Reza Yazdani Aminabadi, Samyam Rajbhandari, Saurabh Tiwary, Shaden Smith, Shrimai Prabhumoye, Vijay Korthikanti, Xia Song, Yuxiong He, Zhun Liu","submitted_at":"2022-01-28T08:59:57Z","abstract_excerpt":"Pretrained general-purpose language models can achieve state-of-the-art accuracies in various natural language processing domains by adapting to downstream tasks via zero-shot, few-shot and fine-tuning techniques. Because of their success, the size of these models has increased rapidly, requiring high-performance hardware, software, and algorithmic techniques to enable training such large models. As the result of a joint effort between Microsoft and NVIDIA, we present details on the training of the largest monolithic transformer based language model, Megatron-Turing NLG 530B (MT-NLG), with 530"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2201.11990","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2201.11990","created_at":"2026-05-17T23:39:22.069652+00:00"},{"alias_kind":"arxiv_version","alias_value":"2201.11990v3","created_at":"2026-05-17T23:39:22.069652+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2201.11990","created_at":"2026-05-17T23:39:22.069652+00:00"},{"alias_kind":"pith_short_12","alias_value":"OVM55B4OOULK","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"OVM55B4OOULKBSVV","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"OVM55B4O","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":42,"internal_anchor_count":22,"sample":[{"citing_arxiv_id":"2204.06745","citing_title":"GPT-NeoX-20B: An Open-Source Autoregressive Language Model","ref_index":87,"is_internal_anchor":true},{"citing_arxiv_id":"2501.01046","citing_title":"SEDD: Scalable and Efficient Dataset Deduplication with GPUs","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2305.02301","citing_title":"Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes","ref_index":98,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17164","citing_title":"Charon: A Unified and Fine-Grained Simulator for Large-Scale LLM Training and Inference","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2307.06435","citing_title":"A Comprehensive Overview of Large Language Models","ref_index":117,"is_internal_anchor":true},{"citing_arxiv_id":"2507.00432","citing_title":"Does Math Reasoning Improve General LLM Capabilities? Understanding Transferability of LLM Reasoning","ref_index":247,"is_internal_anchor":true},{"citing_arxiv_id":"2508.21613","citing_title":"Chameleon: Adaptive Fault Tolerance for Distributed Training via Real-time Policy Selection","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2509.07177","citing_title":"Towards EnergyGPT: A Large Language Model Specialized for the Energy Sector","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2305.16264","citing_title":"Scaling Data-Constrained Language Models","ref_index":107,"is_internal_anchor":true},{"citing_arxiv_id":"2304.06767","citing_title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","ref_index":112,"is_internal_anchor":true},{"citing_arxiv_id":"2207.14255","citing_title":"Efficient Training of Language Models to Fill in the Middle","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2303.17491","citing_title":"Language Models can Solve Computer Tasks","ref_index":64,"is_internal_anchor":true},{"citing_arxiv_id":"2208.03299","citing_title":"Atlas: Few-shot Learning with Retrieval Augmented Language Models","ref_index":249,"is_internal_anchor":true},{"citing_arxiv_id":"2208.03299","citing_title":"Atlas: Few-shot Learning with Retrieval Augmented Language Models","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2311.16867","citing_title":"The Falcon Series of Open Language Models","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2310.09478","citing_title":"MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2501.08313","citing_title":"MiniMax-01: Scaling Foundation Models with Lightning Attention","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2602.22437","citing_title":"veScale-FSDP: Flexible and High-Performance FSDP at Scale","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2304.01373","citing_title":"Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling","ref_index":113,"is_internal_anchor":true},{"citing_arxiv_id":"2603.14360","citing_title":"M$^2$RNN: Non-Linear RNNs with Matrix-Valued States for Scalable Language Modeling","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2209.05433","citing_title":"FP8 Formats for Deep Learning","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2205.00445","citing_title":"MRKL Systems: A modular, neuro-symbolic architecture that combines large language models, external knowledge sources and discrete reasoning","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2303.17564","citing_title":"BloombergGPT: A Large Language Model for Finance","ref_index":105,"is_internal_anchor":false},{"citing_arxiv_id":"2604.02473","citing_title":"Analyzing Reverse Address Translation Overheads in Multi-GPU Scale-Up Pods","ref_index":99,"is_internal_anchor":false},{"citing_arxiv_id":"2412.21187","citing_title":"Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs","ref_index":115,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS","json":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS.json","graph_json":"https://pith.science/api/pith-number/OVM55B4OOULKBSVVQ6PUHVEWGS/graph.json","events_json":"https://pith.science/api/pith-number/OVM55B4OOULKBSVVQ6PUHVEWGS/events.json","paper":"https://pith.science/paper/OVM55B4O"},"agent_actions":{"view_html":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS","download_json":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS.json","view_paper":"https://pith.science/paper/OVM55B4O","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2201.11990&json=true","fetch_graph":"https://pith.science/api/pith-number/OVM55B4OOULKBSVVQ6PUHVEWGS/graph.json","fetch_events":"https://pith.science/api/pith-number/OVM55B4OOULKBSVVQ6PUHVEWGS/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS/action/timestamp_anchor","attest_storage":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS/action/storage_attestation","attest_author":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS/action/author_attestation","sign_citation":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS/action/citation_signature","submit_replication":"https://pith.science/pith/OVM55B4OOULKBSVVQ6PUHVEWGS/action/replication_record"}},"created_at":"2026-05-17T23:39:22.069652+00:00","updated_at":"2026-05-17T23:39:22.069652+00:00"}