{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2019:LG76FO4FXIRQAKPLNJWYADOHLW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"769410855d6e6defbf18a87865b61cd2c4373b74c87a93f622ec300280dd1a77","cross_cats_sorted":["cs.DC","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-10-04T17:29:39Z","title_canon_sha256":"5c51bb8d9d15dc00904edb477c9632c6ae88312b10fbfa1a9d71978551cf7643"},"schema_version":"1.0","source":{"id":"1910.02054","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1910.02054","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"1910.02054v3","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1910.02054","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"LG76FO4FXIRQ","created_at":"2026-05-18T12:33:21Z"},{"alias_kind":"pith_short_16","alias_value":"LG76FO4FXIRQAKPL","created_at":"2026-05-18T12:33:21Z"},{"alias_kind":"pith_short_8","alias_value":"LG76FO4F","created_at":"2026-05-18T12:33:21Z"}],"graph_snapshots":[{"event_id":"sha256:c319e829a4037356c9248e48bd7c3a58f9c2c164222e388885e96e7e16fdca14","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"ZeRO eliminates memory redundancies in data- and model-parallel training while retaining low communication volume and high computational granularity, allowing us to scale the model size proportional to the number of devices with sustained high efficiency. Our analysis demonstrates ZeRO has the potential to scale beyond 1 Trillion parameters using today's hardware."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that partitioning optimizer states and gradients will not introduce new communication bottlenecks or synchronization overheads that scale worse than linearly when moving to thousands of devices."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ZeRO removes memory redundancies in parallel training to scale deep learning models to over a trillion parameters with high throughput on current hardware."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"ZeRO partitions optimizer states and gradients across devices to remove memory redundancy in parallel training."}],"snapshot_sha256":"fed046d505278de78c538980ee62cc7728f9370deeb337a2015fab3ca89efd8c"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"3615ab0fe103150bdf437367540d9e4304f066ef41e6d15fa1275ecfbbad8550"},"paper":{"abstract_excerpt":"Large deep learning models offer significant accuracy gains, but training billions to trillions of parameters is challenging. Existing solutions such as data and model parallelisms exhibit fundamental limitations to fit these models into limited device memory, while obtaining computation, communication and development efficiency. We develop a novel solution, Zero Redundancy Optimizer (ZeRO), to optimize memory, vastly improving training speed while increasing the model size that can be efficiently trained. ZeRO eliminates memory redundancies in data- and model-parallel training while retaining","authors_text":"Jeff Rasley, Olatunji Ruwase, Samyam Rajbhandari, Yuxiong He","cross_cats":["cs.DC","stat.ML"],"headline":"ZeRO partitions optimizer states and gradients across devices to remove memory redundancy in parallel training.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-10-04T17:29:39Z","title":"ZeRO: Memory Optimizations Toward Training Trillion Parameter Models"},"references":{"count":26,"internal_anchors":9,"resolved_work":26,"sample":[{"cited_arxiv_id":"1810.04805","doi":"","is_internal_anchor":true,"ref_index":1,"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Language models are unsupervised multitask learners","work_id":"47eda437-4651-49ed-bbe6-d63c3b7a78a9","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","work_id":"6dc163d1-d970-4fb3-9587-d5a203ab3150","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Colin Raﬀel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. Exploring the limits of transfer learn- ing with a uniﬁed text-to-text tran","work_id":"a750effe-a64d-4fa6-b86b-425475f72f39","year":2019},{"cited_arxiv_id":"1811.02084","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Nimit Sharad Sohoni, Christopher Richard Aberger, Megan Leszczynski, Jian Zhang, and Christo- pher R´e","work_id":"b2e01cf2-a2ed-470f-879d-29d3ba18624f","year":2018}],"snapshot_sha256":"6ac7d75af76466d71edc9d6819b30d58f7e0540e2b455fe24106fd3131c35e45"},"source":{"id":"1910.02054","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T09:20:59.131371Z","id":"96ad60f6-935f-44ea-a8c6-1d8168a2b1d0","model_set":{"reader":"grok-4.3"},"one_line_summary":"ZeRO removes memory redundancies in parallel training to scale deep learning models to over a trillion parameters with high throughput on current hardware.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"ZeRO partitions optimizer states and gradients across devices to remove memory redundancy in parallel training.","strongest_claim":"ZeRO eliminates memory redundancies in data- and model-parallel training while retaining low communication volume and high computational granularity, allowing us to scale the model size proportional to the number of devices with sustained high efficiency. Our analysis demonstrates ZeRO has the potential to scale beyond 1 Trillion parameters using today's hardware.","weakest_assumption":"The assumption that partitioning optimizer states and gradients will not introduce new communication bottlenecks or synchronization overheads that scale worse than linearly when moving to thousands of devices."}},"verdict_id":"96ad60f6-935f-44ea-a8c6-1d8168a2b1d0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2fe585a2cae5a603e9cd569601364acaf2e99cb996b9844503ba88306c301480","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"769410855d6e6defbf18a87865b61cd2c4373b74c87a93f622ec300280dd1a77","cross_cats_sorted":["cs.DC","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-10-04T17:29:39Z","title_canon_sha256":"5c51bb8d9d15dc00904edb477c9632c6ae88312b10fbfa1a9d71978551cf7643"},"schema_version":"1.0","source":{"id":"1910.02054","kind":"arxiv","version":3}},"canonical_sha256":"59bfe2bb85ba230029eb6a6d800dc75da176779950b8cf7ce12fe03970dfb98d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"59bfe2bb85ba230029eb6a6d800dc75da176779950b8cf7ce12fe03970dfb98d","first_computed_at":"2026-05-17T23:38:48.364346Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.364346Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"PBoYjcEAJHLwGKkGpkXMYLMxB1pvRcJkJUjL8YdmV4AXfhNqavUrrBhYxFBl83b2iH0QeKNCVbhpwCTkMRvSDg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.365001Z","signed_message":"canonical_sha256_bytes"},"source_id":"1910.02054","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2fe585a2cae5a603e9cd569601364acaf2e99cb996b9844503ba88306c301480","sha256:c319e829a4037356c9248e48bd7c3a58f9c2c164222e388885e96e7e16fdca14"],"state_sha256":"1a7acb9d8ee16a272870e93ccf3691c2b94ee3c90c30174f508c9e6737bd9021"}