{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2020:WVMNY27ORQRCGY3BXFUU6EKCWV","short_pith_number":"pith:WVMNY27O","canonical_record":{"source":{"id":"2006.15704","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.DC","submitted_at":"2020-06-28T20:39:45Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"56fb8a04d11c1bb768f2d45e28e499a3d58de29e597e84838d5d5da921881e1f","abstract_canon_sha256":"bbd501c503bf854713f3294e6d4239118a5d4d86df4399aa29fe93a2e5f475e5"},"schema_version":"1.0"},"canonical_sha256":"b558dc6bee8c22236361b9694f1142b57431b7e931c3b1b02fcbf29e0e94bffe","source":{"kind":"arxiv","id":"2006.15704","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2006.15704","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2006.15704v1","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2006.15704","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"WVMNY27ORQRC","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"WVMNY27ORQRCGY3B","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"WVMNY27O","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2020:WVMNY27ORQRCGY3BXFUU6EKCWV","target":"record","payload":{"canonical_record":{"source":{"id":"2006.15704","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.DC","submitted_at":"2020-06-28T20:39:45Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"56fb8a04d11c1bb768f2d45e28e499a3d58de29e597e84838d5d5da921881e1f","abstract_canon_sha256":"bbd501c503bf854713f3294e6d4239118a5d4d86df4399aa29fe93a2e5f475e5"},"schema_version":"1.0"},"canonical_sha256":"b558dc6bee8c22236361b9694f1142b57431b7e931c3b1b02fcbf29e0e94bffe","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.320230Z","signature_b64":"EPV+XAqSb2rsZLlMQRGh6MacQKwG8+0/nZlf8q2Uf3gRHtyxH+C2hY23pAeDgZ38dlrgEhSgK0tXsqIsmP89Bg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b558dc6bee8c22236361b9694f1142b57431b7e931c3b1b02fcbf29e0e94bffe","last_reissued_at":"2026-05-17T23:38:13.319742Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.319742Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2006.15704","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"gNdP0PnUXxq3o7m8XUsXYgxCKHKAoWIql3Tnbh8Qz8WDCDJ67NIgkye3AcJ7UqGNbghv7mbrbyV0zeF0Xo+BAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T05:25:17.035310Z"},"content_sha256":"06e99f3665cf22d6748dd1a4b4fe0a879a950b5e04ad52606a7bf803139dea80","schema_version":"1.0","event_id":"sha256:06e99f3665cf22d6748dd1a4b4fe0a879a950b5e04ad52606a7bf803139dea80"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2020:WVMNY27ORQRCGY3BXFUU6EKCWV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"PyTorch Distributed: Experiences on Accelerating Data Parallel Training","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"PyTorch's distributed data parallel module achieves near-linear scaling to 256 GPUs by overlapping computation with communication.","cross_cats":["cs.LG"],"primary_cat":"cs.DC","authors_text":"Adam Paszke, Brian Vaughan, Jeff Smith, Omkar Salpekar, Pieter Noordhuis, Pritam Damania, Rohan Varma, Shen Li, Soumith Chintala, Teng Li, Yanli Zhao","submitted_at":"2020-06-28T20:39:45Z","abstract_excerpt":"This paper presents the design, implementation, and evaluation of the PyTorch distributed data parallel module. PyTorch is a widely-adopted scientific computing package used in deep learning research and applications. Recent advances in deep learning argue for the value of large datasets and large models, which necessitates the ability to scale out model training to more computational resources. Data parallelism has emerged as a popular solution for distributed training thanks to its straightforward principle and broad applicability. In general, the technique of distributed data parallelism re"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Evaluations show that, when configured appropriately, the PyTorch distributed data parallel module attains near-linear scalability using 256 GPUs.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that typical deep learning models have enough computation per layer to effectively overlap with gradient communication and that the underlying network fabric supports low-latency all-reduce operations at the tested scale.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"PyTorch distributed data parallel attains near-linear scalability on 256 GPUs through gradient bucketing, computation-communication overlap, and selective synchronization skipping.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"PyTorch's distributed data parallel module achieves near-linear scaling to 256 GPUs by overlapping computation with communication.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"bac7834c700c915c21489719e51680e26098d3fe8029a6aa988ff9bc19ea4d6c"},"source":{"id":"2006.15704","kind":"arxiv","version":1},"verdict":{"id":"ed5f0378-f648-4cb2-98c6-4432d51cbae2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T19:09:51.262341Z","strongest_claim":"Evaluations show that, when configured appropriately, the PyTorch distributed data parallel module attains near-linear scalability using 256 GPUs.","one_line_summary":"PyTorch distributed data parallel attains near-linear scalability on 256 GPUs through gradient bucketing, computation-communication overlap, and selective synchronization skipping.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that typical deep learning models have enough computation per layer to effectively overlap with gradient communication and that the underlying network fabric supports low-latency all-reduce operations at the tested scale.","pith_extraction_headline":"PyTorch's distributed data parallel module achieves near-linear scaling to 256 GPUs by overlapping computation with communication."},"references":{"count":48,"sample":[{"doi":"","year":2006,"title":"PyTorch Distributed: Experiences on Accelerating Data Parallel Training","work_id":"353279b8-3b33-45fd-9b64-41e5bd1708b9","ref_index":1,"cited_arxiv_id":"2006.15704","is_internal_anchor":true},{"doi":"","year":null,"title":"Then, we explain and justify the idea of data parallelism and describe communication primitives","work_id":"63ffbee3-2352-45e0-b30f-65e8698633b0","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"During distributed training, each pro- cess has its own local model replica and local optimizer","work_id":"e7e10062-1089-40c0-8004-fc2537bb126a","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"This section focus on the current status as of PyTorch v1.5.0","work_id":"96d96112-f4f7-4336-ac4a-53e4bab42d7f","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"In the exclusive cluster, the GPUs are located on 4 servers, connected using Mellanox MT27700 ConnectX-4 100GB/s NIC","work_id":"3ebc3ab6-6d37-40b7-b04f-ebff8257f8ed","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":48,"snapshot_sha256":"9d526b9adeb1a0aaff8e34edee60eb08c01dbd8db2679c2d899465733c01ebd9","internal_anchors":6},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"ed5f0378-f648-4cb2-98c6-4432d51cbae2"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"KJjl0fKNbx+iGUATIWY7IgyKKLp4Pry+sKx1ZZmiByNPkvG13IPZbcyWdu0RpLM+YhJvQhSf4nMByutjrWr+AA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T05:25:17.036348Z"},"content_sha256":"63c6e16989ad748e361851ae9874f8b00c7f76fe073c1724b52187653569fa12","schema_version":"1.0","event_id":"sha256:63c6e16989ad748e361851ae9874f8b00c7f76fe073c1724b52187653569fa12"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WVMNY27ORQRCGY3BXFUU6EKCWV/bundle.json","state_url":"https://pith.science/pith/WVMNY27ORQRCGY3BXFUU6EKCWV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WVMNY27ORQRCGY3BXFUU6EKCWV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-23T05:25:17Z","links":{"resolver":"https://pith.science/pith/WVMNY27ORQRCGY3BXFUU6EKCWV","bundle":"https://pith.science/pith/WVMNY27ORQRCGY3BXFUU6EKCWV/bundle.json","state":"https://pith.science/pith/WVMNY27ORQRCGY3BXFUU6EKCWV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WVMNY27ORQRCGY3BXFUU6EKCWV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2020:WVMNY27ORQRCGY3BXFUU6EKCWV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"bbd501c503bf854713f3294e6d4239118a5d4d86df4399aa29fe93a2e5f475e5","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.DC","submitted_at":"2020-06-28T20:39:45Z","title_canon_sha256":"56fb8a04d11c1bb768f2d45e28e499a3d58de29e597e84838d5d5da921881e1f"},"schema_version":"1.0","source":{"id":"2006.15704","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2006.15704","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2006.15704v1","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2006.15704","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"WVMNY27ORQRC","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"WVMNY27ORQRCGY3B","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"WVMNY27O","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:63c6e16989ad748e361851ae9874f8b00c7f76fe073c1724b52187653569fa12","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Evaluations show that, when configured appropriately, the PyTorch distributed data parallel module attains near-linear scalability using 256 GPUs."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that typical deep learning models have enough computation per layer to effectively overlap with gradient communication and that the underlying network fabric supports low-latency all-reduce operations at the tested scale."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"PyTorch distributed data parallel attains near-linear scalability on 256 GPUs through gradient bucketing, computation-communication overlap, and selective synchronization skipping."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"PyTorch's distributed data parallel module achieves near-linear scaling to 256 GPUs by overlapping computation with communication."}],"snapshot_sha256":"bac7834c700c915c21489719e51680e26098d3fe8029a6aa988ff9bc19ea4d6c"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"This paper presents the design, implementation, and evaluation of the PyTorch distributed data parallel module. PyTorch is a widely-adopted scientific computing package used in deep learning research and applications. Recent advances in deep learning argue for the value of large datasets and large models, which necessitates the ability to scale out model training to more computational resources. Data parallelism has emerged as a popular solution for distributed training thanks to its straightforward principle and broad applicability. In general, the technique of distributed data parallelism re","authors_text":"Adam Paszke, Brian Vaughan, Jeff Smith, Omkar Salpekar, Pieter Noordhuis, Pritam Damania, Rohan Varma, Shen Li, Soumith Chintala, Teng Li, Yanli Zhao","cross_cats":["cs.LG"],"headline":"PyTorch's distributed data parallel module achieves near-linear scaling to 256 GPUs by overlapping computation with communication.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.DC","submitted_at":"2020-06-28T20:39:45Z","title":"PyTorch Distributed: Experiences on Accelerating Data Parallel Training"},"references":{"count":48,"internal_anchors":6,"resolved_work":48,"sample":[{"cited_arxiv_id":"2006.15704","doi":"","is_internal_anchor":true,"ref_index":1,"title":"PyTorch Distributed: Experiences on Accelerating Data Parallel Training","work_id":"353279b8-3b33-45fd-9b64-41e5bd1708b9","year":2006},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Then, we explain and justify the idea of data parallelism and describe communication primitives","work_id":"63ffbee3-2352-45e0-b30f-65e8698633b0","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"During distributed training, each pro- cess has its own local model replica and local optimizer","work_id":"e7e10062-1089-40c0-8004-fc2537bb126a","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"This section focus on the current status as of PyTorch v1.5.0","work_id":"96d96112-f4f7-4336-ac4a-53e4bab42d7f","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"In the exclusive cluster, the GPUs are located on 4 servers, connected using Mellanox MT27700 ConnectX-4 100GB/s NIC","work_id":"3ebc3ab6-6d37-40b7-b04f-ebff8257f8ed","year":null}],"snapshot_sha256":"9d526b9adeb1a0aaff8e34edee60eb08c01dbd8db2679c2d899465733c01ebd9"},"source":{"id":"2006.15704","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T19:09:51.262341Z","id":"ed5f0378-f648-4cb2-98c6-4432d51cbae2","model_set":{"reader":"grok-4.3"},"one_line_summary":"PyTorch distributed data parallel attains near-linear scalability on 256 GPUs through gradient bucketing, computation-communication overlap, and selective synchronization skipping.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"PyTorch's distributed data parallel module achieves near-linear scaling to 256 GPUs by overlapping computation with communication.","strongest_claim":"Evaluations show that, when configured appropriately, the PyTorch distributed data parallel module attains near-linear scalability using 256 GPUs.","weakest_assumption":"The assumption that typical deep learning models have enough computation per layer to effectively overlap with gradient communication and that the underlying network fabric supports low-latency all-reduce operations at the tested scale."}},"verdict_id":"ed5f0378-f648-4cb2-98c6-4432d51cbae2"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:06e99f3665cf22d6748dd1a4b4fe0a879a950b5e04ad52606a7bf803139dea80","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"bbd501c503bf854713f3294e6d4239118a5d4d86df4399aa29fe93a2e5f475e5","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.DC","submitted_at":"2020-06-28T20:39:45Z","title_canon_sha256":"56fb8a04d11c1bb768f2d45e28e499a3d58de29e597e84838d5d5da921881e1f"},"schema_version":"1.0","source":{"id":"2006.15704","kind":"arxiv","version":1}},"canonical_sha256":"b558dc6bee8c22236361b9694f1142b57431b7e931c3b1b02fcbf29e0e94bffe","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b558dc6bee8c22236361b9694f1142b57431b7e931c3b1b02fcbf29e0e94bffe","first_computed_at":"2026-05-17T23:38:13.319742Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.319742Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"EPV+XAqSb2rsZLlMQRGh6MacQKwG8+0/nZlf8q2Uf3gRHtyxH+C2hY23pAeDgZ38dlrgEhSgK0tXsqIsmP89Bg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.320230Z","signed_message":"canonical_sha256_bytes"},"source_id":"2006.15704","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:06e99f3665cf22d6748dd1a4b4fe0a879a950b5e04ad52606a7bf803139dea80","sha256:63c6e16989ad748e361851ae9874f8b00c7f76fe073c1724b52187653569fa12"],"state_sha256":"17842838ec8cd81fdeea083a4253e689f15b6345e83fa87105eb0d1345edfaa6"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Rca2OhvgR6+/gTB5oGn/rcvrrFgHjsWGBZ3GTXaxlicY5/hQkvc13eax5p/QRmfvdq/B41qVxQFj+pkoPaK8BA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-23T05:25:17.041289Z","bundle_sha256":"75272774e62efb2be5726ee2322b36936f586f1d0490a8413f174ed2d7253a81"}}