{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2016:MOFG3A7QWBDAUINH6DWUZE566B","short_pith_number":"pith:MOFG3A7Q","schema_version":"1.0","canonical_sha256":"638a6d83f0b0460a21a7f0ed4c93bef07b4a95d84d80c87368e8a94b82923888","source":{"kind":"arxiv","id":"1602.06709","version":1},"attestation_state":"computed","paper":{"title":"Distributed Deep Learning Using Synchronous Stochastic Gradient Descent","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.DC","authors_text":"Bharat Kaul, Dheevatsa Mudigere, Dhiraj Kalamkar, Dipankar Das, Karthikeyan Vaidynathan, Pradeep Dubey, Sasikanth Avancha, Srinivas Sridharan","submitted_at":"2016-02-22T10:31:24Z","abstract_excerpt":"We design and implement a distributed multinode synchronous SGD algorithm, without altering hyper parameters, or compressing data, or altering algorithmic behavior. We perform a detailed analysis of scaling, and identify optimal design points for different networks. We demonstrate scaling of CNNs on 100s of nodes, and present what we believe to be record training throughputs. A 512 minibatch VGG-A CNN training run is scaled 90X on 128 nodes. Also 256 minibatch VGG-A and OverFeat-FAST networks are scaled 53X and 42X respectively on a 64 node cluster. We also demonstrate the generality of our ap"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1602.06709","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2016-02-22T10:31:24Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"a99ac66f0835796e5e8d2935fba5bc4efb0f1b31e0a58c66d37edce752252eac","abstract_canon_sha256":"f3a8ff3767a444cc54fbe86d7d999df47734f7a20c123c75066b1c5920abea1e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:19:49.397193Z","signature_b64":"AAQoRWoZT8YXMiMQCSCRTiAQkMm19YHbXTM5JbaY9AUWovLjXUc6JaGgm73CpE9ffjBFbybcWsHNzt7WYQe/Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"638a6d83f0b0460a21a7f0ed4c93bef07b4a95d84d80c87368e8a94b82923888","last_reissued_at":"2026-05-18T01:19:49.396726Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:19:49.396726Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Distributed Deep Learning Using Synchronous Stochastic Gradient Descent","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.DC","authors_text":"Bharat Kaul, Dheevatsa Mudigere, Dhiraj Kalamkar, Dipankar Das, Karthikeyan Vaidynathan, Pradeep Dubey, Sasikanth Avancha, Srinivas Sridharan","submitted_at":"2016-02-22T10:31:24Z","abstract_excerpt":"We design and implement a distributed multinode synchronous SGD algorithm, without altering hyper parameters, or compressing data, or altering algorithmic behavior. We perform a detailed analysis of scaling, and identify optimal design points for different networks. We demonstrate scaling of CNNs on 100s of nodes, and present what we believe to be record training throughputs. A 512 minibatch VGG-A CNN training run is scaled 90X on 128 nodes. Also 256 minibatch VGG-A and OverFeat-FAST networks are scaled 53X and 42X respectively on a 64 node cluster. We also demonstrate the generality of our ap"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1602.06709","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1602.06709","created_at":"2026-05-18T01:19:49.396799+00:00"},{"alias_kind":"arxiv_version","alias_value":"1602.06709v1","created_at":"2026-05-18T01:19:49.396799+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1602.06709","created_at":"2026-05-18T01:19:49.396799+00:00"},{"alias_kind":"pith_short_12","alias_value":"MOFG3A7QWBDA","created_at":"2026-05-18T12:30:32.724797+00:00"},{"alias_kind":"pith_short_16","alias_value":"MOFG3A7QWBDAUINH","created_at":"2026-05-18T12:30:32.724797+00:00"},{"alias_kind":"pith_short_8","alias_value":"MOFG3A7Q","created_at":"2026-05-18T12:30:32.724797+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"1609.04836","citing_title":"On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima","ref_index":3,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B","json":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B.json","graph_json":"https://pith.science/api/pith-number/MOFG3A7QWBDAUINH6DWUZE566B/graph.json","events_json":"https://pith.science/api/pith-number/MOFG3A7QWBDAUINH6DWUZE566B/events.json","paper":"https://pith.science/paper/MOFG3A7Q"},"agent_actions":{"view_html":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B","download_json":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B.json","view_paper":"https://pith.science/paper/MOFG3A7Q","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1602.06709&json=true","fetch_graph":"https://pith.science/api/pith-number/MOFG3A7QWBDAUINH6DWUZE566B/graph.json","fetch_events":"https://pith.science/api/pith-number/MOFG3A7QWBDAUINH6DWUZE566B/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B/action/storage_attestation","attest_author":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B/action/author_attestation","sign_citation":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B/action/citation_signature","submit_replication":"https://pith.science/pith/MOFG3A7QWBDAUINH6DWUZE566B/action/replication_record"}},"created_at":"2026-05-18T01:19:49.396799+00:00","updated_at":"2026-05-18T01:19:49.396799+00:00"}