{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:XTSUYJTUK6ZORCBTRA2MFHAPNF","short_pith_number":"pith:XTSUYJTU","schema_version":"1.0","canonical_sha256":"bce54c267457b2e888338834c29c0f695af0e57deab72e5aeba9661c8cda6ad7","source":{"kind":"arxiv","id":"1805.09767","version":3},"attestation_state":"computed","paper":{"title":"Local SGD Converges Fast and Communicates Little","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.DC","cs.LG"],"primary_cat":"math.OC","authors_text":"Sebastian U. Stich","submitted_at":"2018-05-24T16:38:51Z","abstract_excerpt":"Mini-batch stochastic gradient descent (SGD) is state of the art in large scale distributed training. The scheme can reach a linear speedup with respect to the number of workers, but this is rarely seen in practice as the scheme often suffers from large network delays and bandwidth limits. To overcome this communication bottleneck recent works propose to reduce the communication frequency. An algorithm of this type is local SGD that runs SGD independently in parallel on different workers and averages the sequences only once in a while.\n  This scheme shows promising results in practice, but elu"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1805.09767","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"math.OC","submitted_at":"2018-05-24T16:38:51Z","cross_cats_sorted":["cs.DC","cs.LG"],"title_canon_sha256":"d3c61ea3914cbaba5ed0d86cdcbb73712b9577d2cd10e261d35f36e262013989","abstract_canon_sha256":"38915077aef79005b39371209d020f8eaf928fab4d3ed638d962fab39cde1bef"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:47:07.443000Z","signature_b64":"ryng13ZZWPfuxtX5eFXhNwUXdyIaCqyvthYkJtyyKEnJaSiGY2Ppk2nL+s89pi9MbQrbJcML+/dzybuoSaHKDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bce54c267457b2e888338834c29c0f695af0e57deab72e5aeba9661c8cda6ad7","last_reissued_at":"2026-05-17T23:47:07.442333Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:47:07.442333Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Local SGD Converges Fast and Communicates Little","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.DC","cs.LG"],"primary_cat":"math.OC","authors_text":"Sebastian U. Stich","submitted_at":"2018-05-24T16:38:51Z","abstract_excerpt":"Mini-batch stochastic gradient descent (SGD) is state of the art in large scale distributed training. The scheme can reach a linear speedup with respect to the number of workers, but this is rarely seen in practice as the scheme often suffers from large network delays and bandwidth limits. To overcome this communication bottleneck recent works propose to reduce the communication frequency. An algorithm of this type is local SGD that runs SGD independently in parallel on different workers and averages the sequences only once in a while.\n  This scheme shows promising results in practice, but elu"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1805.09767","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1805.09767","created_at":"2026-05-17T23:47:07.442449+00:00"},{"alias_kind":"arxiv_version","alias_value":"1805.09767v3","created_at":"2026-05-17T23:47:07.442449+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1805.09767","created_at":"2026-05-17T23:47:07.442449+00:00"},{"alias_kind":"pith_short_12","alias_value":"XTSUYJTUK6ZO","created_at":"2026-05-18T12:33:01.666342+00:00"},{"alias_kind":"pith_short_16","alias_value":"XTSUYJTUK6ZORCBT","created_at":"2026-05-18T12:33:01.666342+00:00"},{"alias_kind":"pith_short_8","alias_value":"XTSUYJTU","created_at":"2026-05-18T12:33:01.666342+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":7,"internal_anchor_count":7,"sample":[{"citing_arxiv_id":"2605.20353","citing_title":"Synchronous and Asynchronous Parallelism Approaches for Generalized Canonical Polyadic Tensor Decomposition with GenTen","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15573","citing_title":"Response-Conditioned Parallel-to-Sequential Orchestration for Multi-Agent Systems","ref_index":143,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18656","citing_title":"Statistical Limits and Efficient Algorithms for Differentially Private Federated Learning","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19629","citing_title":"Gaussian Approximation and Multiplier Bootstrap for Federated Linear Stochastic Approximation","ref_index":93,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16184","citing_title":"Runtime-Orchestrated Second-Order Optimization for Scalable LLM Training","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14350","citing_title":"Distributionally Robust Multi-Task Reinforcement Learning via Adaptive Task Sampling","ref_index":233,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13434","citing_title":"Rescaled Asynchronous SGD: Optimal Distributed Optimization under Data and System Heterogeneity","ref_index":88,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF","json":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF.json","graph_json":"https://pith.science/api/pith-number/XTSUYJTUK6ZORCBTRA2MFHAPNF/graph.json","events_json":"https://pith.science/api/pith-number/XTSUYJTUK6ZORCBTRA2MFHAPNF/events.json","paper":"https://pith.science/paper/XTSUYJTU"},"agent_actions":{"view_html":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF","download_json":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF.json","view_paper":"https://pith.science/paper/XTSUYJTU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1805.09767&json=true","fetch_graph":"https://pith.science/api/pith-number/XTSUYJTUK6ZORCBTRA2MFHAPNF/graph.json","fetch_events":"https://pith.science/api/pith-number/XTSUYJTUK6ZORCBTRA2MFHAPNF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF/action/storage_attestation","attest_author":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF/action/author_attestation","sign_citation":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF/action/citation_signature","submit_replication":"https://pith.science/pith/XTSUYJTUK6ZORCBTRA2MFHAPNF/action/replication_record"}},"created_at":"2026-05-17T23:47:07.442449+00:00","updated_at":"2026-05-17T23:47:07.442449+00:00"}