{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:A5KE4HW5NPUAPFV3W7D3PQL3A7","short_pith_number":"pith:A5KE4HW5","schema_version":"1.0","canonical_sha256":"07544e1edd6be80796bbb7c7b7c17b07c1d84aeff71194f46b86cb7767ea72f5","source":{"kind":"arxiv","id":"1712.02029","version":2},"attestation_state":"computed","paper":{"title":"AdaBatch: Adaptive Batch Sizes for Training Deep Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.DC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Aditya Devarakonda, Maxim Naumov, Michael Garland","submitted_at":"2017-12-06T04:19:14Z","abstract_excerpt":"Training deep neural networks with Stochastic Gradient Descent, or its variants, requires careful choice of both learning rate and batch size. While smaller batch sizes generally converge in fewer training epochs, larger batch sizes offer more parallelism and hence better computational efficiency. We have developed a new training approach that, rather than statically choosing a single batch size for all epochs, adaptively increases the batch size during the training process. Our method delivers the convergence rate of small batch sizes while achieving performance similar to large batch sizes. "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1712.02029","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2017-12-06T04:19:14Z","cross_cats_sorted":["cs.CV","cs.DC","stat.ML"],"title_canon_sha256":"5fe58d15f60cdab5d55ec0fa3e1a3eaba2c1696d3f95d9b0226330342c3016d6","abstract_canon_sha256":"7d050bfa8e36256f0a37f9312bb53dc2873f07b1fce9219b3461875f02343c4e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:23:22.799374Z","signature_b64":"ODhYqYOCfx2/IGjxJR4qkiB0oPtr/SWNYnpgBoSf1t170ufA0m4GlpGM2T4Fdg/Gzg2bxzwP3gzD6EcjAQyWCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"07544e1edd6be80796bbb7c7b7c17b07c1d84aeff71194f46b86cb7767ea72f5","last_reissued_at":"2026-05-18T00:23:22.798655Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:23:22.798655Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"AdaBatch: Adaptive Batch Sizes for Training Deep Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.DC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Aditya Devarakonda, Maxim Naumov, Michael Garland","submitted_at":"2017-12-06T04:19:14Z","abstract_excerpt":"Training deep neural networks with Stochastic Gradient Descent, or its variants, requires careful choice of both learning rate and batch size. While smaller batch sizes generally converge in fewer training epochs, larger batch sizes offer more parallelism and hence better computational efficiency. We have developed a new training approach that, rather than statically choosing a single batch size for all epochs, adaptively increases the batch size during the training process. Our method delivers the convergence rate of small batch sizes while achieving performance similar to large batch sizes. "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1712.02029","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1712.02029","created_at":"2026-05-18T00:23:22.798757+00:00"},{"alias_kind":"arxiv_version","alias_value":"1712.02029v2","created_at":"2026-05-18T00:23:22.798757+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1712.02029","created_at":"2026-05-18T00:23:22.798757+00:00"},{"alias_kind":"pith_short_12","alias_value":"A5KE4HW5NPUA","created_at":"2026-05-18T12:31:05.417338+00:00"},{"alias_kind":"pith_short_16","alias_value":"A5KE4HW5NPUAPFV3","created_at":"2026-05-18T12:31:05.417338+00:00"},{"alias_kind":"pith_short_8","alias_value":"A5KE4HW5","created_at":"2026-05-18T12:31:05.417338+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"1904.00962","citing_title":"Large Batch Optimization for Deep Learning: Training BERT in 76 minutes","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":128,"is_internal_anchor":true},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":170,"is_internal_anchor":false},{"citing_arxiv_id":"2604.06350","citing_title":"Convergence of Riemannian Stochastic Gradient Descents: Varying Batch Sizes And Nonstandard Batch Forming","ref_index":10,"is_internal_anchor":false},{"citing_arxiv_id":"2207.05221","citing_title":"Language Models (Mostly) Know What They Know","ref_index":247,"is_internal_anchor":false},{"citing_arxiv_id":"2604.26687","citing_title":"COPUS: Co-adaptive Parallelism and Batch Size Selection in Large Language Model Training","ref_index":4,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7","json":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7.json","graph_json":"https://pith.science/api/pith-number/A5KE4HW5NPUAPFV3W7D3PQL3A7/graph.json","events_json":"https://pith.science/api/pith-number/A5KE4HW5NPUAPFV3W7D3PQL3A7/events.json","paper":"https://pith.science/paper/A5KE4HW5"},"agent_actions":{"view_html":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7","download_json":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7.json","view_paper":"https://pith.science/paper/A5KE4HW5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1712.02029&json=true","fetch_graph":"https://pith.science/api/pith-number/A5KE4HW5NPUAPFV3W7D3PQL3A7/graph.json","fetch_events":"https://pith.science/api/pith-number/A5KE4HW5NPUAPFV3W7D3PQL3A7/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7/action/timestamp_anchor","attest_storage":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7/action/storage_attestation","attest_author":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7/action/author_attestation","sign_citation":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7/action/citation_signature","submit_replication":"https://pith.science/pith/A5KE4HW5NPUAPFV3W7D3PQL3A7/action/replication_record"}},"created_at":"2026-05-18T00:23:22.798757+00:00","updated_at":"2026-05-18T00:23:22.798757+00:00"}