{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:TQYZNWM7LTX7NZE7BQU2WBYOZF","short_pith_number":"pith:TQYZNWM7","schema_version":"1.0","canonical_sha256":"9c3196d99f5ceff6e49f0c29ab070ec956f92bc8f5620b730d9d9c7b29a43060","source":{"kind":"arxiv","id":"1812.06162","version":1},"attestation_state":"computed","paper":{"title":"An Empirical Model of Large-Batch Training","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Dario Amodei, Jared Kaplan, OpenAI Dota Team, Sam McCandlish","submitted_at":"2018-12-14T20:49:09Z","abstract_excerpt":"In an increasing number of domains it has been demonstrated that deep learning models can be trained using relatively large batch sizes without sacrificing data efficiency. However the limits of this massive data parallelism seem to differ from domain to domain, ranging from batches of tens of thousands in ImageNet to batches of millions in RL agents that play the game Dota 2. To our knowledge there is limited conceptual understanding of why these limits to batch size differ or how we might choose the correct batch size in a new domain. In this paper, we demonstrate that a simple and easy-to-m"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1812.06162","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-12-14T20:49:09Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"b04d8b12c826ea6cf0c514b13972734c91eb0585367f99ef5906b84d0c0df528","abstract_canon_sha256":"80c77b30385891d7316bef8c2d7a2c6fa3f2279ad8e4efc7906910fd3d019abb"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:58:12.990589Z","signature_b64":"9YFEhR2OtOVDXlk6UgpMwdlL+c87gUjucRMYFV10b36NCeWOMxm5Hm9RELoDTUM8IE8Quy9nKAViCvm1n4KpAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9c3196d99f5ceff6e49f0c29ab070ec956f92bc8f5620b730d9d9c7b29a43060","last_reissued_at":"2026-05-17T23:58:12.990057Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:58:12.990057Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"An Empirical Model of Large-Batch Training","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Dario Amodei, Jared Kaplan, OpenAI Dota Team, Sam McCandlish","submitted_at":"2018-12-14T20:49:09Z","abstract_excerpt":"In an increasing number of domains it has been demonstrated that deep learning models can be trained using relatively large batch sizes without sacrificing data efficiency. However the limits of this massive data parallelism seem to differ from domain to domain, ranging from batches of tens of thousands in ImageNet to batches of millions in RL agents that play the game Dota 2. To our knowledge there is limited conceptual understanding of why these limits to batch size differ or how we might choose the correct batch size in a new domain. In this paper, we demonstrate that a simple and easy-to-m"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.06162","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1812.06162","created_at":"2026-05-17T23:58:12.990138+00:00"},{"alias_kind":"arxiv_version","alias_value":"1812.06162v1","created_at":"2026-05-17T23:58:12.990138+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.06162","created_at":"2026-05-17T23:58:12.990138+00:00"},{"alias_kind":"pith_short_12","alias_value":"TQYZNWM7LTX7","created_at":"2026-05-18T12:32:56.356000+00:00"},{"alias_kind":"pith_short_16","alias_value":"TQYZNWM7LTX7NZE7","created_at":"2026-05-18T12:32:56.356000+00:00"},{"alias_kind":"pith_short_8","alias_value":"TQYZNWM7","created_at":"2026-05-18T12:32:56.356000+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":30,"internal_anchor_count":14,"sample":[{"citing_arxiv_id":"2505.24275","citing_title":"GradPower: Powering Gradients for Faster Language Model Pre-Training","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21557","citing_title":"Scalable Reinforcement Learning via Adaptive Batch Scaling","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16844","citing_title":"Artificial Adaptive Intelligence: The Missing Stage Between Narrow and General Intelligence","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2307.06435","citing_title":"A Comprehensive Overview of Large Language Models","ref_index":107,"is_internal_anchor":true},{"citing_arxiv_id":"2507.00432","citing_title":"Does Math Reasoning Improve General LLM Capabilities? Understanding Transferability of LLM Reasoning","ref_index":240,"is_internal_anchor":true},{"citing_arxiv_id":"2510.04686","citing_title":"How does the optimizer implicitly bias the model merging loss landscape?","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":92,"is_internal_anchor":true},{"citing_arxiv_id":"1910.02054","citing_title":"ZeRO: Memory Optimizations Toward Training Trillion Parameter Models","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2304.01373","citing_title":"Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling","ref_index":126,"is_internal_anchor":true},{"citing_arxiv_id":"2603.11178","citing_title":"PACED: Distillation and On-Policy Self-Distillation at the Frontier of Student Competence","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2603.17771","citing_title":"Attention Sinks Induce Gradient Sinks: Massive Activations as Gradient Regulators in Transformers","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22347","citing_title":"Intelligence Inertia: Physical Isomorphism and Applications","ref_index":73,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14200","citing_title":"How to Scale Mixture-of-Experts: From muP to the Maximally Scale-Stable Parameterization","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28743","citing_title":"Rethinking Language Model Scaling under Transferable Hypersphere Optimization","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2010.14701","citing_title":"Scaling Laws for Autoregressive Generative Modeling","ref_index":17,"is_internal_anchor":false},{"citing_arxiv_id":"2605.11255","citing_title":"HEBATRON: A Hebrew-Specialized Open-Weight Mixture-of-Experts Language Model","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"1912.06680","citing_title":"Dota 2 with Large Scale Deep Reinforcement Learning","ref_index":30,"is_internal_anchor":false},{"citing_arxiv_id":"2604.28118","citing_title":"DEFault++: Automated Fault Detection, Categorization, and Diagnosis for Transformer Architectures","ref_index":20,"is_internal_anchor":false},{"citing_arxiv_id":"2505.06708","citing_title":"Gated Attention for Large Language Models: Non-linearity, Sparsity, and Attention-Sink-Free","ref_index":18,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09154","citing_title":"Predicting Large Model Test Losses with a Noisy Quadratic System","ref_index":21,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05683","citing_title":"Spectral Lens: Activation and Gradient Spectra as Diagnostics of LLM Optimization","ref_index":23,"is_internal_anchor":false},{"citing_arxiv_id":"2605.02850","citing_title":"Quantum Tilted Loss in Variational Optimization: Theory and Applications","ref_index":73,"is_internal_anchor":false},{"citing_arxiv_id":"2604.21691","citing_title":"There Will Be a Scientific Theory of Deep Learning","ref_index":84,"is_internal_anchor":false},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":124,"is_internal_anchor":false},{"citing_arxiv_id":"2604.21215","citing_title":"The Recurrent Transformer: Greater Effective Depth and Efficient Decoding","ref_index":66,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF","json":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF.json","graph_json":"https://pith.science/api/pith-number/TQYZNWM7LTX7NZE7BQU2WBYOZF/graph.json","events_json":"https://pith.science/api/pith-number/TQYZNWM7LTX7NZE7BQU2WBYOZF/events.json","paper":"https://pith.science/paper/TQYZNWM7"},"agent_actions":{"view_html":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF","download_json":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF.json","view_paper":"https://pith.science/paper/TQYZNWM7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1812.06162&json=true","fetch_graph":"https://pith.science/api/pith-number/TQYZNWM7LTX7NZE7BQU2WBYOZF/graph.json","fetch_events":"https://pith.science/api/pith-number/TQYZNWM7LTX7NZE7BQU2WBYOZF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF/action/storage_attestation","attest_author":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF/action/author_attestation","sign_citation":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF/action/citation_signature","submit_replication":"https://pith.science/pith/TQYZNWM7LTX7NZE7BQU2WBYOZF/action/replication_record"}},"created_at":"2026-05-17T23:58:12.990138+00:00","updated_at":"2026-05-17T23:58:12.990138+00:00"}