{"work":{"id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","openalex_id":null,"doi":null,"arxiv_id":"2001.08361","raw_key":null,"title":"Scaling Laws for Neural Language Models","authors":null,"authors_text":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei","year":2020,"venue":"cs.LG","abstract":"We study empirical scaling laws for language model performance on the cross-entropy loss. The loss scales as a power-law with model size, dataset size, and the amount of compute used for training, with some trends spanning more than seven orders of magnitude. Other architectural details such as network width or depth have minimal effects within a wide range. Simple equations govern the dependence of overfitting on model/dataset size and the dependence of training speed on model size. These relationships allow us to determine the optimal allocation of a fixed compute budget. Larger models are significantly more sample-efficient, such that optimally compute-efficient training involves training very large models on a relatively modest amount of data and stopping significantly before convergence.","external_url":"https://arxiv.org/abs/2001.08361","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T13:03:26.763687+00:00","pith_arxiv_id":"2001.08361","created_at":"2026-05-08T17:13:38.652548+00:00","updated_at":"2026-06-29T13:03:26.763687+00:00","title_quality_ok":true,"display_title":"Scaling Laws for Neural Language Models","render_title":"Scaling Laws for Neural Language Models"},"hub":{"state":{"work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":652,"external_cited_by_count":null,"distinct_field_count":49,"first_pith_cited_at":"2020-06-30T10:42:02+00:00","last_pith_cited_at":"2026-06-24T21:26:43+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T13:08:47.205466+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":119},{"context_role":"method","n":6},{"context_role":"dataset","n":3},{"context_role":"baseline","n":2},{"context_role":"other","n":2}],"polarity_counts":[{"context_polarity":"background","n":110},{"context_polarity":"unclear","n":8},{"context_polarity":"use_method","n":6},{"context_polarity":"support","n":3},{"context_polarity":"use_dataset","n":3},{"context_polarity":"baseline","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Scaling Laws for Neural Language Models","claims":[{"claim_text":"We study empirical scaling laws for language model performance on the cross-entropy loss. The loss scales as a power-law with model size, dataset size, and the amount of compute used for training, with some trends spanning more than seven orders of magnitude. Other architectural details such as network width or depth have minimal effects within a wide range. Simple equations govern the dependence of overfitting on model/dataset size and the dependence of training speed on model size. These relationships allow us to determine the optimal allocation of a fixed compute budget. Larger models are s","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Scaling Laws for Neural Language Models because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:53:29.432125+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"419c50dd-141f-4361-af4e-2e0acde44908","orcid":null,"display_name":"Jared Kaplan"},{"id":"ea96e77c-ce76-4cff-9136-183fe07296f5","orcid":null,"display_name":"Sam McCandlish"},{"id":"5a8540f5-b157-4b1c-a575-3af16bd1b586","orcid":null,"display_name":"Tom Henighan"},{"id":"2702dd0f-1a5a-4597-aaeb-0fc79f30b6af","orcid":null,"display_name":"Tom B Brown"},{"id":"c3c478a4-9607-46f3-96f9-8574fddda354","orcid":null,"display_name":"Benjamin Chess"},{"id":"66e72c4e-d38b-4fd4-905a-8e63558820e2","orcid":null,"display_name":"Rewon Child"}]},"error":null,"updated_at":"2026-05-13T18:53:29.430090+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T18:53:28.682259+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":94},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":53},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":49},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":44},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":44},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":42},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":39},{"title":"On the Opportunities and Risks of Foundation Models","work_id":"a18039e9-928d-47c9-a836-32656a71bf71","shared_citers":30},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":30},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":29},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":28},{"title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","work_id":"c888e6d1-0b1d-43d6-9ef5-f0912a0efa1b","shared_citers":28},{"title":"Deep Learning Scaling is Predictable, Empirically","work_id":"3638ccb4-3a4f-460e-8b6f-867a65922801","shared_citers":27},{"title":"Scaling Language Models: Methods, Analysis & Insights from Training Gopher","work_id":"47ce8be9-e500-407d-af41-ac2d132215eb","shared_citers":27},{"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","shared_citers":25},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":24},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":24},{"title":"PaLM: Scaling Language Modeling with Pathways","work_id":"a94f3ef7-2c49-4445-93fe-6ec16aafd966","shared_citers":23},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":23},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":21},{"title":"Scaling Laws for Autoregressive Generative Modeling","work_id":"1f180c21-02d6-4b11-9dfc-08d7f0d8fc81","shared_citers":21},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":20},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":20},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":20}],"time_series":[{"n":3,"year":2020},{"n":3,"year":2021},{"n":17,"year":2022},{"n":10,"year":2023},{"n":10,"year":2024},{"n":8,"year":2025},{"n":243,"year":2026}]},"error":null,"updated_at":"2026-05-13T18:53:28.840938+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T18:53:28.022258+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Scaling Laws for Neural Language Models","claims":[{"claim_text":"We study empirical scaling laws for language model performance on the cross-entropy loss. The loss scales as a power-law with model size, dataset size, and the amount of compute used for training, with some trends spanning more than seven orders of magnitude. Other architectural details such as network width or depth have minimal effects within a wide range. Simple equations govern the dependence of overfitting on model/dataset size and the dependence of training speed on model size. These relationships allow us to determine the optimal allocation of a fixed compute budget. Larger models are s","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Scaling Laws for Neural Language Models because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:53:28.687236+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Scaling Laws for Neural Language Models","claims":[{"claim_text":"We study empirical scaling laws for language model performance on the cross-entropy loss. The loss scales as a power-law with model size, dataset size, and the amount of compute used for training, with some trends spanning more than seven orders of magnitude. Other architectural details such as network width or depth have minimal effects within a wide range. Simple equations govern the dependence of overfitting on model/dataset size and the dependence of training speed on model size. These relationships allow us to determine the optimal allocation of a fixed compute budget. Larger models are s","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Scaling Laws for Neural Language Models because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:53:28.685470+00:00"}},"summary":{"title":"Scaling Laws for Neural Language Models","claims":[{"claim_text":"We study empirical scaling laws for language model performance on the cross-entropy loss. The loss scales as a power-law with model size, dataset size, and the amount of compute used for training, with some trends spanning more than seven orders of magnitude. Other architectural details such as network width or depth have minimal effects within a wide range. Simple equations govern the dependence of overfitting on model/dataset size and the dependence of training speed on model size. These relationships allow us to determine the optimal allocation of a fixed compute budget. Larger models are s","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Scaling Laws for Neural Language Models because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":94},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":53},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":49},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":44},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":44},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":42},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":39},{"title":"On the Opportunities and Risks of Foundation Models","work_id":"a18039e9-928d-47c9-a836-32656a71bf71","shared_citers":30},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":30},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":29},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":28},{"title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","work_id":"c888e6d1-0b1d-43d6-9ef5-f0912a0efa1b","shared_citers":28},{"title":"Deep Learning Scaling is Predictable, Empirically","work_id":"3638ccb4-3a4f-460e-8b6f-867a65922801","shared_citers":27},{"title":"Scaling Language Models: Methods, Analysis & Insights from Training Gopher","work_id":"47ce8be9-e500-407d-af41-ac2d132215eb","shared_citers":27},{"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","shared_citers":25},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":24},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":24},{"title":"PaLM: Scaling Language Modeling with Pathways","work_id":"a94f3ef7-2c49-4445-93fe-6ec16aafd966","shared_citers":23},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":23},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":21},{"title":"Scaling Laws for Autoregressive Generative Modeling","work_id":"1f180c21-02d6-4b11-9dfc-08d7f0d8fc81","shared_citers":21},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":20},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":20},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":20}],"time_series":[{"n":3,"year":2020},{"n":3,"year":2021},{"n":17,"year":2022},{"n":10,"year":2023},{"n":10,"year":2024},{"n":8,"year":2025},{"n":243,"year":2026}]},"authors":[{"id":"c3c478a4-9607-46f3-96f9-8574fddda354","orcid":null,"display_name":"Benjamin Chess","source":"manual","import_confidence":0.72},{"id":"419c50dd-141f-4361-af4e-2e0acde44908","orcid":null,"display_name":"Jared Kaplan","source":"manual","import_confidence":0.72},{"id":"66e72c4e-d38b-4fd4-905a-8e63558820e2","orcid":null,"display_name":"Rewon Child","source":"manual","import_confidence":0.72},{"id":"ea96e77c-ce76-4cff-9136-183fe07296f5","orcid":null,"display_name":"Sam McCandlish","source":"manual","import_confidence":0.72},{"id":"2702dd0f-1a5a-4597-aaeb-0fc79f30b6af","orcid":null,"display_name":"Tom B Brown","source":"manual","import_confidence":0.72},{"id":"5a8540f5-b157-4b1c-a575-3af16bd1b586","orcid":null,"display_name":"Tom Henighan","source":"manual","import_confidence":0.72}]}}