{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:7RK4LJX5VT4FOV77TQ73QHA6IV","short_pith_number":"pith:7RK4LJX5","schema_version":"1.0","canonical_sha256":"fc55c5a6fdacf85757ff9c3fb81c1e4557229d50d9e89b252437e279f2316df7","source":{"kind":"arxiv","id":"2305.16264","version":5},"attestation_state":"computed","paper":{"title":"Scaling Data-Constrained Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Aleksandra Piktus, Alexander M. Rush, Boaz Barak, Colin Raffel, Niklas Muennighoff, Nouamane Tazi, Sampo Pyysalo, Teven Le Scao, Thomas Wolf","submitted_at":"2023-05-25T17:18:55Z","abstract_excerpt":"The current trend of scaling language models involves increasing both parameter count and training dataset size. Extrapolating this trend suggests that training dataset size may soon be limited by the amount of text data available on the internet. Motivated by this limit, we investigate scaling language models in data-constrained regimes. Specifically, we run a large set of experiments varying the extent of data repetition and compute budget, ranging up to 900 billion training tokens and 9 billion parameter models. We find that with constrained data for a fixed compute budget, training with up"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2305.16264","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-05-25T17:18:55Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"4bb79939be3d13191ce191db5fb88d405000b0881c6ad6a355846eb2d79ffb43","abstract_canon_sha256":"484651e73b99755c79f90a81d937e61fb2b9323ef71c3ce8becfb0fa5d01f3b2"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:29:36.264079Z","signature_b64":"OMz/P2AyCXPpBX6Jhuj3cddL26mks8qXSIhfIuAvzI7rwjhIvJEQWGn4RmEgCXokxoR2PwTU/Z26NPTG23IBDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fc55c5a6fdacf85757ff9c3fb81c1e4557229d50d9e89b252437e279f2316df7","last_reissued_at":"2026-05-18T01:29:36.263475Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:29:36.263475Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Scaling Data-Constrained Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Aleksandra Piktus, Alexander M. Rush, Boaz Barak, Colin Raffel, Niklas Muennighoff, Nouamane Tazi, Sampo Pyysalo, Teven Le Scao, Thomas Wolf","submitted_at":"2023-05-25T17:18:55Z","abstract_excerpt":"The current trend of scaling language models involves increasing both parameter count and training dataset size. Extrapolating this trend suggests that training dataset size may soon be limited by the amount of text data available on the internet. Motivated by this limit, we investigate scaling language models in data-constrained regimes. Specifically, we run a large set of experiments varying the extent of data repetition and compute budget, ranging up to 900 billion training tokens and 9 billion parameter models. We find that with constrained data for a fixed compute budget, training with up"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2305.16264","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2305.16264","created_at":"2026-05-18T01:29:36.263562+00:00"},{"alias_kind":"arxiv_version","alias_value":"2305.16264v5","created_at":"2026-05-18T01:29:36.263562+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.16264","created_at":"2026-05-18T01:29:36.263562+00:00"},{"alias_kind":"pith_short_12","alias_value":"7RK4LJX5VT4F","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"7RK4LJX5VT4FOV77","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"7RK4LJX5","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":18,"internal_anchor_count":18,"sample":[{"citing_arxiv_id":"2510.18900","citing_title":"Foundation Models for Discovery and Exploration in Chemical Space","ref_index":133,"is_internal_anchor":true},{"citing_arxiv_id":"2406.11794","citing_title":"DataComp-LM: In search of the next generation of training sets for language models","ref_index":130,"is_internal_anchor":true},{"citing_arxiv_id":"2402.00838","citing_title":"OLMo: Accelerating the Science of Language Models","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2510.13786","citing_title":"The Art of Scaling Reinforcement Learning Compute for LLMs","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2311.16867","citing_title":"The Falcon Series of Open Language Models","ref_index":139,"is_internal_anchor":true},{"citing_arxiv_id":"2311.17035","citing_title":"Scalable Extraction of Training Data from (Production) Language Models","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13225","citing_title":"Mix, Don't Tune: Bilingual Pre-Training Outperforms Hyperparameter Search in Data-Constrained Settings","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2309.07597","citing_title":"C-Pack: Packed Resources For General Chinese Embeddings","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2305.13048","citing_title":"RWKV: Reinventing RNNs for the Transformer Era","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2306.11644","citing_title":"Textbooks Are All You Need","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19835","citing_title":"Expert Upcycling: Shifting the Compute-Efficient Frontier of Mixture-of-Experts","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2604.20996","citing_title":"AFRILANGTUTOR: Advancing Language Tutoring and Culture Education in Low-Resource Languages with Large Language Models","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17952","citing_title":"Causal inference for social network formation","ref_index":83,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09389","citing_title":"Is More Data Worth the Cost? Dataset Scaling Laws in a Tiny Attention-Only Decoder","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2303.18223","citing_title":"A Survey of Large Language Models","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"2404.14219","citing_title":"Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16979","citing_title":"DOSE: Data Selection for Multi-Modal LLMs via Off-the-Shelf Models","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19835","citing_title":"Expert Upcycling: Shifting the Compute-Efficient Frontier of Mixture-of-Experts","ref_index":38,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV","json":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV.json","graph_json":"https://pith.science/api/pith-number/7RK4LJX5VT4FOV77TQ73QHA6IV/graph.json","events_json":"https://pith.science/api/pith-number/7RK4LJX5VT4FOV77TQ73QHA6IV/events.json","paper":"https://pith.science/paper/7RK4LJX5"},"agent_actions":{"view_html":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV","download_json":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV.json","view_paper":"https://pith.science/paper/7RK4LJX5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2305.16264&json=true","fetch_graph":"https://pith.science/api/pith-number/7RK4LJX5VT4FOV77TQ73QHA6IV/graph.json","fetch_events":"https://pith.science/api/pith-number/7RK4LJX5VT4FOV77TQ73QHA6IV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV/action/storage_attestation","attest_author":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV/action/author_attestation","sign_citation":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV/action/citation_signature","submit_replication":"https://pith.science/pith/7RK4LJX5VT4FOV77TQ73QHA6IV/action/replication_record"}},"created_at":"2026-05-18T01:29:36.263562+00:00","updated_at":"2026-05-18T01:29:36.263562+00:00"}