{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:6RZPVFJ335EWZPPTC5B43ZUSLU","short_pith_number":"pith:6RZPVFJ3","schema_version":"1.0","canonical_sha256":"f472fa953bdf496cbdf31743cde6925d39e469d0e09d01b00a3652fb0f2cf051","source":{"kind":"arxiv","id":"2305.07759","version":2},"attestation_state":"computed","paper":{"title":"TinyStories: How Small Can Language Models Be and Still Speak Coherent English?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Ronen Eldan, Yuanzhi Li","submitted_at":"2023-05-12T20:56:48Z","abstract_excerpt":"Language models (LMs) are powerful tools for natural language processing, but they often struggle to produce coherent and fluent text when they are small. Models with around 125M parameters such as GPT-Neo (small) or GPT-2 (small) can rarely generate coherent and consistent English text beyond a few words even after extensive training. This raises the question of whether the emergence of the ability to produce coherent English text only occurs at larger scales (with hundreds of millions of parameters or more) and complex architectures (with many layers of global attention).\n  In this work, we "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2305.07759","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-05-12T20:56:48Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"ff9c50c66065cea5843ece312490ddab6699209bb3628d5d6cbf5e8bc0eb9aae","abstract_canon_sha256":"020a36f7f774acb6609a785cbd7008c46ff781ebe03212c7e346b808af1e687c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-25T07:32:56.713443Z","signature_b64":"7AZWrSnwwxVSNOkxkqu0vvwnp9YO2XzQ+rwHAJqyIKDlF3LkmfWSqZVXe2OvzPt9hw7QTQmlD0AtCWDDUGmrCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f472fa953bdf496cbdf31743cde6925d39e469d0e09d01b00a3652fb0f2cf051","last_reissued_at":"2026-05-25T07:32:56.710176Z","signature_status":"signed_v1","first_computed_at":"2026-05-25T07:32:56.710176Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"TinyStories: How Small Can Language Models Be and Still Speak Coherent English?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Ronen Eldan, Yuanzhi Li","submitted_at":"2023-05-12T20:56:48Z","abstract_excerpt":"Language models (LMs) are powerful tools for natural language processing, but they often struggle to produce coherent and fluent text when they are small. Models with around 125M parameters such as GPT-Neo (small) or GPT-2 (small) can rarely generate coherent and consistent English text beyond a few words even after extensive training. This raises the question of whether the emergence of the ability to produce coherent English text only occurs at larger scales (with hundreds of millions of parameters or more) and complex architectures (with many layers of global attention).\n  In this work, we "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2305.07759","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2305.07759/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2305.07759","created_at":"2026-05-25T07:32:56.710338+00:00"},{"alias_kind":"arxiv_version","alias_value":"2305.07759v2","created_at":"2026-05-25T07:32:56.710338+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.07759","created_at":"2026-05-25T07:32:56.710338+00:00"},{"alias_kind":"pith_short_12","alias_value":"6RZPVFJ335EW","created_at":"2026-05-25T07:32:56.710338+00:00"},{"alias_kind":"pith_short_16","alias_value":"6RZPVFJ335EWZPPT","created_at":"2026-05-25T07:32:56.710338+00:00"},{"alias_kind":"pith_short_8","alias_value":"6RZPVFJ3","created_at":"2026-05-25T07:32:56.710338+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":24,"internal_anchor_count":24,"sample":[{"citing_arxiv_id":"2511.05963","citing_title":"Next-Latent Prediction Transformers Learn Compact World Models","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2501.05465","citing_title":"Small Language Models (SLMs) Can Still Pack a Punch: A survey (updated 2026)","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2504.20605","citing_title":"TF1-EN-3M: Three Million Synthetic Moral Fables for Training Small, Open Language Models","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2505.24333","citing_title":"Two failure modes of deep transformers and how to avoid them: a unified theory of signal propagation at initialisation","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17064","citing_title":"Towards Human-Level Book-Writing Capability","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2506.15461","citing_title":"All is Not Lost: LLM Recovery without Checkpoints","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2509.26404","citing_title":"SeedPrints: Fingerprints Can Even Tell Which Seed Your Large Language Model Was Trained From","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2510.04008","citing_title":"RACE Attention: A Strictly Linear-Time Attention Layer for Training on Outrageously Large Contexts","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2510.04686","citing_title":"How does the optimizer implicitly bias the model merging loss landscape?","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2601.00860","citing_title":"Path Integral Solution for Dissipative Generative Dynamics","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2601.19208","citing_title":"How Do Transformers Learn to Associate Tokens: Gradient Leading Terms Bring Mechanistic Interpretability","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12529","citing_title":"BackFlush: Knowledge-Free Backdoor Detection and Elimination with Watermark Preservation in Large Language Models","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12522","citing_title":"Differences in Text Generated by Diffusion and Autoregressive Language Models","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2309.05463","citing_title":"Textbooks Are All You Need II: phi-1.5 technical report","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12888","citing_title":"Seed Bank, Co-op, Stoop Swap: Metaphors for Governing Language Model Data for Creative Writing","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2309.12284","citing_title":"MetaMath: Bootstrap Your Own Mathematical Questions for Large Language Models","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2306.11644","citing_title":"Textbooks Are All You Need","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26505","citing_title":"Quantamination: Dynamic Quantization Leaks Your Data Across the Batch","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09749","citing_title":"Primal-Dual Guided Decoding for Constrained Discrete Diffusion","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09189","citing_title":"Practical Scaling Laws: Converting Compute into Performance in a Data-Constrained World","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.12493","citing_title":"Latent Planning Emerges with Scale","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07748","citing_title":"TextLDM: Language Modeling with Continuous Latent Diffusion","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15694","citing_title":"Neural Continuous-Time Markov Chain: Discrete Diffusion via Decoupled Jump Timing and Direction","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15694","citing_title":"Neural Continuous-Time Markov Chain: Discrete Diffusion via Decoupled Jump Timing and Direction","ref_index":2,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU","json":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU.json","graph_json":"https://pith.science/api/pith-number/6RZPVFJ335EWZPPTC5B43ZUSLU/graph.json","events_json":"https://pith.science/api/pith-number/6RZPVFJ335EWZPPTC5B43ZUSLU/events.json","paper":"https://pith.science/paper/6RZPVFJ3"},"agent_actions":{"view_html":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU","download_json":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU.json","view_paper":"https://pith.science/paper/6RZPVFJ3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2305.07759&json=true","fetch_graph":"https://pith.science/api/pith-number/6RZPVFJ335EWZPPTC5B43ZUSLU/graph.json","fetch_events":"https://pith.science/api/pith-number/6RZPVFJ335EWZPPTC5B43ZUSLU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU/action/storage_attestation","attest_author":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU/action/author_attestation","sign_citation":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU/action/citation_signature","submit_replication":"https://pith.science/pith/6RZPVFJ335EWZPPTC5B43ZUSLU/action/replication_record"}},"created_at":"2026-05-25T07:32:56.710338+00:00","updated_at":"2026-05-25T07:32:56.710338+00:00"}