{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2013:MFFTLWFCPYMR6NWNQ6HQIIIUWE","short_pith_number":"pith:MFFTLWFC","schema_version":"1.0","canonical_sha256":"614b35d8a27e191f36cd878f042114b1272b38e39f1ce191e6ea3e7c648bddad","source":{"kind":"arxiv","id":"1312.3005","version":3},"attestation_state":"computed","paper":{"title":"One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Ciprian Chelba, Mike Schuster, Phillipp Koehn, Qi Ge, Thorsten Brants, Tomas Mikolov, Tony Robinson","submitted_at":"2013-12-11T00:25:57Z","abstract_excerpt":"We propose a new benchmark corpus to be used for measuring progress in statistical language modeling. With almost one billion words of training data, we hope this benchmark will be useful to quickly evaluate novel language modeling techniques, and to compare their contribution when combined with other advanced techniques. We show performance of several well-known types of language models, with the best results achieved with a recurrent neural network based language model. The baseline unpruned Kneser-Ney 5-gram model achieves perplexity 67.6; a combination of techniques leads to 35% reduction "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1312.3005","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2013-12-11T00:25:57Z","cross_cats_sorted":[],"title_canon_sha256":"0ad3f809d8d20dff16199363f88048a11653dae0365a26989d79ada9d00d5ac8","abstract_canon_sha256":"8407298eb3d9b55fde9f35b94e4bed388483ee0bd6a9a7b114f96ecc436d20a3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:57:17.082076Z","signature_b64":"T/5mqyQDr4pGrd41ulAnF+Q5knJRNk6nkvMlX3iVEVPnKSNyqeI1li9/Vt0fPQ8kSK9hbVmIjX5JOCt0l85MAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"614b35d8a27e191f36cd878f042114b1272b38e39f1ce191e6ea3e7c648bddad","last_reissued_at":"2026-05-18T02:57:17.081548Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:57:17.081548Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Ciprian Chelba, Mike Schuster, Phillipp Koehn, Qi Ge, Thorsten Brants, Tomas Mikolov, Tony Robinson","submitted_at":"2013-12-11T00:25:57Z","abstract_excerpt":"We propose a new benchmark corpus to be used for measuring progress in statistical language modeling. With almost one billion words of training data, we hope this benchmark will be useful to quickly evaluate novel language modeling techniques, and to compare their contribution when combined with other advanced techniques. We show performance of several well-known types of language models, with the best results achieved with a recurrent neural network based language model. The baseline unpruned Kneser-Ney 5-gram model achieves perplexity 67.6; a combination of techniques leads to 35% reduction "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1312.3005","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1312.3005","created_at":"2026-05-18T02:57:17.081625+00:00"},{"alias_kind":"arxiv_version","alias_value":"1312.3005v3","created_at":"2026-05-18T02:57:17.081625+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1312.3005","created_at":"2026-05-18T02:57:17.081625+00:00"},{"alias_kind":"pith_short_12","alias_value":"MFFTLWFCPYMR","created_at":"2026-05-18T12:27:52.871228+00:00"},{"alias_kind":"pith_short_16","alias_value":"MFFTLWFCPYMR6NWN","created_at":"2026-05-18T12:27:52.871228+00:00"},{"alias_kind":"pith_short_8","alias_value":"MFFTLWFC","created_at":"2026-05-18T12:27:52.871228+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":31,"internal_anchor_count":10,"sample":[{"citing_arxiv_id":"2107.06499","citing_title":"Deduplicating Training Data Makes Language Models Better","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22765","citing_title":"Uniform Diffusion Models Revisited: Leave-One-Out Denoiser and Absorbing State Reformulation","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2602.16813","citing_title":"Flow Map Language Models: One-step Language Modeling via Continuous Denoising","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15676","citing_title":"Dynamic Chunking for Diffusion Language Models","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2505.17384","citing_title":"Variational Autoencoding Discrete Diffusion with Enhanced Dimensional Correlations Modeling","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"1911.05507","citing_title":"Compressive Transformers for Long-Range Sequence Modelling","ref_index":122,"is_internal_anchor":true},{"citing_arxiv_id":"2510.03206","citing_title":"Coevolutionary Continuous Discrete Diffusion: Make Your Diffusion Language Model a Latent Reasoner","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":147,"is_internal_anchor":true},{"citing_arxiv_id":"2311.16867","citing_title":"The Falcon Series of Open Language Models","ref_index":257,"is_internal_anchor":true},{"citing_arxiv_id":"2602.16813","citing_title":"Flow Map Language Models: One-step Language Modeling via Continuous Denoising","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2306.01116","citing_title":"The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data, and Web Data Only","ref_index":16,"is_internal_anchor":false},{"citing_arxiv_id":"2604.02718","citing_title":"Generative Frontiers: Why Evaluation Matters for Diffusion Language Models","ref_index":4,"is_internal_anchor":false},{"citing_arxiv_id":"2605.12395","citing_title":"A Comparative Study of Controlled Text Generation Systems Using Level-Playing-Field Evaluation Principles","ref_index":8,"is_internal_anchor":false},{"citing_arxiv_id":"2605.12345","citing_title":"Output Composability of QLoRA PEFT Modules for Plug-and-Play Attribute-Controlled Text Generation","ref_index":33,"is_internal_anchor":false},{"citing_arxiv_id":"1804.07461","citing_title":"GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding","ref_index":6,"is_internal_anchor":false},{"citing_arxiv_id":"2604.26841","citing_title":"Language Diffusion Models are Associative Memories Capable of Retrieving Unseen Data","ref_index":45,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10518","citing_title":"Infinite Mask Diffusion for Few-Step Distillation","ref_index":2,"is_internal_anchor":false},{"citing_arxiv_id":"1712.00409","citing_title":"Deep Learning Scaling is Predictable, Empirically","ref_index":3,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05629","citing_title":"Spherical Flows for Sampling Categorical Data","ref_index":78,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05629","citing_title":"Spherical Flows for Sampling Categorical Data","ref_index":78,"is_internal_anchor":false},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":189,"is_internal_anchor":false},{"citing_arxiv_id":"2604.11748","citing_title":"LangFlow: Continuous Diffusion Rivals Discrete in Language Modeling","ref_index":3,"is_internal_anchor":false},{"citing_arxiv_id":"2604.10627","citing_title":"Computational Lesions in Multilingual Language Models Separate Shared and Language-specific Brain Alignment","ref_index":40,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06982","citing_title":"FastOmniTMAE: Parallel Clause Learning for Scalable and Hardware-Efficient Tsetlin Embeddings","ref_index":7,"is_internal_anchor":false},{"citing_arxiv_id":"1609.07843","citing_title":"Pointer Sentinel Mixture Models","ref_index":3,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE","json":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE.json","graph_json":"https://pith.science/api/pith-number/MFFTLWFCPYMR6NWNQ6HQIIIUWE/graph.json","events_json":"https://pith.science/api/pith-number/MFFTLWFCPYMR6NWNQ6HQIIIUWE/events.json","paper":"https://pith.science/paper/MFFTLWFC"},"agent_actions":{"view_html":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE","download_json":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE.json","view_paper":"https://pith.science/paper/MFFTLWFC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1312.3005&json=true","fetch_graph":"https://pith.science/api/pith-number/MFFTLWFCPYMR6NWNQ6HQIIIUWE/graph.json","fetch_events":"https://pith.science/api/pith-number/MFFTLWFCPYMR6NWNQ6HQIIIUWE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE/action/storage_attestation","attest_author":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE/action/author_attestation","sign_citation":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE/action/citation_signature","submit_replication":"https://pith.science/pith/MFFTLWFCPYMR6NWNQ6HQIIIUWE/action/replication_record"}},"created_at":"2026-05-18T02:57:17.081625+00:00","updated_at":"2026-05-18T02:57:17.081625+00:00"}