{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2021:KGI2ZLF7ZMMXRYLBTOH6PQCWUT","short_pith_number":"pith:KGI2ZLF7","schema_version":"1.0","canonical_sha256":"5191acacbfcb1978e1619b8fe7c056a4ed7163c761dcabc736c69c01e6781c98","source":{"kind":"arxiv","id":"2107.06499","version":2},"attestation_state":"computed","paper":{"title":"Deduplicating Training Data Makes Language Models Better","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Andrew Nystrom, Chiyuan Zhang, Chris Callison-Burch, Daphne Ippolito, Douglas Eck, Katherine Lee, Nicholas Carlini","submitted_at":"2021-07-14T06:06:52Z","abstract_excerpt":"We find that existing language modeling datasets contain many near-duplicate examples and long repetitive substrings. As a result, over 1% of the unprompted output of language models trained on these datasets is copied verbatim from the training data. We develop two tools that allow us to deduplicate training datasets -- for example removing from C4 a single 61 word English sentence that is repeated over 60,000 times. Deduplication allows us to train models that emit memorized text ten times less frequently and require fewer train steps to achieve the same or better accuracy. We can also reduc"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2107.06499","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-07-14T06:06:52Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"2ee397c3ba5a6e5d7aadc17436cabb1d4899b7de2fe0ecb890cd0bf0ea793cda","abstract_canon_sha256":"7af11173ac89854276468641ec4e6cf0cad4ed17e8d224c3edae859a8bf1dec4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.002819Z","signature_b64":"EuKgcA0fxqweVP/PX/MEBjkF0hjPiDJ//GjiYMeuPj11tKyTMvO0mFPmc5whzFtUvOpqg1rRW48MRWp7g7sHDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5191acacbfcb1978e1619b8fe7c056a4ed7163c761dcabc736c69c01e6781c98","last_reissued_at":"2026-05-17T23:38:14.002097Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.002097Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Deduplicating Training Data Makes Language Models Better","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Andrew Nystrom, Chiyuan Zhang, Chris Callison-Burch, Daphne Ippolito, Douglas Eck, Katherine Lee, Nicholas Carlini","submitted_at":"2021-07-14T06:06:52Z","abstract_excerpt":"We find that existing language modeling datasets contain many near-duplicate examples and long repetitive substrings. As a result, over 1% of the unprompted output of language models trained on these datasets is copied verbatim from the training data. We develop two tools that allow us to deduplicate training datasets -- for example removing from C4 a single 61 word English sentence that is repeated over 60,000 times. Deduplication allows us to train models that emit memorized text ten times less frequently and require fewer train steps to achieve the same or better accuracy. We can also reduc"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2107.06499","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2107.06499","created_at":"2026-05-17T23:38:14.002207+00:00"},{"alias_kind":"arxiv_version","alias_value":"2107.06499v2","created_at":"2026-05-17T23:38:14.002207+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2107.06499","created_at":"2026-05-17T23:38:14.002207+00:00"},{"alias_kind":"pith_short_12","alias_value":"KGI2ZLF7ZMMX","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"KGI2ZLF7ZMMXRYLB","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"KGI2ZLF7","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":25,"internal_anchor_count":8,"sample":[{"citing_arxiv_id":"2509.07177","citing_title":"Towards EnergyGPT: A Large Language Model Specialized for the Energy Sector","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2510.03247","citing_title":"Towards Multimodal Active Learning: Efficient Learning with Limited Paired Data","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2305.15717","citing_title":"The False Promise of Imitating Proprietary LLMs","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2303.09540","citing_title":"SemDeDup: Data-efficient learning at web-scale through semantic deduplication","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2305.16264","citing_title":"Scaling Data-Constrained Language Models","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2308.05374","citing_title":"Trustworthy LLMs: a Survey and Guideline for Evaluating Large Language Models' Alignment","ref_index":193,"is_internal_anchor":true},{"citing_arxiv_id":"2205.10487","citing_title":"Scaling Laws and Interpretability of Learning from Repeated Data","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"2406.10162","citing_title":"Sycophancy to Subterfuge: Investigating Reward-Tampering in Large Language Models","ref_index":215,"is_internal_anchor":true},{"citing_arxiv_id":"2112.04426","citing_title":"Improving language models by retrieving from trillions of tokens","ref_index":37,"is_internal_anchor":false},{"citing_arxiv_id":"2304.01373","citing_title":"Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling","ref_index":227,"is_internal_anchor":false},{"citing_arxiv_id":"2110.08207","citing_title":"Multitask Prompted Training Enables Zero-Shot Task Generalization","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2202.07646","citing_title":"Quantifying Memorization Across Neural Language Models","ref_index":14,"is_internal_anchor":false},{"citing_arxiv_id":"2505.13211","citing_title":"MAGI-1: Autoregressive Video Generation at Scale","ref_index":24,"is_internal_anchor":false},{"citing_arxiv_id":"2101.03961","citing_title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity","ref_index":19,"is_internal_anchor":false},{"citing_arxiv_id":"2305.10403","citing_title":"PaLM 2 Technical Report","ref_index":165,"is_internal_anchor":false},{"citing_arxiv_id":"2403.03206","citing_title":"Scaling Rectified Flow Transformers for High-Resolution Image Synthesis","ref_index":148,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09990","citing_title":"Merlin: Deterministic Byte-Exact Deduplication for Lossless Context Optimization in Large Language Model Inference","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09611","citing_title":"Byte-Exact Deduplication in Retrieval-Augmented Generation: A Three-Regime Empirical Analysis Across Public Benchmarks","ref_index":9,"is_internal_anchor":false},{"citing_arxiv_id":"2604.23471","citing_title":"Can Humans Detect AI? Mining Textual Signals of AI-Assisted Writing Under Varying Scrutiny Conditions","ref_index":12,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05267","citing_title":"Bridging Generation and Training: A Systematic Review of Quality Issues in LLMs for Code","ref_index":59,"is_internal_anchor":false},{"citing_arxiv_id":"2206.07682","citing_title":"Emergent Abilities of Large Language Models","ref_index":47,"is_internal_anchor":false},{"citing_arxiv_id":"2401.02954","citing_title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","ref_index":153,"is_internal_anchor":false},{"citing_arxiv_id":"2405.04434","citing_title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","ref_index":151,"is_internal_anchor":false},{"citing_arxiv_id":"2204.02311","citing_title":"PaLM: Scaling Language Modeling with Pathways","ref_index":82,"is_internal_anchor":false},{"citing_arxiv_id":"2305.06161","citing_title":"StarCoder: may the source be with you!","ref_index":156,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT","json":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT.json","graph_json":"https://pith.science/api/pith-number/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/graph.json","events_json":"https://pith.science/api/pith-number/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/events.json","paper":"https://pith.science/paper/KGI2ZLF7"},"agent_actions":{"view_html":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT","download_json":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT.json","view_paper":"https://pith.science/paper/KGI2ZLF7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2107.06499&json=true","fetch_graph":"https://pith.science/api/pith-number/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/graph.json","fetch_events":"https://pith.science/api/pith-number/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/action/storage_attestation","attest_author":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/action/author_attestation","sign_citation":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/action/citation_signature","submit_replication":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/action/replication_record"}},"created_at":"2026-05-17T23:38:14.002207+00:00","updated_at":"2026-05-17T23:38:14.002207+00:00"}