{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2021:KGI2ZLF7ZMMXRYLBTOH6PQCWUT","short_pith_number":"pith:KGI2ZLF7","canonical_record":{"source":{"id":"2107.06499","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-07-14T06:06:52Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"2ee397c3ba5a6e5d7aadc17436cabb1d4899b7de2fe0ecb890cd0bf0ea793cda","abstract_canon_sha256":"7af11173ac89854276468641ec4e6cf0cad4ed17e8d224c3edae859a8bf1dec4"},"schema_version":"1.0"},"canonical_sha256":"5191acacbfcb1978e1619b8fe7c056a4ed7163c761dcabc736c69c01e6781c98","source":{"kind":"arxiv","id":"2107.06499","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2107.06499","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2107.06499v2","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2107.06499","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"KGI2ZLF7ZMMX","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"KGI2ZLF7ZMMXRYLB","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"KGI2ZLF7","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2021:KGI2ZLF7ZMMXRYLBTOH6PQCWUT","target":"record","payload":{"canonical_record":{"source":{"id":"2107.06499","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-07-14T06:06:52Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"2ee397c3ba5a6e5d7aadc17436cabb1d4899b7de2fe0ecb890cd0bf0ea793cda","abstract_canon_sha256":"7af11173ac89854276468641ec4e6cf0cad4ed17e8d224c3edae859a8bf1dec4"},"schema_version":"1.0"},"canonical_sha256":"5191acacbfcb1978e1619b8fe7c056a4ed7163c761dcabc736c69c01e6781c98","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.002819Z","signature_b64":"EuKgcA0fxqweVP/PX/MEBjkF0hjPiDJ//GjiYMeuPj11tKyTMvO0mFPmc5whzFtUvOpqg1rRW48MRWp7g7sHDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5191acacbfcb1978e1619b8fe7c056a4ed7163c761dcabc736c69c01e6781c98","last_reissued_at":"2026-05-17T23:38:14.002097Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.002097Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2107.06499","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"g7mD+nhmxQ8DybJ3IDJnlNADi1hfIQ2eWpP6FsTHbpKLTlVE9IfFCvyhz8q9uF6t+i8An6VwwJqMEnkbYMxnBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T22:17:45.587177Z"},"content_sha256":"0187c9e4271b6bbee7ceefac6288154473a27014d04fd7138a4c78802abb9665","schema_version":"1.0","event_id":"sha256:0187c9e4271b6bbee7ceefac6288154473a27014d04fd7138a4c78802abb9665"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2021:KGI2ZLF7ZMMXRYLBTOH6PQCWUT","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Deduplicating Training Data Makes Language Models Better","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Andrew Nystrom, Chiyuan Zhang, Chris Callison-Burch, Daphne Ippolito, Douglas Eck, Katherine Lee, Nicholas Carlini","submitted_at":"2021-07-14T06:06:52Z","abstract_excerpt":"We find that existing language modeling datasets contain many near-duplicate examples and long repetitive substrings. As a result, over 1% of the unprompted output of language models trained on these datasets is copied verbatim from the training data. We develop two tools that allow us to deduplicate training datasets -- for example removing from C4 a single 61 word English sentence that is repeated over 60,000 times. Deduplication allows us to train models that emit memorized text ten times less frequently and require fewer train steps to achieve the same or better accuracy. We can also reduc"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2107.06499","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"v78fbs6t3o5qyOtT1lRzoHnXGJ6x2Cs21xbcTScc2ObzwDw9BmdCDHPK71EZ61B/+Q6y3cVVTh8VnqUhI8uGCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T22:17:45.587903Z"},"content_sha256":"f973db280781b0e285460ce4d1443713de6e3cb00fa66eee1f1e38996e6e83f6","schema_version":"1.0","event_id":"sha256:f973db280781b0e285460ce4d1443713de6e3cb00fa66eee1f1e38996e6e83f6"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/bundle.json","state_url":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T22:17:45Z","links":{"resolver":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT","bundle":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/bundle.json","state":"https://pith.science/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/state.json","well_known_bundle":"https://pith.science/.well-known/pith/KGI2ZLF7ZMMXRYLBTOH6PQCWUT/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2021:KGI2ZLF7ZMMXRYLBTOH6PQCWUT","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7af11173ac89854276468641ec4e6cf0cad4ed17e8d224c3edae859a8bf1dec4","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-07-14T06:06:52Z","title_canon_sha256":"2ee397c3ba5a6e5d7aadc17436cabb1d4899b7de2fe0ecb890cd0bf0ea793cda"},"schema_version":"1.0","source":{"id":"2107.06499","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2107.06499","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2107.06499v2","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2107.06499","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"KGI2ZLF7ZMMX","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"KGI2ZLF7ZMMXRYLB","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"KGI2ZLF7","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:f973db280781b0e285460ce4d1443713de6e3cb00fa66eee1f1e38996e6e83f6","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We find that existing language modeling datasets contain many near-duplicate examples and long repetitive substrings. As a result, over 1% of the unprompted output of language models trained on these datasets is copied verbatim from the training data. We develop two tools that allow us to deduplicate training datasets -- for example removing from C4 a single 61 word English sentence that is repeated over 60,000 times. Deduplication allows us to train models that emit memorized text ten times less frequently and require fewer train steps to achieve the same or better accuracy. We can also reduc","authors_text":"Andrew Nystrom, Chiyuan Zhang, Chris Callison-Burch, Daphne Ippolito, Douglas Eck, Katherine Lee, Nicholas Carlini","cross_cats":["cs.LG"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-07-14T06:06:52Z","title":"Deduplicating Training Data Makes Language Models Better"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2107.06499","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0187c9e4271b6bbee7ceefac6288154473a27014d04fd7138a4c78802abb9665","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7af11173ac89854276468641ec4e6cf0cad4ed17e8d224c3edae859a8bf1dec4","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-07-14T06:06:52Z","title_canon_sha256":"2ee397c3ba5a6e5d7aadc17436cabb1d4899b7de2fe0ecb890cd0bf0ea793cda"},"schema_version":"1.0","source":{"id":"2107.06499","kind":"arxiv","version":2}},"canonical_sha256":"5191acacbfcb1978e1619b8fe7c056a4ed7163c761dcabc736c69c01e6781c98","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5191acacbfcb1978e1619b8fe7c056a4ed7163c761dcabc736c69c01e6781c98","first_computed_at":"2026-05-17T23:38:14.002097Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.002097Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"EuKgcA0fxqweVP/PX/MEBjkF0hjPiDJ//GjiYMeuPj11tKyTMvO0mFPmc5whzFtUvOpqg1rRW48MRWp7g7sHDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.002819Z","signed_message":"canonical_sha256_bytes"},"source_id":"2107.06499","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0187c9e4271b6bbee7ceefac6288154473a27014d04fd7138a4c78802abb9665","sha256:f973db280781b0e285460ce4d1443713de6e3cb00fa66eee1f1e38996e6e83f6"],"state_sha256":"9d56bc8526a7575637f2dbb6375c88ca7456712d76f1e936378171bdda5d1068"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"N83hFSsdlp5DsyVNkK8cWQIgPRZyaIoV2IkA1tmj6dUi1U2YwgTbLfe85y/3NVsJyCCOLFsIZyyFM0HICvxuAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T22:17:45.590050Z","bundle_sha256":"2dada5a73b31d2370b2db55f02d8f894264f28a45ed0d14a7ac4c4ae7632cb5a"}}