{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2022:DPM7NWRDQBDKWACAS324KYME6L","short_pith_number":"pith:DPM7NWRD","canonical_record":{"source":{"id":"2205.10487","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2022-05-21T02:14:27Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"5a369711a870bc18ae971249f94ed6b0f5346791131e8e2f0ab4be8f4502fb45","abstract_canon_sha256":"1f3ba547302854ee4ff49f5540a368b48db97ee6f792bc5d1b6ce32b750eb0bd"},"schema_version":"1.0"},"canonical_sha256":"1bd9f6da238046ab004096f5c56184f2ee4f9d899bfef8747904d11cde8645ea","source":{"kind":"arxiv","id":"2205.10487","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2205.10487","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2205.10487v1","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2205.10487","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"DPM7NWRDQBDK","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"DPM7NWRDQBDKWACA","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"DPM7NWRD","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2022:DPM7NWRDQBDKWACAS324KYME6L","target":"record","payload":{"canonical_record":{"source":{"id":"2205.10487","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2022-05-21T02:14:27Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"5a369711a870bc18ae971249f94ed6b0f5346791131e8e2f0ab4be8f4502fb45","abstract_canon_sha256":"1f3ba547302854ee4ff49f5540a368b48db97ee6f792bc5d1b6ce32b750eb0bd"},"schema_version":"1.0"},"canonical_sha256":"1bd9f6da238046ab004096f5c56184f2ee4f9d899bfef8747904d11cde8645ea","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.662197Z","signature_b64":"cBUSLCLsWy8w4gwDbafz6c0TfM6eRN+pUZwlkULco+txld4Y2eAKud5ydxzFvO+ivZr+eAn0WSEshl4w4ijCAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1bd9f6da238046ab004096f5c56184f2ee4f9d899bfef8747904d11cde8645ea","last_reissued_at":"2026-05-17T23:38:13.661649Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.661649Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2205.10487","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vUrL21810a/Z2rY9r381WqY5aca3zAzU7IpGB8TKmPlQFlnvqCl3ff38eTNqVUg5Qyz/ZxUmVddtPo398elmAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:39:47.309290Z"},"content_sha256":"e5ab7b2419e68283b2ca4f7cddce2ada51d3240431b845fae4036b78c5700563","schema_version":"1.0","event_id":"sha256:e5ab7b2419e68283b2ca4f7cddce2ada51d3240431b845fae4036b78c5700563"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2022:DPM7NWRDQBDKWACAS324KYME6L","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Scaling Laws and Interpretability of Learning from Repeated Data","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Repeating 0.1% of training data 100 times makes an 800M model perform like a 400M model","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Ben Mann, Catherine Olsson, Chris Olah, Danny Hernandez, Dario Amodei, Dawn Drain, Jared Kaplan, Nelson Elhage, Nicholas Joseph, Nova DasSarma, Sam McCandlish, Scott Johnston, Sheer El-Showk, Tom Brown, Tom Conerly, Tom Henighan, Tristan Hume, Zac Hatfield-Dodds","submitted_at":"2022-05-21T02:14:27Z","abstract_excerpt":"Recent large language models have been trained on vast datasets, but also often on repeated data, either intentionally for the purpose of upweighting higher quality data, or unintentionally because data deduplication is not perfect and the model is exposed to repeated data at the sentence, paragraph, or document level. Some works have reported substantial negative performance effects of this repeated data. In this paper we attempt to study repeated data systematically and to understand its effects mechanistically. To do this, we train a family of models where most of the data is unique but a s"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Performance of an 800M parameter model can be degraded to that of a 2x smaller model (400M params) by repeating 0.1% of the data 100 times, despite the other 90% of the training tokens remaining unique.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the performance degradation is primarily caused by memorization consuming model capacity rather than by changes in optimization dynamics or other unmeasured factors.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Repeating 0.1% of training data 100 times degrades an 800M parameter model's performance to that of a 400M model by damaging copying mechanisms and induction heads associated with generalization.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Repeating 0.1% of training data 100 times makes an 800M model perform like a 400M model","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"bbd9633165d07cc554f987a4d1349e7934118a7f79c592aff31d90bf4e0d4fb9"},"source":{"id":"2205.10487","kind":"arxiv","version":1},"verdict":{"id":"08b4d2b7-b812-4d6b-8311-8af17ff860f0","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T15:44:32.218364Z","strongest_claim":"Performance of an 800M parameter model can be degraded to that of a 2x smaller model (400M params) by repeating 0.1% of the data 100 times, despite the other 90% of the training tokens remaining unique.","one_line_summary":"Repeating 0.1% of training data 100 times degrades an 800M parameter model's performance to that of a 400M model by damaging copying mechanisms and induction heads associated with generalization.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the performance degradation is primarily caused by memorization consuming model capacity rather than by changes in optimization dynamics or other unmeasured factors.","pith_extraction_headline":"Repeating 0.1% of training data 100 times makes an 800M model perform like a 400M model"},"references":{"count":71,"sample":[{"doi":"10.48550/arxiv.2103.00020","year":2021,"title":"Learning Transferable Visual Models From Natural Language Supervision","work_id":"6de86bb5-27bd-4d5c-8b89-967ebfc52659","ref_index":1,"cited_arxiv_id":"2103.00020","is_internal_anchor":true},{"doi":"10.23915/distill.00030","year":null,"title":"Multimodal neurons in artificial neural networks","work_id":"a5431036-9258-4452-954d-965edf6456ef","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"In-context Learning and Induction Heads , year =","work_id":"e25d4ab0-6097-4d74-841c-db89def7a69b","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.48550/arxiv.2203.02155","year":2022,"title":"Training language models to follow instructions with human feedback","work_id":"52aff42f-4fa9-4fcf-bdb3-1459b9bebf65","ref_index":4,"cited_arxiv_id":"2203.02155","is_internal_anchor":true},{"doi":"","year":2001,"title":"A Variational Approach to Learning Curves , url =","work_id":"678d0b26-f77f-4a51-afe7-457123410a55","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":71,"snapshot_sha256":"2b7a18c0f29b5483bd7c0752ff88f6abc042c6ddd43edd3a9b96012bcb387920","internal_anchors":18},"formal_canon":{"evidence_count":1,"snapshot_sha256":"2779946cac165857b6d4bb9b1ed990de343ecaba8be29d223f40f9c0bfc49eb1"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"08b4d2b7-b812-4d6b-8311-8af17ff860f0"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ttf7nwBDFPjAO9BUl7ClIoutRxsNLC+I5Doqp3XFmYfr6aWIUPfjb5ToBRIiLeRuCRypZgsoB4tNPu5oJkmgDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:39:47.310347Z"},"content_sha256":"4cb699c4b821b9bd1116ab5e62c922f023b950ef53a9aa2b8a67a2b4bec19c9e","schema_version":"1.0","event_id":"sha256:4cb699c4b821b9bd1116ab5e62c922f023b950ef53a9aa2b8a67a2b4bec19c9e"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/DPM7NWRDQBDKWACAS324KYME6L/bundle.json","state_url":"https://pith.science/pith/DPM7NWRDQBDKWACAS324KYME6L/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/DPM7NWRDQBDKWACAS324KYME6L/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T20:39:47Z","links":{"resolver":"https://pith.science/pith/DPM7NWRDQBDKWACAS324KYME6L","bundle":"https://pith.science/pith/DPM7NWRDQBDKWACAS324KYME6L/bundle.json","state":"https://pith.science/pith/DPM7NWRDQBDKWACAS324KYME6L/state.json","well_known_bundle":"https://pith.science/.well-known/pith/DPM7NWRDQBDKWACAS324KYME6L/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2022:DPM7NWRDQBDKWACAS324KYME6L","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"1f3ba547302854ee4ff49f5540a368b48db97ee6f792bc5d1b6ce32b750eb0bd","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2022-05-21T02:14:27Z","title_canon_sha256":"5a369711a870bc18ae971249f94ed6b0f5346791131e8e2f0ab4be8f4502fb45"},"schema_version":"1.0","source":{"id":"2205.10487","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2205.10487","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2205.10487v1","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2205.10487","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"DPM7NWRDQBDK","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"DPM7NWRDQBDKWACA","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"DPM7NWRD","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:4cb699c4b821b9bd1116ab5e62c922f023b950ef53a9aa2b8a67a2b4bec19c9e","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Performance of an 800M parameter model can be degraded to that of a 2x smaller model (400M params) by repeating 0.1% of the data 100 times, despite the other 90% of the training tokens remaining unique."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the performance degradation is primarily caused by memorization consuming model capacity rather than by changes in optimization dynamics or other unmeasured factors."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Repeating 0.1% of training data 100 times degrades an 800M parameter model's performance to that of a 400M model by damaging copying mechanisms and induction heads associated with generalization."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Repeating 0.1% of training data 100 times makes an 800M model perform like a 400M model"}],"snapshot_sha256":"bbd9633165d07cc554f987a4d1349e7934118a7f79c592aff31d90bf4e0d4fb9"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"2779946cac165857b6d4bb9b1ed990de343ecaba8be29d223f40f9c0bfc49eb1"},"paper":{"abstract_excerpt":"Recent large language models have been trained on vast datasets, but also often on repeated data, either intentionally for the purpose of upweighting higher quality data, or unintentionally because data deduplication is not perfect and the model is exposed to repeated data at the sentence, paragraph, or document level. Some works have reported substantial negative performance effects of this repeated data. In this paper we attempt to study repeated data systematically and to understand its effects mechanistically. To do this, we train a family of models where most of the data is unique but a s","authors_text":"Ben Mann, Catherine Olsson, Chris Olah, Danny Hernandez, Dario Amodei, Dawn Drain, Jared Kaplan, Nelson Elhage, Nicholas Joseph, Nova DasSarma, Sam McCandlish, Scott Johnston, Sheer El-Showk, Tom Brown, Tom Conerly, Tom Henighan, Tristan Hume, Zac Hatfield-Dodds","cross_cats":["cs.AI"],"headline":"Repeating 0.1% of training data 100 times makes an 800M model perform like a 400M model","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2022-05-21T02:14:27Z","title":"Scaling Laws and Interpretability of Learning from Repeated Data"},"references":{"count":71,"internal_anchors":18,"resolved_work":71,"sample":[{"cited_arxiv_id":"2103.00020","doi":"10.48550/arxiv.2103.00020","is_internal_anchor":true,"ref_index":1,"title":"Learning Transferable Visual Models From Natural Language Supervision","work_id":"6de86bb5-27bd-4d5c-8b89-967ebfc52659","year":2021},{"cited_arxiv_id":"","doi":"10.23915/distill.00030","is_internal_anchor":false,"ref_index":2,"title":"Multimodal neurons in artificial neural networks","work_id":"a5431036-9258-4452-954d-965edf6456ef","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"In-context Learning and Induction Heads , year =","work_id":"e25d4ab0-6097-4d74-841c-db89def7a69b","year":null},{"cited_arxiv_id":"2203.02155","doi":"10.48550/arxiv.2203.02155","is_internal_anchor":true,"ref_index":4,"title":"Training language models to follow instructions with human feedback","work_id":"52aff42f-4fa9-4fcf-bdb3-1459b9bebf65","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"A Variational Approach to Learning Curves , url =","work_id":"678d0b26-f77f-4a51-afe7-457123410a55","year":2001}],"snapshot_sha256":"2b7a18c0f29b5483bd7c0752ff88f6abc042c6ddd43edd3a9b96012bcb387920"},"source":{"id":"2205.10487","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T15:44:32.218364Z","id":"08b4d2b7-b812-4d6b-8311-8af17ff860f0","model_set":{"reader":"grok-4.3"},"one_line_summary":"Repeating 0.1% of training data 100 times degrades an 800M parameter model's performance to that of a 400M model by damaging copying mechanisms and induction heads associated with generalization.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Repeating 0.1% of training data 100 times makes an 800M model perform like a 400M model","strongest_claim":"Performance of an 800M parameter model can be degraded to that of a 2x smaller model (400M params) by repeating 0.1% of the data 100 times, despite the other 90% of the training tokens remaining unique.","weakest_assumption":"That the performance degradation is primarily caused by memorization consuming model capacity rather than by changes in optimization dynamics or other unmeasured factors."}},"verdict_id":"08b4d2b7-b812-4d6b-8311-8af17ff860f0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e5ab7b2419e68283b2ca4f7cddce2ada51d3240431b845fae4036b78c5700563","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"1f3ba547302854ee4ff49f5540a368b48db97ee6f792bc5d1b6ce32b750eb0bd","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2022-05-21T02:14:27Z","title_canon_sha256":"5a369711a870bc18ae971249f94ed6b0f5346791131e8e2f0ab4be8f4502fb45"},"schema_version":"1.0","source":{"id":"2205.10487","kind":"arxiv","version":1}},"canonical_sha256":"1bd9f6da238046ab004096f5c56184f2ee4f9d899bfef8747904d11cde8645ea","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"1bd9f6da238046ab004096f5c56184f2ee4f9d899bfef8747904d11cde8645ea","first_computed_at":"2026-05-17T23:38:13.661649Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.661649Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"cBUSLCLsWy8w4gwDbafz6c0TfM6eRN+pUZwlkULco+txld4Y2eAKud5ydxzFvO+ivZr+eAn0WSEshl4w4ijCAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.662197Z","signed_message":"canonical_sha256_bytes"},"source_id":"2205.10487","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:e5ab7b2419e68283b2ca4f7cddce2ada51d3240431b845fae4036b78c5700563","sha256:4cb699c4b821b9bd1116ab5e62c922f023b950ef53a9aa2b8a67a2b4bec19c9e"],"state_sha256":"75ef76bb080758614f43b4196eeb6695e935388871eac2df1d0f5fc0791b998d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"idf0ew4GuOcKPRkRIC6TNd6Lt+VXUzDVmikPpFdrNNNiObNAE7q4Cg4AUvGujuCbJgwhthQkdi4P3JSwP6mWBA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T20:39:47.313230Z","bundle_sha256":"bdce663ee282e48605720dbca266a9df2fb817c15d438553e1139c64d88a3b79"}}