{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2022:X37K2SZTINTS73PEDCULW2JAAQ","short_pith_number":"pith:X37K2SZT","canonical_record":{"source":{"id":"2202.07646","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-02-15T18:48:31Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"b45340bfac6354acfb42d31e8b2975ff2334898c8cb111664e76c5a5e77fe631","abstract_canon_sha256":"b5230ec6f01517894ba4f3fdb0c814e278571fab1daf3260f6f00ccb1142f847"},"schema_version":"1.0"},"canonical_sha256":"befead4b3343672fede418a8bb69200411c9639ef98b50f57d7ebcd975c9de30","source":{"kind":"arxiv","id":"2202.07646","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2202.07646","created_at":"2026-05-18T04:38:57Z"},{"alias_kind":"arxiv_version","alias_value":"2202.07646v3","created_at":"2026-05-18T04:38:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2202.07646","created_at":"2026-05-18T04:38:57Z"},{"alias_kind":"pith_short_12","alias_value":"X37K2SZTINTS","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"X37K2SZTINTS73PE","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"X37K2SZT","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2022:X37K2SZTINTS73PEDCULW2JAAQ","target":"record","payload":{"canonical_record":{"source":{"id":"2202.07646","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-02-15T18:48:31Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"b45340bfac6354acfb42d31e8b2975ff2334898c8cb111664e76c5a5e77fe631","abstract_canon_sha256":"b5230ec6f01517894ba4f3fdb0c814e278571fab1daf3260f6f00ccb1142f847"},"schema_version":"1.0"},"canonical_sha256":"befead4b3343672fede418a8bb69200411c9639ef98b50f57d7ebcd975c9de30","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T04:38:57.963852Z","signature_b64":"ctVAtXcr916BzpCQL8WGyAV+fR5HT0AYcl0vMiiwa8Ax2j/rx2FC+37LWsNX/w3Q5AviQ/PieHjynjgSG6svAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"befead4b3343672fede418a8bb69200411c9639ef98b50f57d7ebcd975c9de30","last_reissued_at":"2026-05-18T04:38:57.963093Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T04:38:57.963093Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2202.07646","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T04:38:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"wkR5Xe8QBZ14Y6aOOnfr4pgPb9W/XeFcBpHGwKB87VwYBS42ZCBnePdnXhBA46uwyu3TKCJnx09gbiD+eCMOAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T07:38:53.768974Z"},"content_sha256":"fafd0df0647a8032935337f5af664e30d8e387aa19d0253fcbf841c84302d815","schema_version":"1.0","event_id":"sha256:fafd0df0647a8032935337f5af664e30d8e387aa19d0253fcbf841c84302d815"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2022:X37K2SZTINTS73PEDCULW2JAAQ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Quantifying Memorization Across Neural Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Memorization in language models increases log-linearly with model size, data duplication, and prompt length.","cross_cats":["cs.CL"],"primary_cat":"cs.LG","authors_text":"Chiyuan Zhang, Daphne Ippolito, Florian Tramer, Katherine Lee, Matthew Jagielski, Nicholas Carlini","submitted_at":"2022-02-15T18:48:31Z","abstract_excerpt":"Large language models (LMs) have been shown to memorize parts of their training data, and when prompted appropriately, they will emit the memorized training data verbatim. This is undesirable because memorization violates privacy (exposing user data), degrades utility (repeated easy-to-memorize text is often low quality), and hurts fairness (some texts are memorized over others).\n  We describe three log-linear relationships that quantify the degree to which LMs emit memorized training data. Memorization significantly grows as we increase (1) the capacity of a model, (2) the number of times an "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We describe three log-linear relationships that quantify the degree to which LMs emit memorized training data. Memorization significantly grows as we increase (1) the capacity of a model, (2) the number of times an example has been duplicated, and (3) the number of tokens of context used to prompt the model.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That verbatim emission under the chosen prompting and matching criteria accurately captures the privacy, utility, and fairness harms, and that the log-linear trends will continue to hold at larger scales without additional confounding factors.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Memorization in language models increases log-linearly with model capacity, data duplication count, and prompt context length.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Memorization in language models increases log-linearly with model size, data duplication, and prompt length.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b3391e8a7b2018e1aefc71d1c567ae5da8efaa7a15ce3e56a2307429380c90e2"},"source":{"id":"2202.07646","kind":"arxiv","version":3},"verdict":{"id":"ff8e1c9c-dfb3-46f7-9ca6-b355cc20cdd3","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-13T22:00:04.860007Z","strongest_claim":"We describe three log-linear relationships that quantify the degree to which LMs emit memorized training data. Memorization significantly grows as we increase (1) the capacity of a model, (2) the number of times an example has been duplicated, and (3) the number of tokens of context used to prompt the model.","one_line_summary":"Memorization in language models increases log-linearly with model capacity, data duplication count, and prompt context length.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That verbatim emission under the chosen prompting and matching criteria accurately captures the privacy, utility, and fairness harms, and that the log-linear trends will continue to hold at larger scales without additional confounding factors.","pith_extraction_headline":"Memorization in language models increases log-linearly with model size, data duplication, and prompt length."},"references":{"count":25,"sample":[{"doi":"","year":2016,"title":"Deep learning with differential privacy","work_id":"c00396ae-c1af-430f-9810-bb5aba9fa3ab","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Large-scale differen- tially private BERT","work_id":"42d46873-b36f-49fe-bd2b-e2fee8d02f56","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.5281/zenodo.5297715","year":null,"title":"GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow , March 2021","work_id":"6c7f8a44-6f52-448c-b819-5ba82a7bbc59","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2012,"title":"Extracting training data from large language models","work_id":"25b490c3-532b-40ec-b8ce-05fef84201d4","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","ref_index":5,"cited_arxiv_id":"2107.03374","is_internal_anchor":true}],"resolved_work":25,"snapshot_sha256":"aa521d1fe9ae2871c69a3d7b4087d48b4af30f6193bcad5d56055bfced409441","internal_anchors":4},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"ff8e1c9c-dfb3-46f7-9ca6-b355cc20cdd3"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T04:38:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"awVnw0ZX3CpfUjpSpvXGpc9Bve2HWGU3RZ9e11fggL6VBD/GNwY08P/Cp0TZlJfugQ6ikMfvzh3D6guXEOvcBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T07:38:53.769650Z"},"content_sha256":"afac010eeb4a6c77f21237c8de130b2147d8ba7286c097612f0d370921bf797b","schema_version":"1.0","event_id":"sha256:afac010eeb4a6c77f21237c8de130b2147d8ba7286c097612f0d370921bf797b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/X37K2SZTINTS73PEDCULW2JAAQ/bundle.json","state_url":"https://pith.science/pith/X37K2SZTINTS73PEDCULW2JAAQ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/X37K2SZTINTS73PEDCULW2JAAQ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T07:38:53Z","links":{"resolver":"https://pith.science/pith/X37K2SZTINTS73PEDCULW2JAAQ","bundle":"https://pith.science/pith/X37K2SZTINTS73PEDCULW2JAAQ/bundle.json","state":"https://pith.science/pith/X37K2SZTINTS73PEDCULW2JAAQ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/X37K2SZTINTS73PEDCULW2JAAQ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2022:X37K2SZTINTS73PEDCULW2JAAQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b5230ec6f01517894ba4f3fdb0c814e278571fab1daf3260f6f00ccb1142f847","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-02-15T18:48:31Z","title_canon_sha256":"b45340bfac6354acfb42d31e8b2975ff2334898c8cb111664e76c5a5e77fe631"},"schema_version":"1.0","source":{"id":"2202.07646","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2202.07646","created_at":"2026-05-18T04:38:57Z"},{"alias_kind":"arxiv_version","alias_value":"2202.07646v3","created_at":"2026-05-18T04:38:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2202.07646","created_at":"2026-05-18T04:38:57Z"},{"alias_kind":"pith_short_12","alias_value":"X37K2SZTINTS","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"X37K2SZTINTS73PE","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"X37K2SZT","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:afac010eeb4a6c77f21237c8de130b2147d8ba7286c097612f0d370921bf797b","target":"graph","created_at":"2026-05-18T04:38:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We describe three log-linear relationships that quantify the degree to which LMs emit memorized training data. Memorization significantly grows as we increase (1) the capacity of a model, (2) the number of times an example has been duplicated, and (3) the number of tokens of context used to prompt the model."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That verbatim emission under the chosen prompting and matching criteria accurately captures the privacy, utility, and fairness harms, and that the log-linear trends will continue to hold at larger scales without additional confounding factors."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Memorization in language models increases log-linearly with model capacity, data duplication count, and prompt context length."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Memorization in language models increases log-linearly with model size, data duplication, and prompt length."}],"snapshot_sha256":"b3391e8a7b2018e1aefc71d1c567ae5da8efaa7a15ce3e56a2307429380c90e2"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Large language models (LMs) have been shown to memorize parts of their training data, and when prompted appropriately, they will emit the memorized training data verbatim. This is undesirable because memorization violates privacy (exposing user data), degrades utility (repeated easy-to-memorize text is often low quality), and hurts fairness (some texts are memorized over others).\n  We describe three log-linear relationships that quantify the degree to which LMs emit memorized training data. Memorization significantly grows as we increase (1) the capacity of a model, (2) the number of times an ","authors_text":"Chiyuan Zhang, Daphne Ippolito, Florian Tramer, Katherine Lee, Matthew Jagielski, Nicholas Carlini","cross_cats":["cs.CL"],"headline":"Memorization in language models increases log-linearly with model size, data duplication, and prompt length.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-02-15T18:48:31Z","title":"Quantifying Memorization Across Neural Language Models"},"references":{"count":25,"internal_anchors":4,"resolved_work":25,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Deep learning with differential privacy","work_id":"c00396ae-c1af-430f-9810-bb5aba9fa3ab","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Large-scale differen- tially private BERT","work_id":"42d46873-b36f-49fe-bd2b-e2fee8d02f56","year":null},{"cited_arxiv_id":"","doi":"10.5281/zenodo.5297715","is_internal_anchor":false,"ref_index":3,"title":"GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow , March 2021","work_id":"6c7f8a44-6f52-448c-b819-5ba82a7bbc59","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Extracting training data from large language models","work_id":"25b490c3-532b-40ec-b8ce-05fef84201d4","year":2012},{"cited_arxiv_id":"2107.03374","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","year":null}],"snapshot_sha256":"aa521d1fe9ae2871c69a3d7b4087d48b4af30f6193bcad5d56055bfced409441"},"source":{"id":"2202.07646","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-13T22:00:04.860007Z","id":"ff8e1c9c-dfb3-46f7-9ca6-b355cc20cdd3","model_set":{"reader":"grok-4.3"},"one_line_summary":"Memorization in language models increases log-linearly with model capacity, data duplication count, and prompt context length.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Memorization in language models increases log-linearly with model size, data duplication, and prompt length.","strongest_claim":"We describe three log-linear relationships that quantify the degree to which LMs emit memorized training data. Memorization significantly grows as we increase (1) the capacity of a model, (2) the number of times an example has been duplicated, and (3) the number of tokens of context used to prompt the model.","weakest_assumption":"That verbatim emission under the chosen prompting and matching criteria accurately captures the privacy, utility, and fairness harms, and that the log-linear trends will continue to hold at larger scales without additional confounding factors."}},"verdict_id":"ff8e1c9c-dfb3-46f7-9ca6-b355cc20cdd3"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:fafd0df0647a8032935337f5af664e30d8e387aa19d0253fcbf841c84302d815","target":"record","created_at":"2026-05-18T04:38:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b5230ec6f01517894ba4f3fdb0c814e278571fab1daf3260f6f00ccb1142f847","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2022-02-15T18:48:31Z","title_canon_sha256":"b45340bfac6354acfb42d31e8b2975ff2334898c8cb111664e76c5a5e77fe631"},"schema_version":"1.0","source":{"id":"2202.07646","kind":"arxiv","version":3}},"canonical_sha256":"befead4b3343672fede418a8bb69200411c9639ef98b50f57d7ebcd975c9de30","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"befead4b3343672fede418a8bb69200411c9639ef98b50f57d7ebcd975c9de30","first_computed_at":"2026-05-18T04:38:57.963093Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T04:38:57.963093Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ctVAtXcr916BzpCQL8WGyAV+fR5HT0AYcl0vMiiwa8Ax2j/rx2FC+37LWsNX/w3Q5AviQ/PieHjynjgSG6svAA==","signature_status":"signed_v1","signed_at":"2026-05-18T04:38:57.963852Z","signed_message":"canonical_sha256_bytes"},"source_id":"2202.07646","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:fafd0df0647a8032935337f5af664e30d8e387aa19d0253fcbf841c84302d815","sha256:afac010eeb4a6c77f21237c8de130b2147d8ba7286c097612f0d370921bf797b"],"state_sha256":"783515e13e1cafc58c53031a5bb97220edc15ec2ce8fa5ea3b31d1ed98606db7"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"+bUf4KfR9D+PypYWvpoGKzQUDB1xSqopraJW1QtzmixtAfUhBmkPJPG/tlv0kAjaxQisewDRsvf8vvCGbKdgAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T07:38:53.773165Z","bundle_sha256":"58be3bcdd10d03a55edf689514e2ecd710e547dd03230e8dd32aa73cfe427c19"}}