{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:33NV57HYMIFM5GWDBTWSEYIN2F","short_pith_number":"pith:33NV57HY","canonical_record":{"source":{"id":"2311.17035","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-11-28T18:47:03Z","cross_cats_sorted":["cs.CL","cs.CR"],"title_canon_sha256":"b92f16cf18c2856205cecdb2cb789e5f9b1896bee9511d819789558b5381838d","abstract_canon_sha256":"78c268ca3f6d957e3e6106181db95edd8ea82003d47c3b317c03666251909969"},"schema_version":"1.0"},"canonical_sha256":"dedb5efcf8620ace9ac30ced22610dd1616a0f2592cb05ab0854df3c2d44b3c6","source":{"kind":"arxiv","id":"2311.17035","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2311.17035","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2311.17035v1","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2311.17035","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"33NV57HYMIFM","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"33NV57HYMIFM5GWD","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"33NV57HY","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:33NV57HYMIFM5GWDBTWSEYIN2F","target":"record","payload":{"canonical_record":{"source":{"id":"2311.17035","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-11-28T18:47:03Z","cross_cats_sorted":["cs.CL","cs.CR"],"title_canon_sha256":"b92f16cf18c2856205cecdb2cb789e5f9b1896bee9511d819789558b5381838d","abstract_canon_sha256":"78c268ca3f6d957e3e6106181db95edd8ea82003d47c3b317c03666251909969"},"schema_version":"1.0"},"canonical_sha256":"dedb5efcf8620ace9ac30ced22610dd1616a0f2592cb05ab0854df3c2d44b3c6","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.501823Z","signature_b64":"/npr72BKYo24VhkXrN1UhZrL8NbOkovXJqZSYiIpR/UV+NMxliW8mA9PkqwRWkCHXkIzpwnIGW9/TJaZbpftAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dedb5efcf8620ace9ac30ced22610dd1616a0f2592cb05ab0854df3c2d44b3c6","last_reissued_at":"2026-05-17T23:38:50.501353Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.501353Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2311.17035","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ZE1QwH+DflX/lVGV6qd36Cn8r0H0ZaKXO0VoHzOB8wgTvh4xgkfnd5an5VtW3wVICP+dxRX69d/ZcXMWXHx9Dg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T22:29:11.654481Z"},"content_sha256":"fb1f3c2da9a7f9f86e67625d2e65010c51fb10ef471b14d6f2f6021c714ad8ef","schema_version":"1.0","event_id":"sha256:fb1f3c2da9a7f9f86e67625d2e65010c51fb10ef471b14d6f2f6021c714ad8ef"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:33NV57HYMIFM5GWDBTWSEYIN2F","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Scalable Extraction of Training Data from (Production) Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Adversaries can extract gigabytes of training data from language models including ChatGPT by querying them without prior knowledge of the data.","cross_cats":["cs.CL","cs.CR"],"primary_cat":"cs.LG","authors_text":"A. Feder Cooper, Christopher A. Choquette-Choo, Daphne Ippolito, Eric Wallace, Florian Tram\\`er, Jonathan Hayase, Katherine Lee, Matthew Jagielski, Milad Nasr, Nicholas Carlini","submitted_at":"2023-11-28T18:47:03Z","abstract_excerpt":"This paper studies extractable memorization: training data that an adversary can efficiently extract by querying a machine learning model without prior knowledge of the training dataset. We show an adversary can extract gigabytes of training data from open-source language models like Pythia or GPT-Neo, semi-open models like LLaMA or Falcon, and closed models like ChatGPT. Existing techniques from the literature suffice to attack unaligned models; in order to attack the aligned ChatGPT, we develop a new divergence attack that causes the model to diverge from its chatbot-style generations and em"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our methods show practical attacks can recover far more data than previously thought, and reveal that current alignment techniques do not eliminate memorization.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the strings emitted by the models are verifiably present in the original training datasets rather than plausible generations, and that the divergence attack requires no prior knowledge of the training data.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Adversaries can scalably extract gigabytes of training data from open, semi-open, and closed language models via querying attacks, including a divergence method that increases extraction rates 150x on aligned models like ChatGPT.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Adversaries can extract gigabytes of training data from language models including ChatGPT by querying them without prior knowledge of the data.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"48ef0d7e0dee3ffea16d9c21860e9fffcd3b41cac47e054ce94eec5809dae039"},"source":{"id":"2311.17035","kind":"arxiv","version":1},"verdict":{"id":"8d2af25f-7bc9-44be-8b60-de34beec6922","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T18:56:53.144454Z","strongest_claim":"Our methods show practical attacks can recover far more data than previously thought, and reveal that current alignment techniques do not eliminate memorization.","one_line_summary":"Adversaries can scalably extract gigabytes of training data from open, semi-open, and closed language models via querying attacks, including a divergence method that increases extraction rates 150x on aligned models like ChatGPT.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the strings emitted by the models are verifiably present in the original training datasets rather than plausible generations, and that the divergence attack requires no prior knowledge of the training data.","pith_extraction_headline":"Adversaries can extract gigabytes of training data from language models including ChatGPT by querying them without prior knowledge of the data."},"references":{"count":64,"sample":[{"doi":"","year":null,"title":"Sequential Good-Turing and the miss- ing species problem","work_id":"112d6175-296d-4c8d-84a9-d1d13c2fda86","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"M., F IRAT, O., ET AL","work_id":"54530589-394d-4124-899c-d420caebff29","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","ref_index":3,"cited_arxiv_id":"2204.05862","is_internal_anchor":true},{"doi":"","year":2022,"title":"Recon- structing training data with informed adversaries","work_id":"7050bd3e-e69c-4319-ad19-b233440b2051","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"A., P UROHIT , S., P RASHANTH , U","work_id":"df0775cf-cb9c-4447-80bd-c56c1598d731","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":64,"snapshot_sha256":"7a35e5421e9cd5535705fc85ce79224627a32da4d29cad60020d418f84950d4e","internal_anchors":5},"formal_canon":{"evidence_count":1,"snapshot_sha256":"739684a15a5a08e0c264aa4e7dd31b80c77e84fc23719c532cb7b031c76c2316"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"8d2af25f-7bc9-44be-8b60-de34beec6922"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"dQMJYKHhztJBhMBIGFgrXg6x4+WMK8Zh7IShIxugjpWlmm1Fo0E7G+0Nh8RTeMK5CrsZa/ayErtYlPxH/QbFBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T22:29:11.655572Z"},"content_sha256":"5a4827a60b82d35e17b76b6cfa61ccf7d84d57fd044c1b84c57f14cafa29b56b","schema_version":"1.0","event_id":"sha256:5a4827a60b82d35e17b76b6cfa61ccf7d84d57fd044c1b84c57f14cafa29b56b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/33NV57HYMIFM5GWDBTWSEYIN2F/bundle.json","state_url":"https://pith.science/pith/33NV57HYMIFM5GWDBTWSEYIN2F/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/33NV57HYMIFM5GWDBTWSEYIN2F/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T22:29:11Z","links":{"resolver":"https://pith.science/pith/33NV57HYMIFM5GWDBTWSEYIN2F","bundle":"https://pith.science/pith/33NV57HYMIFM5GWDBTWSEYIN2F/bundle.json","state":"https://pith.science/pith/33NV57HYMIFM5GWDBTWSEYIN2F/state.json","well_known_bundle":"https://pith.science/.well-known/pith/33NV57HYMIFM5GWDBTWSEYIN2F/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:33NV57HYMIFM5GWDBTWSEYIN2F","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"78c268ca3f6d957e3e6106181db95edd8ea82003d47c3b317c03666251909969","cross_cats_sorted":["cs.CL","cs.CR"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-11-28T18:47:03Z","title_canon_sha256":"b92f16cf18c2856205cecdb2cb789e5f9b1896bee9511d819789558b5381838d"},"schema_version":"1.0","source":{"id":"2311.17035","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2311.17035","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2311.17035v1","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2311.17035","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"33NV57HYMIFM","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"33NV57HYMIFM5GWD","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"33NV57HY","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:5a4827a60b82d35e17b76b6cfa61ccf7d84d57fd044c1b84c57f14cafa29b56b","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our methods show practical attacks can recover far more data than previously thought, and reveal that current alignment techniques do not eliminate memorization."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the strings emitted by the models are verifiably present in the original training datasets rather than plausible generations, and that the divergence attack requires no prior knowledge of the training data."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Adversaries can scalably extract gigabytes of training data from open, semi-open, and closed language models via querying attacks, including a divergence method that increases extraction rates 150x on aligned models like ChatGPT."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Adversaries can extract gigabytes of training data from language models including ChatGPT by querying them without prior knowledge of the data."}],"snapshot_sha256":"48ef0d7e0dee3ffea16d9c21860e9fffcd3b41cac47e054ce94eec5809dae039"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"739684a15a5a08e0c264aa4e7dd31b80c77e84fc23719c532cb7b031c76c2316"},"paper":{"abstract_excerpt":"This paper studies extractable memorization: training data that an adversary can efficiently extract by querying a machine learning model without prior knowledge of the training dataset. We show an adversary can extract gigabytes of training data from open-source language models like Pythia or GPT-Neo, semi-open models like LLaMA or Falcon, and closed models like ChatGPT. Existing techniques from the literature suffice to attack unaligned models; in order to attack the aligned ChatGPT, we develop a new divergence attack that causes the model to diverge from its chatbot-style generations and em","authors_text":"A. Feder Cooper, Christopher A. Choquette-Choo, Daphne Ippolito, Eric Wallace, Florian Tram\\`er, Jonathan Hayase, Katherine Lee, Matthew Jagielski, Milad Nasr, Nicholas Carlini","cross_cats":["cs.CL","cs.CR"],"headline":"Adversaries can extract gigabytes of training data from language models including ChatGPT by querying them without prior knowledge of the data.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-11-28T18:47:03Z","title":"Scalable Extraction of Training Data from (Production) Language Models"},"references":{"count":64,"internal_anchors":5,"resolved_work":64,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Sequential Good-Turing and the miss- ing species problem","work_id":"112d6175-296d-4c8d-84a9-d1d13c2fda86","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"M., F IRAT, O., ET AL","work_id":"54530589-394d-4124-899c-d420caebff29","year":2023},{"cited_arxiv_id":"2204.05862","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Recon- structing training data with informed adversaries","work_id":"7050bd3e-e69c-4319-ad19-b233440b2051","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"A., P UROHIT , S., P RASHANTH , U","work_id":"df0775cf-cb9c-4447-80bd-c56c1598d731","year":2023}],"snapshot_sha256":"7a35e5421e9cd5535705fc85ce79224627a32da4d29cad60020d418f84950d4e"},"source":{"id":"2311.17035","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T18:56:53.144454Z","id":"8d2af25f-7bc9-44be-8b60-de34beec6922","model_set":{"reader":"grok-4.3"},"one_line_summary":"Adversaries can scalably extract gigabytes of training data from open, semi-open, and closed language models via querying attacks, including a divergence method that increases extraction rates 150x on aligned models like ChatGPT.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Adversaries can extract gigabytes of training data from language models including ChatGPT by querying them without prior knowledge of the data.","strongest_claim":"Our methods show practical attacks can recover far more data than previously thought, and reveal that current alignment techniques do not eliminate memorization.","weakest_assumption":"That the strings emitted by the models are verifiably present in the original training datasets rather than plausible generations, and that the divergence attack requires no prior knowledge of the training data."}},"verdict_id":"8d2af25f-7bc9-44be-8b60-de34beec6922"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:fb1f3c2da9a7f9f86e67625d2e65010c51fb10ef471b14d6f2f6021c714ad8ef","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"78c268ca3f6d957e3e6106181db95edd8ea82003d47c3b317c03666251909969","cross_cats_sorted":["cs.CL","cs.CR"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-11-28T18:47:03Z","title_canon_sha256":"b92f16cf18c2856205cecdb2cb789e5f9b1896bee9511d819789558b5381838d"},"schema_version":"1.0","source":{"id":"2311.17035","kind":"arxiv","version":1}},"canonical_sha256":"dedb5efcf8620ace9ac30ced22610dd1616a0f2592cb05ab0854df3c2d44b3c6","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"dedb5efcf8620ace9ac30ced22610dd1616a0f2592cb05ab0854df3c2d44b3c6","first_computed_at":"2026-05-17T23:38:50.501353Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.501353Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/npr72BKYo24VhkXrN1UhZrL8NbOkovXJqZSYiIpR/UV+NMxliW8mA9PkqwRWkCHXkIzpwnIGW9/TJaZbpftAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.501823Z","signed_message":"canonical_sha256_bytes"},"source_id":"2311.17035","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:fb1f3c2da9a7f9f86e67625d2e65010c51fb10ef471b14d6f2f6021c714ad8ef","sha256:5a4827a60b82d35e17b76b6cfa61ccf7d84d57fd044c1b84c57f14cafa29b56b"],"state_sha256":"7a5f33df430ea22b3bdfedc36af8fee0346f25648cb1f913c9841bf79758ff6d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SgpuryUnc0zb2lwi5TIuXs95YoeNOinKRTk3WfaldCC8hyAbrkX2J6K3X/a/LTLxIoTGI5m4qgVoLPpfFLPpAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T22:29:11.659382Z","bundle_sha256":"a958f25433bdcf852581c67da430637b9a5c6319cb5dcc1953ae9af0d311bea6"}}