{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:O6FNK2VNMHKVLLGYFOPVUF6VGL","short_pith_number":"pith:O6FNK2VN","schema_version":"1.0","canonical_sha256":"778ad56aad61d555acd82b9f5a17d532eb3abffd7a1bbdf9d1edb37912796d1e","source":{"kind":"arxiv","id":"2303.09540","version":3},"attestation_state":"computed","paper":{"title":"SemDeDup: Data-efficient learning at web-scale through semantic deduplication","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.AI","cs.CV"],"primary_cat":"cs.LG","authors_text":"Amro Abbas, Ari S. Morcos, Dániel Simig, Kushal Tirumala, Surya Ganguli","submitted_at":"2023-03-16T17:53:24Z","abstract_excerpt":"Progress in machine learning has been driven in large part by massive increases in data. However, large web-scale datasets such as LAION are largely uncurated beyond searches for exact duplicates, potentially leaving much redundancy. Here, we introduce SemDeDup, a method which leverages embeddings from pre-trained models to identify and remove semantic duplicates: data pairs which are semantically similar, but not exactly identical. Removing semantic duplicates preserves performance and speeds up learning. 
Analyzing a subset of LAION, we show that SemDeDup can remove 50% of the data with minim"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2303.09540","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.LG","submitted_at":"2023-03-16T17:53:24Z","cross_cats_sorted":["cs.AI","cs.CV"],"title_canon_sha256":"0ed5dddaabe391706c3b85fa3d85c958f7f436db834195e6cc1b6b279d95d504","abstract_canon_sha256":"7cec0d4952d6a84527c52df0fb9342f92fb46237525aeef199af03ff5095044a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:38:34.944125Z","signature_b64":"/62vzBQ8OTMoQiVQheAuz+QTlE3bnrLsqHY04owsrV0SYvkZkwJI2oWWd2b1UliOE9bULKee1yBHfZVdegicCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"778ad56aad61d555acd82b9f5a17d532eb3abffd7a1bbdf9d1edb37912796d1e","last_reissued_at":"2026-05-18T02:38:34.943495Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:38:34.943495Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"SemDeDup: Data-efficient learning at web-scale through semantic deduplication","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":["cs.AI","cs.CV"],"primary_cat":"cs.LG","authors_text":"Amro Abbas, Ari S. Morcos, Dániel Simig, Kushal Tirumala, Surya Ganguli","submitted_at":"2023-03-16T17:53:24Z","abstract_excerpt":"Progress in machine learning has been driven in large part by massive increases in data. 
However, large web-scale datasets such as LAION are largely uncurated beyond searches for exact duplicates, potentially leaving much redundancy. Here, we introduce SemDeDup, a method which leverages embeddings from pre-trained models to identify and remove semantic duplicates: data pairs which are semantically similar, but not exactly identical. Removing semantic duplicates preserves performance and speeds up learning. Analyzing a subset of LAION, we show that SemDeDup can remove 50% of the data with minim"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2303.09540","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2303.09540","created_at":"2026-05-18T02:38:34.943600+00:00"},{"alias_kind":"arxiv_version","alias_value":"2303.09540v3","created_at":"2026-05-18T02:38:34.943600+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2303.09540","created_at":"2026-05-18T02:38:34.943600+00:00"},{"alias_kind":"pith_short_12","alias_value":"O6FNK2VNMHKV","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"O6FNK2VNMHKVLLGY","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"O6FNK2VN","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event
_summary":{},"paper_claims":[],"inbound_citations":{"count":23,"internal_anchor_count":23,"sample":[{"citing_arxiv_id":"2412.04468","citing_title":"NVILA: Efficient Frontier Visual Language Models","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2504.21850","citing_title":"Visual Compositional Tuning","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15691","citing_title":"SEED: Targeted Data Selection by Weighted Independent Set","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17003","citing_title":"Learning-Zone Energy: Online Data Selection for Efficient RL Post-Training","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17003","citing_title":"Learning-Zone Energy: Online Data Selection for Efficient RL Post-Training","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2509.07177","citing_title":"Towards EnergyGPT: A Large Language Model Specialized for the Energy Sector","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2406.11794","citing_title":"DataComp-LM: In search of the next generation of training sets for language models","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2309.16671","citing_title":"Demystifying CLIP Data","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11405","citing_title":"20/20 Vision Language Models: A Prescription for Better VLMs through Data Curation Alone","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12944","citing_title":"From Instance Selection to Fixed-Pool Data Recipe Search for Supervised Fine-Tuning","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02852","citing_title":"Dependency-Guided Repository-Level C-to-Rust Translation with Reinforcement Alignment","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11405","citing_title":"20/20 Vision Language Models: A Prescription for Better VLMs through Data Curation 
Alone","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2311.05232","citing_title":"A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27932","citing_title":"Dynamic Cluster Data Sampling for Efficient and Long-Tail-Aware Vision-Language Pre-training","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09990","citing_title":"Merlin: Deterministic Byte-Exact Deduplication for Lossless Context Optimization in Large Language Model Inference","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09611","citing_title":"Byte-Exact Deduplication in Retrieval-Augmented Generation: A Three-Regime Empirical Analysis Across Public Benchmarks","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24762","citing_title":"OmniShotCut: Holistic Relational Shot Boundary Detection with Shot-Query Transformer","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05807","citing_title":"LCC-LLM: Leveraging Code-Centric Large Language Models for Malware Attribution","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2508.06471","citing_title":"GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08366","citing_title":"Scaling-Aware Data Selection for End-to-End Autonomous Driving Systems","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2501.03575","citing_title":"Cosmos World Foundation Model Platform for Physical AI","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16197","citing_title":"Sketching the Readout of Large Language Models for Scalable Data Attribution and Valuation","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02757","citing_title":"Seeing Realism from Simulation: Efficient Video Transfer for Vision-Language-Action Data 
Augmentation","ref_index":1,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL","json":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL.json","graph_json":"https://pith.science/api/pith-number/O6FNK2VNMHKVLLGYFOPVUF6VGL/graph.json","events_json":"https://pith.science/api/pith-number/O6FNK2VNMHKVLLGYFOPVUF6VGL/events.json","paper":"https://pith.science/paper/O6FNK2VN"},"agent_actions":{"view_html":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL","download_json":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL.json","view_paper":"https://pith.science/paper/O6FNK2VN","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2303.09540&json=true","fetch_graph":"https://pith.science/api/pith-number/O6FNK2VNMHKVLLGYFOPVUF6VGL/graph.json","fetch_events":"https://pith.science/api/pith-number/O6FNK2VNMHKVLLGYFOPVUF6VGL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL/action/storage_attestation","attest_author":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL/action/author_attestation","sign_citation":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL/action/citation_signature","submit_replication":"https://pith.science/pith/O6FNK2VNMHKVLLGYFOPVUF6VGL/action/replication_record"}},"created_at":"2026-05-18T02:38:34.943600+00:00","updated_at":"2026-05-18T02:38:34.943600+00:00"}