{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:NYXNHGUVFGFQWDGZA634KRVKLZ","short_pith_number":"pith:NYXNHGUV","schema_version":"1.0","canonical_sha256":"6e2ed39a95298b0b0cd907b7c546aa5e66c01dac773e0a7b74e3428242a929ba","source":{"kind":"arxiv","id":"2606.31983","version":1},"attestation_state":"computed","paper":{"title":"Clean Me If You Can: A Large Collection of Real-World Addresses for Data Cleaning Benchmarking","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.DB","authors_text":"Fatemeh Ahmadi, Luca Zecchini, Mohamed Abdelmaksoud, Tilmann Rabl, Tobias Bernhard, Ziawasch Abedjan","submitted_at":"2026-06-30T17:21:48Z","abstract_excerpt":"There has been extensive research on automating and scaling data cleaning, i.e., the detection and correction of erroneous values in tabular data. Yet, existing approaches often perform well only within controlled environments. One of the major bottlenecks in data cleaning research is the lack of real-world datasets. In this paper, we address this gap by providing a large, dirty dataset with postal entries and their corresponding ground truth. We discuss the design decisions and challenges for obtaining the dataset. We demonstrate the limitations of existing cleaning approaches when faced with"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.31983","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.DB","submitted_at":"2026-06-30T17:21:48Z","cross_cats_sorted":[],"title_canon_sha256":"5b119616af8d83081f89266bdcb9352a2b1b266ea19325a0f1c582c9d6f3f3c5","abstract_canon_sha256":"85880e7f2e23f06c6a9c6413e1110c66c3bae5e1ba403b3a3e43411bba39d541"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-01T01:18:27.820948Z","signature_b64":"BcfpHkLS87HZJGrKoRInWSNH9cq96EbuS3pbQ/W30/Lh6UOVx4erzuRsofEAcgGaNv3KKkVh0pXMnnuNo8y2Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6e2ed39a95298b0b0cd907b7c546aa5e66c01dac773e0a7b74e3428242a929ba","last_reissued_at":"2026-07-01T01:18:27.820400Z","signature_status":"signed_v1","first_computed_at":"2026-07-01T01:18:27.820400Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Clean Me If You Can: A Large Collection of Real-World Addresses for Data Cleaning Benchmarking","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.DB","authors_text":"Fatemeh Ahmadi, Luca Zecchini, Mohamed Abdelmaksoud, Tilmann Rabl, Tobias Bernhard, Ziawasch Abedjan","submitted_at":"2026-06-30T17:21:48Z","abstract_excerpt":"There has been extensive research on automating and scaling data cleaning, i.e., the detection and correction of erroneous values in tabular data. Yet, existing approaches often perform well only within controlled environments. One of the major bottlenecks in data cleaning research is the lack of real-world datasets. In this paper, we address this gap by providing a large, dirty dataset with postal entries and their corresponding ground truth. We discuss the design decisions and challenges for obtaining the dataset. We demonstrate the limitations of existing cleaning approaches when faced with"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.31983","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.31983/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.31983","created_at":"2026-07-01T01:18:27.820479+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.31983v1","created_at":"2026-07-01T01:18:27.820479+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.31983","created_at":"2026-07-01T01:18:27.820479+00:00"},{"alias_kind":"pith_short_12","alias_value":"NYXNHGUVFGFQ","created_at":"2026-07-01T01:18:27.820479+00:00"},{"alias_kind":"pith_short_16","alias_value":"NYXNHGUVFGFQWDGZ","created_at":"2026-07-01T01:18:27.820479+00:00"},{"alias_kind":"pith_short_8","alias_value":"NYXNHGUV","created_at":"2026-07-01T01:18:27.820479+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ","json":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ.json","graph_json":"https://pith.science/api/pith-number/NYXNHGUVFGFQWDGZA634KRVKLZ/graph.json","events_json":"https://pith.science/api/pith-number/NYXNHGUVFGFQWDGZA634KRVKLZ/events.json","paper":"https://pith.science/paper/NYXNHGUV"},"agent_actions":{"view_html":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ","download_json":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ.json","view_paper":"https://pith.science/paper/NYXNHGUV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.31983&json=true","fetch_graph":"https://pith.science/api/pith-number/NYXNHGUVFGFQWDGZA634KRVKLZ/graph.json","fetch_events":"https://pith.science/api/pith-number/NYXNHGUVFGFQWDGZA634KRVKLZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ/action/storage_attestation","attest_author":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ/action/author_attestation","sign_citation":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ/action/citation_signature","submit_replication":"https://pith.science/pith/NYXNHGUVFGFQWDGZA634KRVKLZ/action/replication_record"}},"created_at":"2026-07-01T01:18:27.820479+00:00","updated_at":"2026-07-01T01:18:27.820479+00:00"}