{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2016:HIFDB5PP5JG3ELQ33S46WXFZY7","short_pith_number":"pith:HIFDB5PP","schema_version":"1.0","canonical_sha256":"3a0a30f5efea4db22e1bdcb9eb5cb9c7dc4cd8ff841c869ba5c5d095262e92af","source":{"kind":"arxiv","id":"1603.07410","version":4},"attestation_state":"computed","paper":{"title":"LSH Ensemble: Internet-Scale Domain Search","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DB","authors_text":"Erkang Zhu, Fatemeh Nargesian, Ken Q. Pu, Ren\\'ee J. Miller","submitted_at":"2016-03-24T01:43:28Z","abstract_excerpt":"We study the problem of domain search where a domain is a set of distinct values from an unspecified universe. We use Jaccard set containment, defined as $|Q \\cap X|/|Q|$, as the relevance measure of a domain $X$ to a query domain $Q$. Our choice of Jaccard set containment over Jaccard similarity makes our work particularly suitable for searching Open Data and data on the web, as Jaccard similarity is known to have poor performance over sets with large differences in their domain sizes. We demonstrate that the domains found in several real-life Open Data and web data repositories show a power-"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1603.07410","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DB","submitted_at":"2016-03-24T01:43:28Z","cross_cats_sorted":[],"title_canon_sha256":"24046ec4fcf6483ee95c8b7a51f5cec3ae85d46610ae94a1533c9c05bb217baa","abstract_canon_sha256":"74544f31baf30b290dcc5a54bf5a0e8a14c06f4f78e79b070649c6510a0470e6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:10:37.811414Z","signature_b64":"EvKr9X83NM0KL5VZxquDxviTEWgPJKWfjcTmn0Pf9JRAzN4bzoTYnMwlYHmODVwH8wJAPDM2Nvo2uoUi1kbjDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3a0a30f5efea4db22e1bdcb9eb5cb9c7dc4cd8ff841c869ba5c5d095262e92af","last_reissued_at":"2026-05-18T01:10:37.810920Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:10:37.810920Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LSH Ensemble: Internet-Scale Domain Search","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DB","authors_text":"Erkang Zhu, Fatemeh Nargesian, Ken Q. Pu, Ren\\'ee J. Miller","submitted_at":"2016-03-24T01:43:28Z","abstract_excerpt":"We study the problem of domain search where a domain is a set of distinct values from an unspecified universe. We use Jaccard set containment, defined as $|Q \\cap X|/|Q|$, as the relevance measure of a domain $X$ to a query domain $Q$. Our choice of Jaccard set containment over Jaccard similarity makes our work particularly suitable for searching Open Data and data on the web, as Jaccard similarity is known to have poor performance over sets with large differences in their domain sizes. We demonstrate that the domains found in several real-life Open Data and web data repositories show a power-"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1603.07410","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1603.07410","created_at":"2026-05-18T01:10:37.810990+00:00"},{"alias_kind":"arxiv_version","alias_value":"1603.07410v4","created_at":"2026-05-18T01:10:37.810990+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1603.07410","created_at":"2026-05-18T01:10:37.810990+00:00"},{"alias_kind":"pith_short_12","alias_value":"HIFDB5PP5JG3","created_at":"2026-05-18T12:30:19.053100+00:00"},{"alias_kind":"pith_short_16","alias_value":"HIFDB5PP5JG3ELQ3","created_at":"2026-05-18T12:30:19.053100+00:00"},{"alias_kind":"pith_short_8","alias_value":"HIFDB5PP","created_at":"2026-05-18T12:30:19.053100+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.22766","citing_title":"Diversed Model Discovery via Structured Table Discovery","ref_index":72,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7","json":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7.json","graph_json":"https://pith.science/api/pith-number/HIFDB5PP5JG3ELQ33S46WXFZY7/graph.json","events_json":"https://pith.science/api/pith-number/HIFDB5PP5JG3ELQ33S46WXFZY7/events.json","paper":"https://pith.science/paper/HIFDB5PP"},"agent_actions":{"view_html":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7","download_json":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7.json","view_paper":"https://pith.science/paper/HIFDB5PP","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1603.07410&json=true","fetch_graph":"https://pith.science/api/pith-number/HIFDB5PP5JG3ELQ33S46WXFZY7/graph.json","fetch_events":"https://pith.science/api/pith-number/HIFDB5PP5JG3ELQ33S46WXFZY7/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7/action/timestamp_anchor","attest_storage":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7/action/storage_attestation","attest_author":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7/action/author_attestation","sign_citation":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7/action/citation_signature","submit_replication":"https://pith.science/pith/HIFDB5PP5JG3ELQ33S46WXFZY7/action/replication_record"}},"created_at":"2026-05-18T01:10:37.810990+00:00","updated_at":"2026-05-18T01:10:37.810990+00:00"}