{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2010:PAPA2VAOQPOZBWALDRHEPG5OCB","short_pith_number":"pith:PAPA2VAO","schema_version":"1.0","canonical_sha256":"781e0d540e83dd90d80b1c4e479bae104b86702beecf22530b64f5786296fe63","source":{"kind":"arxiv","id":"1005.4298","version":1},"attestation_state":"computed","paper":{"title":"Distantly Labeling Data for Large Scale Cross-Document Coreference","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.IR","cs.LG"],"primary_cat":"cs.AI","authors_text":"Andrew McCallum, Michael Wick, Sameer Singh","submitted_at":"2010-05-24T10:35:50Z","abstract_excerpt":"Cross-document coreference, the problem of resolving entity mentions across multi-document collections, is crucial to automated knowledge base construction and data mining tasks. However, the scarcity of large labeled data sets has hindered supervised machine learning research for this task. In this paper we develop and demonstrate an approach based on ``distantly-labeling'' a data set from which we can train a discriminative cross-document coreference model. In particular we build a dataset of more than a million people mentions extracted from 3.5 years of New York Times articles, leverage Wi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1005.4298","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2010-05-24T10:35:50Z","cross_cats_sorted":["cs.IR","cs.LG"],"title_canon_sha256":"35e1bb20bd8b0f835ffdb83d7e491b09a28bece17b04255319d210dbec0a95fe","abstract_canon_sha256":"5da1757df5b6442d74b1c9772dff4c94ca2124d2f732098ec753b73aa49a0ff0"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:23:56.231007Z","signature_b64":"MD19Xgq1j+fozR8iwucLlM/zyF5vChrIgsaJHvA9k9gqU7LwnNY4Rk3Er20BzjbhVtSNpiGhVhnOlVw3HxJCDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"781e0d540e83dd90d80b1c4e479bae104b86702beecf22530b64f5786296fe63","last_reissued_at":"2026-05-18T02:23:56.230329Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:23:56.230329Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Distantly Labeling Data for Large Scale Cross-Document Coreference","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.IR","cs.LG"],"primary_cat":"cs.AI","authors_text":"Andrew McCallum, Michael Wick, Sameer Singh","submitted_at":"2010-05-24T10:35:50Z","abstract_excerpt":"Cross-document coreference, the problem of resolving entity mentions across multi-document collections, is crucial to automated knowledge base construction and data mining tasks. However, the scarcity of large labeled data sets has hindered supervised machine learning research for this task. In this paper we develop and demonstrate an approach based on ``distantly-labeling'' a data set from which we can train a discriminative cross-document coreference model. In particular we build a dataset of more than a million people mentions extracted from 3.5 years of New York Times articles, leverage Wi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1005.4298","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1005.4298","created_at":"2026-05-18T02:23:56.230433+00:00"},{"alias_kind":"arxiv_version","alias_value":"1005.4298v1","created_at":"2026-05-18T02:23:56.230433+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1005.4298","created_at":"2026-05-18T02:23:56.230433+00:00"},{"alias_kind":"pith_short_12","alias_value":"PAPA2VAOQPOZ","created_at":"2026-05-18T12:26:12.377268+00:00"},{"alias_kind":"pith_short_16","alias_value":"PAPA2VAOQPOZBWAL","created_at":"2026-05-18T12:26:12.377268+00:00"},{"alias_kind":"pith_short_8","alias_value":"PAPA2VAO","created_at":"2026-05-18T12:26:12.377268+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB","json":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB.json","graph_json":"https://pith.science/api/pith-number/PAPA2VAOQPOZBWALDRHEPG5OCB/graph.json","events_json":"https://pith.science/api/pith-number/PAPA2VAOQPOZBWALDRHEPG5OCB/events.json","paper":"https://pith.science/paper/PAPA2VAO"},"agent_actions":{"view_html":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB","download_json":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB.json","view_paper":"https://pith.science/paper/PAPA2VAO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1005.4298&json=true","fetch_graph":"https://pith.science/api/pith-number/PAPA2VAOQPOZBWALDRHEPG5OCB/graph.json","fetch_events":"https://pith.science/api/pith-number/PAPA2VAOQPOZBWALDRHEPG5OCB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB/action/storage_attestation","attest_author":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB/action/author_attestation","sign_citation":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB/action/citation_signature","submit_replication":"https://pith.science/pith/PAPA2VAOQPOZBWALDRHEPG5OCB/action/replication_record"}},"created_at":"2026-05-18T02:23:56.230433+00:00","updated_at":"2026-05-18T02:23:56.230433+00:00"}