{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ROUR7ERENO2XTXMWINLRLJNL4D","short_pith_number":"pith:ROUR7ERE","schema_version":"1.0","canonical_sha256":"8ba91f92246bb579dd96435715a5abe0dcd082c0d3e4583388f5c4b9fda8a230","source":{"kind":"arxiv","id":"2606.28823","version":1},"attestation_state":"computed","paper":{"title":"Labeling Training Data for Entity Matching Using Large Language Models","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Aaron Steiner, Christian Bizer","submitted_at":"2026-06-27T09:15:09Z","abstract_excerpt":"Recent large language models (LLMs) achieve strong performance on entity matching without requiring task-specific training data. However, applying these models to large sets of candidate pairs remains slow and costly. In contrast, entity matchers using traditional machine learning methods or small language models (SLMs), such as RoBERTa, offer much faster inference but require task-specific training data.\n  This paper investigates whether the need to provide task-specific training data can be avoided by using knowledge-distillation workflows, in which an LLM serves as a teacher model to label "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.28823","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-27T09:15:09Z","cross_cats_sorted":[],"title_canon_sha256":"6fc42e2c1def6f107d9b51fee7de4ff9b240127a0a5237457f2f4cbc9cc0dee0","abstract_canon_sha256":"3edeb68edce58bc3876b4524cc7ca41bcc405981d0f2723e0a444a0ed6db89dd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T01:16:53.307816Z","signature_b64":"EP8XnRRM1wc04a/EseUQh0I2uvm/0jA9Le/gM7xWo2rmQmdQcn6M7BuYsuV9LN1dKs7Ez7rASweOb/RbeGAtDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8ba91f92246bb579dd96435715a5abe0dcd082c0d3e4583388f5c4b9fda8a230","last_reissued_at":"2026-06-30T01:16:53.307138Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T01:16:53.307138Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Labeling Training Data for Entity Matching Using Large Language Models","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Aaron Steiner, Christian Bizer","submitted_at":"2026-06-27T09:15:09Z","abstract_excerpt":"Recent large language models (LLMs) achieve strong performance on entity matching without requiring task-specific training data. However, applying these models to large sets of candidate pairs remains slow and costly. In contrast, entity matchers using traditional machine learning methods or small language models (SLMs), such as RoBERTa, offer much faster inference but require task-specific training data.\n  This paper investigates whether the need to provide task-specific training data can be avoided by using knowledge-distillation workflows, in which an LLM serves as a teacher model to label "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.28823","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.28823/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.28823","created_at":"2026-06-30T01:16:53.307225+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.28823v1","created_at":"2026-06-30T01:16:53.307225+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.28823","created_at":"2026-06-30T01:16:53.307225+00:00"},{"alias_kind":"pith_short_12","alias_value":"ROUR7ERENO2X","created_at":"2026-06-30T01:16:53.307225+00:00"},{"alias_kind":"pith_short_16","alias_value":"ROUR7ERENO2XTXMW","created_at":"2026-06-30T01:16:53.307225+00:00"},{"alias_kind":"pith_short_8","alias_value":"ROUR7ERE","created_at":"2026-06-30T01:16:53.307225+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D","json":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D.json","graph_json":"https://pith.science/api/pith-number/ROUR7ERENO2XTXMWINLRLJNL4D/graph.json","events_json":"https://pith.science/api/pith-number/ROUR7ERENO2XTXMWINLRLJNL4D/events.json","paper":"https://pith.science/paper/ROUR7ERE"},"agent_actions":{"view_html":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D","download_json":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D.json","view_paper":"https://pith.science/paper/ROUR7ERE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.28823&json=true","fetch_graph":"https://pith.science/api/pith-number/ROUR7ERENO2XTXMWINLRLJNL4D/graph.json","fetch_events":"https://pith.science/api/pith-number/ROUR7ERENO2XTXMWINLRLJNL4D/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D/action/storage_attestation","attest_author":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D/action/author_attestation","sign_citation":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D/action/citation_signature","submit_replication":"https://pith.science/pith/ROUR7ERENO2XTXMWINLRLJNL4D/action/replication_record"}},"created_at":"2026-06-30T01:16:53.307225+00:00","updated_at":"2026-06-30T01:16:53.307225+00:00"}