{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:LG54SK4ADYIOEUIIXOR5SDCDYZ","short_pith_number":"pith:LG54SK4A","schema_version":"1.0","canonical_sha256":"59bbc92b801e10e25108bba3d90c43c66eee80eabf611e35ed94f4ec9fed11b9","source":{"kind":"arxiv","id":"2606.02334","version":1},"attestation_state":"computed","paper":{"title":"Less Is More? When Dataset Context Hurts LLM-Generated Dataset Descriptions","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.DB","authors_text":"Arunav Das, Elena Simperl, Johanna Walker, Klaus Diepold, Lisa-Yao Gan","submitted_at":"2026-06-01T14:44:45Z","abstract_excerpt":"Dataset search and reuse are strongly constrained by the quality of metadata such as natural language descriptions, which are often sparse or inconsistent. Although large language models (LLMs) can generate such descriptions automatically, little empirical guidance exists on what makes a good dataset description and what dataset context LLMs actually need. We study these questions through a literature-grounded framework of dataset description quality and a large-scale ablation study using 252 datasets (1,336 CSV files) from the European data portal data.europa.eu. We generate descriptions with"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.02334","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.DB","submitted_at":"2026-06-01T14:44:45Z","cross_cats_sorted":[],"title_canon_sha256":"63e37b96a4e7209d72d82c8afa73464fc64e9ca939c3c30e7a2577c9cc047330","abstract_canon_sha256":"4a06d749546a42b64a184174ff47ce16cfe94dc718a8e3027bc3a9324ed68d08"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T03:04:56.379991Z","signature_b64":"9mptQTxP5P7Qk6MAGgLR5tBeeNmLxXbIh3tQGDvu2QzLs/pl5y+ajE20XYvrhRDTAsXr/HEZIya56wmBilluAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"59bbc92b801e10e25108bba3d90c43c66eee80eabf611e35ed94f4ec9fed11b9","last_reissued_at":"2026-06-02T03:04:56.379641Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T03:04:56.379641Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Less Is More? When Dataset Context Hurts LLM-Generated Dataset Descriptions","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.DB","authors_text":"Arunav Das, Elena Simperl, Johanna Walker, Klaus Diepold, Lisa-Yao Gan","submitted_at":"2026-06-01T14:44:45Z","abstract_excerpt":"Dataset search and reuse are strongly constrained by the quality of metadata such as natural language descriptions, which are often sparse or inconsistent. Although large language models (LLMs) can generate such descriptions automatically, little empirical guidance exists on what makes a good dataset description and what dataset context LLMs actually need. We study these questions through a literature-grounded framework of dataset description quality and a large-scale ablation study using 252 datasets (1,336 CSV files) from the European data portal data.europa.eu. We generate descriptions with"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.02334","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.02334/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.02334","created_at":"2026-06-02T03:04:56.379708+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.02334v1","created_at":"2026-06-02T03:04:56.379708+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.02334","created_at":"2026-06-02T03:04:56.379708+00:00"},{"alias_kind":"pith_short_12","alias_value":"LG54SK4ADYIO","created_at":"2026-06-02T03:04:56.379708+00:00"},{"alias_kind":"pith_short_16","alias_value":"LG54SK4ADYIOEUII","created_at":"2026-06-02T03:04:56.379708+00:00"},{"alias_kind":"pith_short_8","alias_value":"LG54SK4A","created_at":"2026-06-02T03:04:56.379708+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ","json":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ.json","graph_json":"https://pith.science/api/pith-number/LG54SK4ADYIOEUIIXOR5SDCDYZ/graph.json","events_json":"https://pith.science/api/pith-number/LG54SK4ADYIOEUIIXOR5SDCDYZ/events.json","paper":"https://pith.science/paper/LG54SK4A"},"agent_actions":{"view_html":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ","download_json":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ.json","view_paper":"https://pith.science/paper/LG54SK4A","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.02334&json=true","fetch_graph":"https://pith.science/api/pith-number/LG54SK4ADYIOEUIIXOR5SDCDYZ/graph.json","fetch_events":"https://pith.science/api/pith-number/LG54SK4ADYIOEUIIXOR5SDCDYZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ/action/storage_attestation","attest_author":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ/action/author_attestation","sign_citation":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ/action/citation_signature","submit_replication":"https://pith.science/pith/LG54SK4ADYIOEUIIXOR5SDCDYZ/action/replication_record"}},"created_at":"2026-06-02T03:04:56.379708+00:00","updated_at":"2026-06-02T03:04:56.379708+00:00"}