{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:LFGGTTFUCACJRN6OTZKAT36K2L","short_pith_number":"pith:LFGGTTFU","schema_version":"1.0","canonical_sha256":"594c69ccb4100498b7ce9e5409efcad2c9501f94444651d138a34d91d8457e49","source":{"kind":"arxiv","id":"2606.24828","version":1},"attestation_state":"computed","paper":{"title":"Less is More: Quality-Aware Training Data Selection for Scientific Summarization","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Grigorios Tsoumakas, Maria Nefeli Paraskevopoulou, Tatiana Passali","submitted_at":"2026-06-23T17:12:06Z","abstract_excerpt":"Scientific long-document summarization datasets commonly treat author-written abstracts as gold reference summaries, although their quality and alignment with the source article vary. At the same time, publicly available scientific summarization datasets remain limited in scale and structure for modern long-context models. In this work, we address both challenges by a) constructing and releasing one of the largest biomedical and life science datasets for long-document summarization, containing 1.88 million PMC articles, and b) analyzing the reference quality of author-written abstracts with so"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.24828","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-23T17:12:06Z","cross_cats_sorted":[],"title_canon_sha256":"2cd9b62d187becdf54b0999181835c86d63513ff64acc583026efefba1e4d065","abstract_canon_sha256":"a677d6a2934886ff885b7a335880552851466037cf1c70ae64eba93cb43e100b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-24T01:15:43.403060Z","signature_b64":"zjZQZUuxpD572J2fLe0pfXDEMKq+dm5bPSzHTUr/QnegkuRZlmU2ILpg/JYCfqWdvlGvjT/yIo9WVPP6rwyPAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"594c69ccb4100498b7ce9e5409efcad2c9501f94444651d138a34d91d8457e49","last_reissued_at":"2026-06-24T01:15:43.402664Z","signature_status":"signed_v1","first_computed_at":"2026-06-24T01:15:43.402664Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Less is More: Quality-Aware Training Data Selection for Scientific Summarization","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Grigorios Tsoumakas, Maria Nefeli Paraskevopoulou, Tatiana Passali","submitted_at":"2026-06-23T17:12:06Z","abstract_excerpt":"Scientific long-document summarization datasets commonly treat author-written abstracts as gold reference summaries, although their quality and alignment with the source article vary. At the same time, publicly available scientific summarization datasets remain limited in scale and structure for modern long-context models. In this work, we address both challenges by a) constructing and releasing one of the largest biomedical and life science datasets for long-document summarization, containing 1.88 million PMC articles, and b) analyzing the reference quality of author-written abstracts with so"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.24828","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.24828/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.24828","created_at":"2026-06-24T01:15:43.402725+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.24828v1","created_at":"2026-06-24T01:15:43.402725+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.24828","created_at":"2026-06-24T01:15:43.402725+00:00"},{"alias_kind":"pith_short_12","alias_value":"LFGGTTFUCACJ","created_at":"2026-06-24T01:15:43.402725+00:00"},{"alias_kind":"pith_short_16","alias_value":"LFGGTTFUCACJRN6O","created_at":"2026-06-24T01:15:43.402725+00:00"},{"alias_kind":"pith_short_8","alias_value":"LFGGTTFU","created_at":"2026-06-24T01:15:43.402725+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L","json":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L.json","graph_json":"https://pith.science/api/pith-number/LFGGTTFUCACJRN6OTZKAT36K2L/graph.json","events_json":"https://pith.science/api/pith-number/LFGGTTFUCACJRN6OTZKAT36K2L/events.json","paper":"https://pith.science/paper/LFGGTTFU"},"agent_actions":{"view_html":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L","download_json":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L.json","view_paper":"https://pith.science/paper/LFGGTTFU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.24828&json=true","fetch_graph":"https://pith.science/api/pith-number/LFGGTTFUCACJRN6OTZKAT36K2L/graph.json","fetch_events":"https://pith.science/api/pith-number/LFGGTTFUCACJRN6OTZKAT36K2L/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L/action/storage_attestation","attest_author":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L/action/author_attestation","sign_citation":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L/action/citation_signature","submit_replication":"https://pith.science/pith/LFGGTTFUCACJRN6OTZKAT36K2L/action/replication_record"}},"created_at":"2026-06-24T01:15:43.402725+00:00","updated_at":"2026-06-24T01:15:43.402725+00:00"}