{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:6L7Q2QPMPSDX25LDVRZRGOC6FV","short_pith_number":"pith:6L7Q2QPM","schema_version":"1.0","canonical_sha256":"f2ff0d41ec7c877d7563ac7313385e2d654717640e6de4eb2ecad1d658af7249","source":{"kind":"arxiv","id":"2301.13848","version":1},"attestation_state":"computed","paper":{"title":"Benchmarking Large Language Models for News Summarization","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Esin Durmus, Faisal Ladhak, Kathleen McKeown, Percy Liang, Tatsunori B. Hashimoto, Tianyi Zhang","submitted_at":"2023-01-31T18:46:19Z","abstract_excerpt":"Large language models (LLMs) have shown promise for automatic summarization but the reasons behind their successes are poorly understood. By conducting a human evaluation on ten LLMs across different pretraining methods, prompts, and model scales, we make two important observations. First, we find instruction tuning, and not model size, is the key to the LLM's zero-shot summarization capability. Second, existing studies have been limited by low-quality references, leading to underestimates of human performance and lower few-shot and finetuning performance. To better evaluate LLMs, we perform h"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2301.13848","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2023-01-31T18:46:19Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"f030412af9cf5d03f6334e5c0a23d2731a374ee3089fd1d8583227329107077e","abstract_canon_sha256":"283fb84edcb25ca49fb370aa86584230d0ef8daf80c0a00fb277814288521f72"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T05:37:19.544570Z","signature_b64":"97uytMc2aCkLVnh5ujXfhh+OcqjAdJfrnkUvekDOEI6wfYg8Y/B6cjMnhI7p8DGG01MV1tb5qUEGTGQiVFUABg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f2ff0d41ec7c877d7563ac7313385e2d654717640e6de4eb2ecad1d658af7249","last_reissued_at":"2026-07-05T05:37:19.544093Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T05:37:19.544093Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Benchmarking Large Language Models for News Summarization","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Esin Durmus, Faisal Ladhak, Kathleen McKeown, Percy Liang, Tatsunori B. Hashimoto, Tianyi Zhang","submitted_at":"2023-01-31T18:46:19Z","abstract_excerpt":"Large language models (LLMs) have shown promise for automatic summarization but the reasons behind their successes are poorly understood. By conducting a human evaluation on ten LLMs across different pretraining methods, prompts, and model scales, we make two important observations. First, we find instruction tuning, and not model size, is the key to the LLM's zero-shot summarization capability. Second, existing studies have been limited by low-quality references, leading to underestimates of human performance and lower few-shot and finetuning performance. To better evaluate LLMs, we perform h"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2301.13848","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2301.13848/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2301.13848","created_at":"2026-07-05T05:37:19.544149+00:00"},{"alias_kind":"arxiv_version","alias_value":"2301.13848v1","created_at":"2026-07-05T05:37:19.544149+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2301.13848","created_at":"2026-07-05T05:37:19.544149+00:00"},{"alias_kind":"pith_short_12","alias_value":"6L7Q2QPMPSDX","created_at":"2026-07-05T05:37:19.544149+00:00"},{"alias_kind":"pith_short_16","alias_value":"6L7Q2QPMPSDX25LD","created_at":"2026-07-05T05:37:19.544149+00:00"},{"alias_kind":"pith_short_8","alias_value":"6L7Q2QPM","created_at":"2026-07-05T05:37:19.544149+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2606.28044","citing_title":"A Tree-of-Thoughts Inspired Hybrid Approach for Legal Case Judgement Summarization using LLMs","ref_index":126,"is_internal_anchor":false},{"citing_arxiv_id":"2306.14048","citing_title":"H$_2$O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models","ref_index":4,"is_internal_anchor":false},{"citing_arxiv_id":"2404.18930","citing_title":"Hallucination of Multimodal Large Language Models: A Survey","ref_index":212,"is_internal_anchor":false},{"citing_arxiv_id":"2604.16864","citing_title":"HieraSparse: Hierarchical Semi-Structured Sparse KV Attention","ref_index":7,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV","json":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV.json","graph_json":"https://pith.science/api/pith-number/6L7Q2QPMPSDX25LDVRZRGOC6FV/graph.json","events_json":"https://pith.science/api/pith-number/6L7Q2QPMPSDX25LDVRZRGOC6FV/events.json","paper":"https://pith.science/paper/6L7Q2QPM"},"agent_actions":{"view_html":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV","download_json":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV.json","view_paper":"https://pith.science/paper/6L7Q2QPM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2301.13848&json=true","fetch_graph":"https://pith.science/api/pith-number/6L7Q2QPMPSDX25LDVRZRGOC6FV/graph.json","fetch_events":"https://pith.science/api/pith-number/6L7Q2QPMPSDX25LDVRZRGOC6FV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV/action/storage_attestation","attest_author":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV/action/author_attestation","sign_citation":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV/action/citation_signature","submit_replication":"https://pith.science/pith/6L7Q2QPMPSDX25LDVRZRGOC6FV/action/replication_record"}},"created_at":"2026-07-05T05:37:19.544149+00:00","updated_at":"2026-07-05T05:37:19.544149+00:00"}