{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2021:UKSLWPQT2Z3UNZ3JUCL37JYPQF","short_pith_number":"pith:UKSLWPQT","schema_version":"1.0","canonical_sha256":"a2a4bb3e13d67746e769a097bfa70f8148c475fb3b8ffa86c331751c38e0255a","source":{"kind":"arxiv","id":"2109.10862","version":2},"attestation_state":"computed","paper":{"title":"Recursively Summarizing Books with Human Feedback","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Recursive decomposition lets models summarize entire books after humans give feedback only on short sections.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Daniel M. Ziegler, Jan Leike, Jeff Wu, Long Ouyang, Nisan Stiennon, Paul Christiano, Ryan Lowe","submitted_at":"2021-09-22T17:34:18Z","abstract_excerpt":"A major challenge for scaling machine learning is training models to perform tasks that are very difficult or time-consuming for humans to evaluate. We present progress on this problem on the task of abstractive summarization of entire fiction novels. Our method combines learning from human feedback with recursive task decomposition: we use models trained on smaller parts of the task to assist humans in giving feedback on the broader task. We collect a large volume of demonstrations and comparisons from human labelers, and fine-tune GPT-3 using behavioral cloning and reward modeling to do summ"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2109.10862","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-09-22T17:34:18Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"612ddce1253fc4e6a3ead618e152978b1fbe6ff31ad268ab30b5b82b9e65fc6e","abstract_canon_sha256":"ecaab0f0204b68cab469b5a1be1cb0e5693ea4b755132f7c575933fc96022bdb"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.697225Z","signature_b64":"5n+v4tOAmms5ajGNA+c42Iub3gqC6Ixo9pa1IxIQ5GTkJJ2wJqbEUKJxZxHzfgfWjevCCzfZ81R5Zg015HEwDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a2a4bb3e13d67746e769a097bfa70f8148c475fb3b8ffa86c331751c38e0255a","last_reissued_at":"2026-05-17T23:38:13.696569Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.696569Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Recursively Summarizing Books with Human Feedback","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Recursive decomposition lets models summarize entire books after humans give feedback only on short sections.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Daniel M. Ziegler, Jan Leike, Jeff Wu, Long Ouyang, Nisan Stiennon, Paul Christiano, Ryan Lowe","submitted_at":"2021-09-22T17:34:18Z","abstract_excerpt":"A major challenge for scaling machine learning is training models to perform tasks that are very difficult or time-consuming for humans to evaluate. We present progress on this problem on the task of abstractive summarization of entire fiction novels. Our method combines learning from human feedback with recursive task decomposition: we use models trained on smaller parts of the task to assist humans in giving feedback on the broader task. We collect a large volume of demonstrations and comparisons from human labelers, and fine-tune GPT-3 using behavioral cloning and reward modeling to do summ"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our resulting model generates sensible summaries of entire books, even matching the quality of human-written summaries in a few cases (~5% of books). We achieve state-of-the-art results on the recent BookSum dataset for book-length summarization.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That summaries of summaries retain enough information and fidelity for the final output to remain faithful to the original book when humans never see the full text.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Recursive decomposition plus human feedback lets GPT-3 produce book-length summaries that reach human quality on a few cases and set new records on BookSum and NarrativeQA.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Recursive decomposition lets models summarize entire books after humans give feedback only on short sections.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ac2d3692783a95b8708e95936c8d8684c6c8e48ee69de27882d9644785e03bea"},"source":{"id":"2109.10862","kind":"arxiv","version":2},"verdict":{"id":"4593b665-e34c-45c1-8e5b-878911626631","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T15:22:19.704135Z","strongest_claim":"Our resulting model generates sensible summaries of entire books, even matching the quality of human-written summaries in a few cases (~5% of books). We achieve state-of-the-art results on the recent BookSum dataset for book-length summarization.","one_line_summary":"Recursive decomposition plus human feedback lets GPT-3 produce book-length summaries that reach human quality on a few cases and set new records on BookSum and NarrativeQA.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That summaries of summaries retain enough information and fidelity for the final output to remain faithful to the original book when humans never see the full text.","pith_extraction_headline":"Recursive decomposition lets models summarize entire books after humans give feedback only on short sections."},"references":{"count":12,"sample":[{"doi":"","year":null,"title":"This subtask can be decomposed even further if necessary","work_id":"0edfbdfc-d42f-42ee-85ca-86cabf1cf023","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"answer_directly, which returns an actual answer to the task, synthesizing the answers to subtasks In general, both decompose_if_needed and answer_directly could be learned and implemented by an ML mod","work_id":"a86a2eb4-2be7-457f-b24e-1f5157bb1f02","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"So gratuitously including small details is generally penalized, and omitting important details is also penalized","work_id":"18f5ae1f-6b0b-4c05-a46f-7f852267c26b","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Accuracy: All information in the summary should faithfully reﬂect the original passage","work_id":"0c69f10e-46a6-40b6-87b3-b7641b71bf4f","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"We also have a fourth criteria which is primarily applicable at higher height","work_id":"16e36ec9-96ac-4895-be8f-1593c9629814","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":12,"snapshot_sha256":"2106365b4047f74aef9e8b24abb976190c515553ae0de5fc813f998be3caf7e5","internal_anchors":0},"formal_canon":{"evidence_count":3,"snapshot_sha256":"20dab4a935667ae522f13712112998461eb3e92f0ee4fd34c6aa5a220b887a4b"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2109.10862","created_at":"2026-05-17T23:38:13.696676+00:00"},{"alias_kind":"arxiv_version","alias_value":"2109.10862v2","created_at":"2026-05-17T23:38:13.696676+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2109.10862","created_at":"2026-05-17T23:38:13.696676+00:00"},{"alias_kind":"pith_short_12","alias_value":"UKSLWPQT2Z3U","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"UKSLWPQT2Z3UNZ3J","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"UKSLWPQT","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":22,"internal_anchor_count":22,"sample":[{"citing_arxiv_id":"2504.12501","citing_title":"Reinforcement Learning from Human Feedback","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2404.13076","citing_title":"LLM Evaluators Recognize and Favor Their Own Generations","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20724","citing_title":"CALMem : Application-Layer Dual Memory for Conversational AI","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2507.11473","citing_title":"Chain of Thought Monitorability: A New and Fragile Opportunity for AI Safety","ref_index":70,"is_internal_anchor":true},{"citing_arxiv_id":"2304.06767","citing_title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","ref_index":107,"is_internal_anchor":true},{"citing_arxiv_id":"2211.03540","citing_title":"Measuring Progress on Scalable Oversight for Large Language Models","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"2205.14334","citing_title":"Teaching Models to Express Their Uncertainty in Words","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2204.00598","citing_title":"Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language","ref_index":78,"is_internal_anchor":true},{"citing_arxiv_id":"2601.21459","citing_title":"HER: Human-like Reasoning and Reinforcement Learning for LLM Role-playing","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2309.17400","citing_title":"Directly Fine-Tuning Diffusion Models on Differentiable Rewards","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2401.18059","citing_title":"RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval","ref_index":111,"is_internal_anchor":true},{"citing_arxiv_id":"2209.07753","citing_title":"Code as Policies: Language Model Programs for Embodied Control","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28281","citing_title":"Corruption-robust Offline Multi-agent Reinforcement Learning From Human Feedback","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2302.12192","citing_title":"Aligning Text-to-Image Models using Human Feedback","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2308.08998","citing_title":"Reinforced Self-Training (ReST) for Language Modeling","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2310.04451","citing_title":"AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2304.03442","citing_title":"Generative Agents: Interactive Simulacra of Human Behavior","ref_index":111,"is_internal_anchor":true},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":253,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07012","citing_title":"DTCRS: Dynamic Tree Construction for Recursive Summarization","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2109.01652","citing_title":"Finetuned Language Models Are Zero-Shot Learners","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.11435","citing_title":"Think Before you Write: QA-Guided Reasoning for Character Descriptions in Books","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2604.20131","citing_title":"Whose Story Gets Told? Positionality and Bias in LLM Summaries of Life Narratives","ref_index":94,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":3,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF","json":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF.json","graph_json":"https://pith.science/api/pith-number/UKSLWPQT2Z3UNZ3JUCL37JYPQF/graph.json","events_json":"https://pith.science/api/pith-number/UKSLWPQT2Z3UNZ3JUCL37JYPQF/events.json","paper":"https://pith.science/paper/UKSLWPQT"},"agent_actions":{"view_html":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF","download_json":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF.json","view_paper":"https://pith.science/paper/UKSLWPQT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2109.10862&json=true","fetch_graph":"https://pith.science/api/pith-number/UKSLWPQT2Z3UNZ3JUCL37JYPQF/graph.json","fetch_events":"https://pith.science/api/pith-number/UKSLWPQT2Z3UNZ3JUCL37JYPQF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF/action/storage_attestation","attest_author":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF/action/author_attestation","sign_citation":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF/action/citation_signature","submit_replication":"https://pith.science/pith/UKSLWPQT2Z3UNZ3JUCL37JYPQF/action/replication_record"}},"created_at":"2026-05-17T23:38:13.696676+00:00","updated_at":"2026-05-17T23:38:13.696676+00:00"}