{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:XJ4TZBEW6OFGD77EW4OMMYNUN5","short_pith_number":"pith:XJ4TZBEW","schema_version":"1.0","canonical_sha256":"ba793c8496f38a61ffe4b71cc661b46f71675f7aa75767423913d4da599535aa","source":{"kind":"arxiv","id":"2310.06452","version":3},"attestation_state":"computed","paper":{"title":"Understanding the Effects of RLHF on LLM Generalisation and Diversity","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Christoforos Nalmpantis, Edward Grefenstette, Eric Hambro, Ishita Mediratta, Jelena Luketina, Roberta Raileanu, Robert Kirk","submitted_at":"2023-10-10T09:25:44Z","abstract_excerpt":"Large language models (LLMs) fine-tuned with reinforcement learning from human feedback (RLHF) have been used in some of the most widely deployed AI models to date, such as OpenAI's ChatGPT or Anthropic's Claude. While there has been significant work developing these methods, our understanding of the benefits and downsides of each stage in RLHF is still limited. To fill this gap, we present an extensive analysis of how each stage of the process (i.e. supervised fine-tuning (SFT), reward modelling, and RLHF) affects two key properties: out-of-distribution (OOD) generalisation and output diversi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2310.06452","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.LG","submitted_at":"2023-10-10T09:25:44Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"9e9759544ee8ceb86f206016a83e70d3ae4273a792ecf0a4e08bcc752d0d1d40","abstract_canon_sha256":"9588cc882ba242d60e4b2af32935ac29df072e5687afa32cf931a44ea82db0de"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-19T02:28:36.403682Z","signature_b64":"01PTiZ9BUtWJmj19Hrf1w+YnTMfsIzF16VTFaZwqhUahtj9PglbRc0sCv0E1vbEr8xBrM3H8J02d2T8oyno4Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ba793c8496f38a61ffe4b71cc661b46f71675f7aa75767423913d4da599535aa","last_reissued_at":"2026-05-19T02:28:36.401174Z","signature_status":"signed_v1","first_computed_at":"2026-05-19T02:28:36.401174Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Understanding the Effects of RLHF on LLM Generalisation and Diversity","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Christoforos Nalmpantis, Edward Grefenstette, Eric Hambro, Ishita Mediratta, Jelena Luketina, Roberta Raileanu, Robert Kirk","submitted_at":"2023-10-10T09:25:44Z","abstract_excerpt":"Large language models (LLMs) fine-tuned with reinforcement learning from human feedback (RLHF) have been used in some of the most widely deployed AI models to date, such as OpenAI's ChatGPT or Anthropic's Claude. While there has been significant work developing these methods, our understanding of the benefits and downsides of each stage in RLHF is still limited. To fill this gap, we present an extensive analysis of how each stage of the process (i.e. supervised fine-tuning (SFT), reward modelling, and RLHF) affects two key properties: out-of-distribution (OOD) generalisation and output diversi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2310.06452","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2310.06452","created_at":"2026-05-19T02:28:36.401269+00:00"},{"alias_kind":"arxiv_version","alias_value":"2310.06452v3","created_at":"2026-05-19T02:28:36.401269+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.06452","created_at":"2026-05-19T02:28:36.401269+00:00"},{"alias_kind":"pith_short_12","alias_value":"XJ4TZBEW6OFG","created_at":"2026-05-19T02:28:36.401269+00:00"},{"alias_kind":"pith_short_16","alias_value":"XJ4TZBEW6OFGD77E","created_at":"2026-05-19T02:28:36.401269+00:00"},{"alias_kind":"pith_short_8","alias_value":"XJ4TZBEW","created_at":"2026-05-19T02:28:36.401269+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":21,"internal_anchor_count":21,"sample":[{"citing_arxiv_id":"2505.23912","citing_title":"LoVeC: Reinforcement Learning for Better Verbalized Confidence in Long-Form Generations","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2507.06419","citing_title":"Reward Models Can Improve Themselves: Reward-Guided Adversarial Failure Mode Discovery for Robust Reward Modeling","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2507.21934","citing_title":"Culinary Crossroads: A RAG Framework for Enhancing Diversity in Cross-Cultural Recipe Adaptation","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2508.16771","citing_title":"EyeMulator: Improving Code Language Models by Mimicking Human Visual Attention","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2509.08827","citing_title":"A Survey of Reinforcement Learning for Large Reasoning Models","ref_index":255,"is_internal_anchor":true},{"citing_arxiv_id":"2601.12538","citing_title":"Agentic Reasoning for Large Language Models","ref_index":230,"is_internal_anchor":true},{"citing_arxiv_id":"2403.07691","citing_title":"ORPO: Monolithic Preference Optimization without Reference Model","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2602.13280","citing_title":"BEAGLE: Behavior-Enforced Agent for Grounded Learner Emulation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12522","citing_title":"Differences in Text Generated by Diffusion and Autoregressive Language Models","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08862","citing_title":"BubbleSpec: Turning Long-Tail Bubbles into Speculative Rollout Drafts for Synchronous Reinforcement Learning","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09995","citing_title":"Annotations Mitigate Post-Training Mode Collapse","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10716","citing_title":"What should post-training optimize? A test-time scaling law perspective","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00195","citing_title":"Diversity in Large Language Models under Supervised Fine-Tuning","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2604.25634","citing_title":"The Surprising Universality of LLM Outputs: A Real-Time Verification Primitive","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06540","citing_title":"Ex Ante Evaluation of AI-Induced Idea Diversity Collapse","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06040","citing_title":"Novelty-based Tree-of-Thought Search for LLM Reasoning and Planning","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01123","citing_title":"PERSA: Reinforcement Learning for Professor-Style Personalized Feedback with LLMs","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2310.13548","citing_title":"Towards Understanding Sycophancy in Language Models","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07632","citing_title":"Post-training makes large language models less human-like","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18519","citing_title":"LLM Safety From Within: Detecting Harmful Content with Internal Representations","ref_index":54,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00195","citing_title":"Diversity in Large Language Models under Supervised Fine-Tuning","ref_index":28,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5","json":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5.json","graph_json":"https://pith.science/api/pith-number/XJ4TZBEW6OFGD77EW4OMMYNUN5/graph.json","events_json":"https://pith.science/api/pith-number/XJ4TZBEW6OFGD77EW4OMMYNUN5/events.json","paper":"https://pith.science/paper/XJ4TZBEW"},"agent_actions":{"view_html":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5","download_json":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5.json","view_paper":"https://pith.science/paper/XJ4TZBEW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2310.06452&json=true","fetch_graph":"https://pith.science/api/pith-number/XJ4TZBEW6OFGD77EW4OMMYNUN5/graph.json","fetch_events":"https://pith.science/api/pith-number/XJ4TZBEW6OFGD77EW4OMMYNUN5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5/action/storage_attestation","attest_author":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5/action/author_attestation","sign_citation":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5/action/citation_signature","submit_replication":"https://pith.science/pith/XJ4TZBEW6OFGD77EW4OMMYNUN5/action/replication_record"}},"created_at":"2026-05-19T02:28:36.401269+00:00","updated_at":"2026-05-19T02:28:36.401269+00:00"}