{"work":{"id":"81d8781d-2933-4e89-97ee-9bbfc6d4ca0c","openalex_id":null,"doi":null,"arxiv_id":"2309.00267","raw_key":null,"title":"RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with AI Feedback","authors":null,"authors_text":"Harrison Lee, Samrat Phatale, Hassan Mansoor, Thomas Mesnard, Johan Ferret, Kellie Lu","year":2023,"venue":"cs.CL","abstract":"Reinforcement learning from human feedback (RLHF) has proven effective in aligning large language models (LLMs) with human preferences, but gathering high-quality preference labels is expensive. RL from AI Feedback (RLAIF), introduced in Bai et al., offers a promising alternative that trains the reward model (RM) on preferences generated by an off-the-shelf LLM. Across the tasks of summarization, helpful dialogue generation, and harmless dialogue generation, we show that RLAIF achieves comparable performance to RLHF. Furthermore, we take a step towards \"self-improvement\" by demonstrating that RLAIF can outperform a supervised fine-tuned baseline even when the AI labeler is the same size as the policy, or even the exact same checkpoint as the initial policy. Finally, we introduce direct-RLAIF (d-RLAIF) - a technique that circumvents RM training by obtaining rewards directly from an off-the-shelf LLM during RL, which achieves superior performance to canonical RLAIF. Our results suggest that RLAIF can achieve performance on-par with using human feedback, offering a potential solution to the scalability limitations of RLHF.","external_url":"https://arxiv.org/abs/2309.00267","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T05:05:23.069417+00:00","pith_arxiv_id":"2309.00267","created_at":"2026-05-09T06:55:43.023955+00:00","updated_at":"2026-05-25T05:05:23.069417+00:00","title_quality_ok":true,"display_title":"RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with AI Feedback","render_title":"RLAIF vs. \nRLHF: Scaling Reinforcement Learning from Human Feedback with AI Feedback"},"hub":{"state":{"work_id":"81d8781d-2933-4e89-97ee-9bbfc6d4ca0c","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":35,"external_cited_by_count":null,"distinct_field_count":8,"first_pith_cited_at":"2023-10-23T10:12:23+00:00","last_pith_cited_at":"2026-05-22T05:25:00+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-28T02:28:05.933414+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":8},{"context_role":"dataset","n":1},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"background","n":9},{"context_polarity":"extend","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}