{"work":{"id":"e6d53e5b-2180-482b-82ca-0e64d572c87f","openalex_id":null,"doi":null,"arxiv_id":"2504.13818","raw_key":null,"title":"Not All Rollouts are Useful: Down-Sampling Rollouts in LLM Reinforcement Learning","authors":null,"authors_text":null,"year":2025,"venue":"cs.LG","abstract":"Reinforcement learning with verifiable rewards (RLVR) has emerged as the leading approach for enhancing reasoning capabilities in large language models. However, it faces a fundamental compute and memory asymmetry: rollout generation is embarrassingly parallel and memory-light, whereas policy updates are communication-heavy and memory-intensive. To address this, we introduce PODS (Policy Optimization with Down-Sampling), which decouples rollout generation from policy updates by training only on a strategically selected subset of rollouts, maintaining learning quality while dramatically reducing update costs. We propose a principled subset selection criterion, max-variance down-sampling, that maximizes reward diversity, and provide an efficient $O(n\\log n)$ implementation. Empirically, Group Relative Policy Optimization (GRPO) with PODS achieves the peak test accuracy of vanilla GRPO at least $\\mathbf{1.7\\times}$ faster across the different reasoning benchmarks and hardware configurations we tested.","external_url":"https://arxiv.org/abs/2504.13818","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-23T02:42:26.181203+00:00","pith_arxiv_id":"2504.13818","created_at":"2026-05-10T07:16:54.880590+00:00","updated_at":"2026-05-23T02:42:26.181203+00:00","title_quality_ok":true,"display_title":"Not All Rollouts are Useful: Down-Sampling Rollouts in LLM Reinforcement Learning","render_title":"Not All Rollouts are Useful: Down-Sampling Rollouts in LLM Reinforcement Learning"},"hub":{"state":{"work_id":"e6d53e5b-2180-482b-82ca-0e64d572c87f","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":12,"external_cited_by_count":null,"distinct_field_count":3,"first_pith_cited_at":"2025-02-17T19:16:37+00:00","last_pith_cited_at":"2026-05-15T03:13:35+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-30T11:01:10.836469+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":1}],"polarity_counts":[{"context_polarity":"background","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}