{"total":21,"items":[{"citing_arxiv_id":"2607.01120","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Next-Generation Agentic Reinforcement Learning Systems Enable Self-Evolving Agents","primary_cat":"cs.DC","submitted_at":"2026-07-01T16:08:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Enterprise self-evolving agents require new agentic RL systems built around standardized trajectory data protocols, workload-to-learning data proxies, and automatic policy evolution control planes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05597","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AsyncWebRL: Efficient Multi-Step RL for Visual Web Agents","primary_cat":"cs.LG","submitted_at":"2026-06-04T02:18:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AsyncWebRL reports up to 2.9x training speedup and new SOTA on WebGym OOD split via async overlap plus constant normalizer in GRPO, with largest gains on harder tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04560","ref_index":83,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rollout-Level Advantage-Prioritized Experience Replay for GRPO","primary_cat":"cs.LG","submitted_at":"2026-06-03T07:47:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rollout-level advantage-prioritized experience replay for GRPO recycles high-advantage individual rollouts with age eviction and fresh-anchored batches to outperform standard GRPO on math benchmarks, with gains increasing with model size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03077","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Libra: Efficient Resource Management for Agentic RL Post-Training","primary_cat":"cs.LG","submitted_at":"2026-06-02T03:09:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Libra optimizes GPU allocation across rollout and training in agentic RL via an elastic hybrid pool and C-MLFQ scheduler based on tool-return causal signals, claiming up to 3.0x throughput and 2.5x faster reward convergence on 48 A800 GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30859","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DARTS: Distribution-Aware Active Rollout Trajectory Shaping for Accelerating LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-29T05:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DARTS accelerates LLM RL training up to 1.77x by distribution-aware trajectory sampling and adaptive redundancy allocation that shapes rollouts toward conciseness without performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28095","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SiDP: Memory-Efficient Data Parallelism for Offline LLM Inference","primary_cat":"cs.DC","submitted_at":"2026-05-27T07:52:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SiDP distributes model weights across a DP group with WaS and CaS modes to increase KV cache capacity by up to 1.8x and end-to-end throughput by up to 1.5x over vLLM on H20/H200/B200 GPUs for offline LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17570","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"How Off-Policy Can GRPO Be? Mu-GRPO for Efficient LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-17T17:58:53+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mu-GRPO enables substantially more off-policy GRPO training for LLMs via relaxed clipping and negative-advantage veto in large staged batches, matching standard GRPO performance at ~2x training speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15565","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AstraFlow: Dataflow-Oriented Reinforcement Learning for Agentic LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-15T03:13:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AstraFlow decouples RL components into autonomous dataflow services to natively support multi-policy agentic LLM training, elastic scaling, and cross-region execution with 2.7x speedup on math, code, search, and AgentBench workloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08862","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BubbleSpec: Turning Long-Tail Bubbles into Speculative Rollout Drafts for Synchronous Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-09T10:21:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BubbleSpec exploits long-tail bubbles in synchronous RL by using faster ranks' idle time to pre-generate rollout drafts for speculative decoding, reducing steps by 50% and raising throughput up to 1.8x while preserving exact synchrony.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08639","ref_index":68,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReLibra: Routing-Replay-Guided Load Balancing for MoE Training in Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-09T03:18:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ReLibra uses pre-known token-to-expert routing from RL rollouts to perform inter-batch expert reordering and intra-batch replication, delivering up to 1.6x higher throughput than Megatron-LM and 1.2x over oracle-equipped EPLB while staying within 6-10% of an ideal balanced baseline.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Liao,et al., \"Kimi k1.5: Scal- ing reinforcement learning with llms,\"arXiv preprint arXiv:2501.12599, 2025. [67] Y . Zhong, Z. Zhang, X. Song, H. Hu, C. Jin, B. Wu, N. Chen, Y . Chen, Y . Zhou, C. Wan,et al., \"Streamrl: Scalable, heterogeneous, and elastic rl for llms with disaggregated stream generation,\"arXiv preprint arXiv:2504.15930, 2025. 15 [68] J. Li, Y . Jiang, Y . Zhu, C. Wang, and H. Xu, \"Accelerat- ing distributed {MoE} training and inference with lina,\" inUSENIX ATC, 2023. [69] C. Hwang, W. Cui, Y . Xiong, Z. Yang, Z. Liu, H. Hu, Z. Wang, R. Salas, J. Jose, P. Ram,et al., \"Tutel: Adap- tive mixture-of-experts at scale,\"Proceedings of Ma- chine Learning and Systems, 2023. [70] X. Nie, P."},{"citing_arxiv_id":"2605.08520","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FlashEvolve: Accelerating Agent Self-Evolution with Asynchronous Stage Orchestration","primary_cat":"cs.LG","submitted_at":"2026-05-08T22:04:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlashEvolve accelerates LLM agent self-evolution via asynchronous stage orchestration and inspectable language-space staleness handling, reporting 3.5-4.9x proposal throughput gains over synchronous baselines on GEPA workloads.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Pytorch fsdp: experiences on scaling fully sharded data parallel.arXiv preprint arXiv:2304.11277, 2023. [37] L. Zheng, L. Yin, Z. Xie, C. Sun, J. Huang, C. H. Yu, S. Cao, C. Kozyrakis, I. Stoica, J. E. Gonzalez, et al. Sglang: Efficient execution of structured language model programs.Advances in neural information processing systems, 37:62557-62583, 2024. [38] Y . Zhong, Z. Zhang, X. Song, H. Hu, C. Jin, B. Wu, N. Chen, Y . Chen, Y . Zhou, C. Wan, et al. Streamrl: Scalable, heterogeneous, and elastic rl for llms with disaggregated stream generation. arXiv preprint arXiv:2504.15930, 2025. 12"},{"citing_arxiv_id":"2605.07244","ref_index":121,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Experience Sharing in Mutual Reinforcement Learning for Heterogeneous Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-08T05:01:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mutual Reinforcement Learning allows heterogeneous LLMs to exchange experience through mechanisms like Peer Rollout Pooling, Cross-Policy GRPO Advantage Sharing, and Success-Gated Transfer, with outcome-level sharing identified as favorable on the stability-support trade-off.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06534","ref_index":96,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ROSE: Rollout On Serving GPUs via Cooperative Elasticity for Agentic RL","primary_cat":"cs.DC","submitted_at":"2026-05-07T16:33:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ROSE is a system for cooperative elasticity that co-locates serving and rollout models on shared GPUs, delivering 1.3-3.3x higher end-to-end throughput than fixed-resource baselines while preserving serving SLOs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23945","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Accelerating Long-Tail Generation in Synchronous RLHF Training via Adaptive Tensor Parallelism","primary_cat":"cs.AI","submitted_at":"2026-05-03T05:53:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PAT adaptively reconfigures tensor parallelism in RLHF generation using predictor-guided decisions and lightweight state updates, cutting generation latency by up to 34.6%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26256","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DORA: A Scalable Asynchronous Reinforcement Learning System for Language Model Training","primary_cat":"cs.LG","submitted_at":"2026-04-29T03:25:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DORA's multi-version streaming rollout enables 2-3x higher throughput in asynchronous RL for LLMs while preserving convergence by maintaining policy consistency, data integrity, and bounded staleness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23838","ref_index":68,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"JigsawRL: Assembling RL Pipelines for Efficient LLM Post-Training","primary_cat":"cs.LG","submitted_at":"2026-04-26T18:45:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"JigsawRL achieves up to 1.85x higher throughput in LLM RL pipelines via pipeline multiplexing, sub-stage graphs, and look-ahead scheduling compared to prior systems.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"reasoning [8, 25] and external tool-calling [12, 23] introduce highly stochastic and long-tailed sequence distributions. Existing RL systems explore asynchronous execution [13, 44, 68] and time multiplexing [55, 56, 69] to improve the effi- ciency, but they still fail to eliminate the workload imbalance both within and between stages. As shown in Figure 2(b), one-step off-policy execution such as StreamRL [68] overlaps rollout and training across workers, but the imbalance within 2Monetary costs are estimated based on the on-demand pricing of AWS EC2 A100 instances [4] ($4.10 per GPU hour). 1 arXiv:2604.23838v1 [cs.LG] 26 Apr 2026 4 1 0 0 1 7 A on-policy rollout A one-step off-policy rollout A ≥ two-step off-policy rollout … (d) RollMux: Time-Multiplexing with Disaggregation"},{"citing_arxiv_id":"2604.09107","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TensorHub: Scalable and Elastic Weight Transfer for LLM RL Training","primary_cat":"cs.DC","submitted_at":"2026-04-10T08:40:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TensorHub uses Reference-Oriented Storage to enable scalable weight transfer in LLM RL training by referencing replicated GPU weights, achieving up to 19x reduction in cross-datacenter stall time.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"tests operate solely at the reference-request layer, without involving data transfer, and therefore do not require RDMA NICs or GPUs. Because all requests originate from a single process, the resulting executions are deterministic and repro- ducible, greatly simplifying debugging. This approach to sim- ulated concurrency testing is inspired by FoundationDB [46], which has demonstrated in production its effectiveness at uncovering subtle concurrency bugs in distributed systems. End-to-End Checksum.To verify end-to-end correctness, we further implement checksums. Upon publishing, the client computes the checksum of each tensor and attaches it to the reference. When another client acquires this reference as a"},{"citing_arxiv_id":"2512.12476","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HetRL: Efficient Reinforcement Learning for LLMs in Heterogeneous Environments","primary_cat":"cs.DC","submitted_at":"2025-12-13T22:20:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HetRL delivers up to 9.17x higher throughput for LLM RL training on heterogeneous GPUs by using hybrid and ILP-based schedulers to solve a joint optimization problem over computation and data dependencies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.14617","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Seer: Online Context Learning for Fast Synchronous LLM Reinforcement Learning","primary_cat":"cs.DC","submitted_at":"2025-11-18T16:12:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Seer improves synchronous LLM RL rollout throughput by up to 2.04x and reduces long-tail latency by 72-94% via divided rollout, context-aware scheduling, and adaptive grouped speculative decoding based on prompt similarity observations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.19225","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RLBoost: Harvesting Preemptible Resources for Cost-Efficient Reinforcement Learning on LLMs","primary_cat":"cs.DC","submitted_at":"2025-10-22T04:19:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RLBoost harvests preemptible GPUs for RL rollout via a hybrid architecture with adaptive offload, pull-based transfer, and token-level migration, delivering 1.51x-1.97x throughput and 28-49% better cost efficiency than on-demand-only setups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.24298","ref_index":70,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language Reasoning","primary_cat":"cs.LG","submitted_at":"2025-05-30T07:18:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AReaL decouples generation and training in LLM reinforcement learning to achieve up to 2.77x speedup with matched or better performance on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}