{"total":52,"items":[{"citing_arxiv_id":"2605.31455","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIFT: Decoupled Rollouts and Importance-Weighted Fine-Tuning for Efficient Multi-Turn Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-29T15:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIFT achieves multi-turn RL performance via offline importance-weighted SFT by leveraging the equivalence of KL-regularized RL to weighted supervised learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30749","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FLAG: Flow Policy MaxEnt-RL by Latent Augmented Guidance","primary_cat":"cs.LG","submitted_at":"2026-05-29T02:25:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FLAG augments state space with flow latent variable to optimize a proxy MaxEnt-RL objective, enabling expressive policies with limited importance samples in high-dimensional control.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28409","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient Post-training of LLMs for Code Generation With Offline Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-27T12:43:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Offline RL post-training boosts code generation performance in LLMs, with larger gains for small models and hard problems, using pre-collected datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23551","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Goal-Conditioned Agents that Learn Everything All at Once","primary_cat":"cs.LG","submitted_at":"2026-05-22T12:17:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LEO enables efficient all-goals learning in goal-conditioned RL by jointly predicting for all goals in one network pass, yielding >250x speedup over relabelling and better performance on Craftax.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18675","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"COOPO: Cyclic Offline-Online Policy Optimization Algorithm","primary_cat":"cs.LG","submitted_at":"2026-05-18T17:15:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"COOPO is a cyclic offline-online RL algorithm that repeatedly anchors the policy to a dataset via KL-regularized updates then fine-tunes online, claiming better sample efficiency and monotonic improvement under coverage assumptions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18580","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Outcome Looks Right But Discipline Fails: Trace-Based Evaluation Under Hidden Competitor State","primary_cat":"cs.AI","submitted_at":"2026-05-18T15:58:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces discipline stability, a trace-based evaluation paradigm for checking if RL agents maintain behavioral discipline like rule-based competitors in hidden-state competitive settings such as hotel pricing and bidding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14497","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ROAD: Adaptive Data Mixing for Offline-to-Online Reinforcement Learning via Bi-Level Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-14T07:35:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ROAD formulates data mixing as a bi-level optimization problem solved via multi-armed bandit to adaptively balance offline priors and online updates in RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14350","ref_index":160,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Distributionally Robust Multi-Task Reinforcement Learning via Adaptive Task Sampling","primary_cat":"cs.LG","submitted_at":"2026-05-14T04:22:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DRATS derives a minimax objective from a feasibility formulation of MTRL to adaptively sample tasks with the largest return gaps, leading to better worst-task performance on MetaWorld benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13435","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Q-Flow: Stable and Expressive Reinforcement Learning with Flow-Based Policy","primary_cat":"cs.LG","submitted_at":"2026-05-13T12:31:02+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12416","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Aligning Flow Map Policies with Optimal Q-Guidance","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:12:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Flow map policies enable fast one-step inference for flow-based RL policies, and FMQ provides an optimal closed-form Q-guided target for offline-to-online adaptation under trust-region constraints, achieving SOTA performance.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Let us examine the terms inside the squared norm and remove the stop-gradient operator sg. We compute the partial derivative with respect to the start timer: ∂rXr,t(ar |s) =−u θ r,t(ar |s) + (t−r)∂ ruθ r,t(ar |s).(28) Plugging this back into the Eulerian objective, we have, L=E   −uθ r,t(ar |s) | {z } T1 + (t−r)∂ ruθ r,t(ar |s) +∇X r,t(ar |s)u θ r,r(ar |s) | {z } T2 2  .(29) Applying a stop-gradient toT 2 and taking parameter gradients, Plugging this back into the Eulerian objective, we have, ∇θL(θ) = 2E \u0002 uθ r,t(ar |s)− ∇ θuθ r,t(ar |s)· uθ r,t(ar |s) +sg(T 2) \u0001\u0003 .(30) Now expanding the spatial gradient term inT 2, that is∇X r,t(ar |s)u θ r,r(ar |s): ∇Xr,t(ar |s)u θ r,r(ar |s) =u θ r,r(ar |s) + (t−r)∇u θ r,t(ar |s)u θ"},{"citing_arxiv_id":"2605.12379","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Discrete Flow Matching for Offline-to-Online Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-12T16:44:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIFT enables stable offline-to-online fine-tuning of CTMC policies in discrete RL via advantage-weighted discrete flow matching, path-space regularization, and candidate-set approximation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11387","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Behavioral Mode Discovery for Fine-tuning Multimodal Generative Policies","primary_cat":"cs.LG","submitted_at":"2026-05-12T01:19:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Unsupervised behavioral mode discovery combined with mutual information rewards enables RL fine-tuning of multimodal generative policies that achieves higher success rates without losing action diversity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11151","ref_index":12,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RankQ: Offline-to-Online Reinforcement Learning via Self-Supervised Action Ranking","primary_cat":"cs.AI","submitted_at":"2026-05-11T18:58:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RankQ augments temporal-difference Q-learning with a multi-term self-supervised ranking loss to enforce structured action ordering, yielding competitive or better results than prior methods on D4RL and large gains in vision-based robot fine-tuning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"A large class of offline RL methods mitigates extrapolation error by constraining policies toward the support of the offline dataset. Methods such as BCQ [8], BEAR [9], and BRAC [10] explicitly constrain policy updates toward dataset actions. Subsequent methods instead combine RL objectives with behavior cloning regularization. AW AC [11], TD3+BC [12], and ReBRAC [13] stabilize learning through various forms of behavior regularization, while IQL [14] avoids explicit optimization over OOD actions using expectile regression and advantage-weighted updates [ 15]. While originally proposed for offline RL, many of these methods are also commonly used to initialize policies for subsequent online fine-tuning."},{"citing_arxiv_id":"2605.10734","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"XQCfD: Accelerating Fast Actor-Critic Algorithms with Prior Data and Prior Policies","primary_cat":"cs.LG","submitted_at":"2026-05-11T15:38:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"XQCfD accelerates actor-critic RL by using prior data, pretrained policies, and stationary architectures to achieve state-of-the-art results on Adroit, Robomimic, and MimicGen manipulation benchmarks with low update-to-data ratios.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"REDQ [6] as an effective RL algorithm in this setting. However, by not utilizing BC initialization effectively, buffer-based methods forfeit potentially significant sample efficiency gains, which we show in our experimental results. Offline-to-online reinforcement learning.A related setting where the prior data is suboptimal considers continuing offline RL with online interactions and learning [21]. This setting is challenging due to the difficulty of performing continual learning with function approximation with severe performance degradation, characterized as 'catastrophic forgetting' or a lack of network 'plasticity'. Solutions include BC regularization [44], careful sampling of offline and online samples from the 2Multiplexing refers to combining samples from several replay buffers within a single minibatch."},{"citing_arxiv_id":"2605.10289","ref_index":57,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sample-Mean Anchored Thompson Sampling for Offline-to-Online Learning with Distribution Shift","primary_cat":"cs.LG","submitted_at":"2026-05-11T09:50:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Anchor-TS defines arm indices as the median of an online posterior sample, a hybrid posterior sample, and the online sample mean to correct distribution-shift bias and safely accelerate online learning with offline data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11009","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ACSAC: Adaptive Chunk Size Actor-Critic with Causal Transformer Q-Network","primary_cat":"cs.LG","submitted_at":"2026-05-10T10:00:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ACSAC adaptively selects action chunk sizes via a causal Transformer Q-network in actor-critic RL, proves the Bellman operator is a contraction, and reports state-of-the-art results on long-horizon manipulation tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"continuous action chunks. In the offline or offline-to-online setting, Q-chunking [ 21] runs RL at an action chunk level with a flow BC policy and rejection sampling. DQC [ 22] decouples the policy chunk size from the critic chunk size, with the policy predicting a shorter action chunk while retaining the value learning benefits of the chunked critic. MAC [ 31] combines an action-chunk dynamics model with rejection sampling from an expressive flow BC policy. DEAS [16] leverages action sequences for training critics with detached value learning and classification loss. CGQ [36] regularizes a single-step critic toward a chunked critic. All of the above RL methods rely on a fixed chunk size across states and tasks."},{"citing_arxiv_id":"2605.07727","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Drifting Field Policy: A One-Step Generative Policy via Wasserstein Gradient Flow","primary_cat":"cs.LG","submitted_at":"2026-05-08T13:34:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DFP is a one-step generative policy using Wasserstein gradient flow on a drifting model backbone, with a top-K behavior cloning surrogate, that reaches SOTA on Robomimic and OGBench manipulation tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"uniquely benefits the drifting backbone owing to its non-ODE parameterization. With one-step inference, DFP achieves state-of-the-art performance on several manipulation tasks across Robomimic and OGBench, outperforming ODE-based policies. 1 Introduction Offline-to-online reinforcement learning (RL) has emerged as a practical paradigm for continuous control [42, 30, 43, 3], where an RL agent is first pretrained on static demonstrations and then refined through online interaction. To faithfully capture the multimodal action distributions of real- world demonstrations beyond unimodal Gaussians, the field has increasingly turned to generative policies, with ODE-based backbones such as diffusion and flow policies [62, 21, 11, 45, 12] proving"},{"citing_arxiv_id":"2605.06529","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Market-Alignment Risk in Pricing Agents: Trace Diagnostics and Trace-Prior RL under Hidden Competitor State","primary_cat":"cs.AI","submitted_at":"2026-05-07T16:31:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"In a hotel revenue-management simulator, standard RL agents game scalar RevPAR rewards under hidden competitor states, but Trace-Prior RL matches both revenue metrics and price distributions by training a stochastic policy with a KL penalty to a learned market prior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05863","ref_index":11,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SOPE: Stabilizing Off-Policy Evaluation for Online RL with Prior Data","primary_cat":"cs.LG","submitted_at":"2026-05-07T08:32:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOPE dynamically controls offline training length in online RL using actor-aligned OPE on validation data to stop when benefits saturate, achieving up to 45.6% better performance and 22x less computation on Minari tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05544","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Adaptive Q-Chunking for Offline-to-Online Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-07T00:48:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adaptive Q-Chunking selects optimal action chunk sizes at each state via normalized advantage comparisons to outperform fixed chunk sizes in offline-to-online RL on robot benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05123","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Adaptive Policy Selection and Fine-Tuning under Interaction Budgets for Offline-to-Online Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-06T16:51:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An adaptive UCB-based policy selection and fine-tuning strategy improves performance over standard O2O-RL baselines under interaction budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02469","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reference-Sampled Boltzmann Projection for KL-Regularized RLVR: Target-Matched Weighted SFT, Finite One-Shot Gaps, and Policy Mirror Descent","primary_cat":"cs.LG","submitted_at":"2026-05-04T11:10:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reference-sampled weighted SFT with prompt-normalized Boltzmann weights induces the same policy as fixed-reference KL-regularized RLVR, with BOLT as the estimator and a finite one-shot error decomposition separating coverage, variance, and other terms.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ReST reuse generated reasoning traces through self-training or reward filtering [ 13, 53], scaling studies train on generated mathematical reasoning data [50], and offline RL studies fixed logged data under distribution shift through conservative value learning or sequence modeling [3, 25]. Reward- augmented likelihood, advantage-weighted regression, and AW AC turn rewards or advantages into weighted regression objectives [31, 32, 35]. Preference objectives such as DPO, KTO, ORPO, and SimPO derive supervised or reference-free losses from preference modeling choices [11, 18, 29, 38]. Recent weighted-SFT variants choose reward-, policy-, or data-ratio weights, including V AR, SPR, Refit, and IWSFT [10, 30, 37, 57]. BOLT is closest to this static family, but its defining constraint"},{"citing_arxiv_id":"2605.01968","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AdamO: A Collapse-Suppressed Optimizer for Offline RL","primary_cat":"cs.LG","submitted_at":"2026-05-03T16:53:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdamO modifies Adam with an orthogonality correction to ensure the spectral radius of the TD update operator stays below one, providing a theoretical stability guarantee for offline RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01862","ref_index":119,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"QHyer: Q-conditioned Hybrid Attention-mamba Transformer for Offline Goal-conditioned RL","primary_cat":"cs.LG","submitted_at":"2026-05-03T13:11:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QHyer replaces return-to-go with a state-conditioned Q-estimator and adds a gated hybrid attention-mamba backbone to achieve state-of-the-art performance in offline goal-conditioned RL on both Markovian and non-Markovian datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00416","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Learning While Deploying: Fleet-Scale Reinforcement Learning for Generalist Robot Policies","primary_cat":"cs.RO","submitted_at":"2026-05-01T05:20:26+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Bellemare, \"Reincarnating reinforcement learn- ing: Reusing prior computation to accelerate progress,\" Advances in neural information processing systems, vol. 35, pp. 28 955-28 971, 2022. [42] P. J. Ball, L. Smith, I. Kostrikov, and S. Levine, \"Effi- cient online reinforcement learning with offline data,\" in International Conference on Machine Learning. PMLR, 2023, pp. 1577-1594. [43] A. Nair, A. Gupta, M. Dalal, and S. Levine, \"Awac: Accelerating online reinforcement learning with offline datasets,\"arXiv preprint arXiv:2006.09359, 2020. [44] Y . Song, Y . Zhou, A. Sekhari, J. A. Bagnell, A. Krish- namurthy, and W. Sun, \"Hybrid rl: Using both offline and online data can make rl efficient,\"arXiv preprint arXiv:2210.06718, 2022."},{"citing_arxiv_id":"2604.22873","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Policies Cannot Be Retrained: A Unified Closed-Form View of Post-Training Steering in Offline Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-23T20:20:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"For diagonal-Gaussian frozen actors, PoE with alpha equals KL adaptation with beta = alpha/(1-alpha); empirically, composition shows an actor-competence ceiling with 4/5/3 HELP/FROZEN/HURT split on D4RL and zero success on AntMaze.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17919","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Fisher Decorator: Refining Flow Policy via a Local Transport Map","primary_cat":"cs.LG","submitted_at":"2026-04-20T07:54:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fisher Decorator refines flow policies in offline RL via a local transport map and Fisher-matrix quadratic approximation of the KL constraint, yielding controllable error near the optimum and SOTA benchmark results.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Baselines for Offline Setting.To ensure a comprehensive evaluation, we compare our method against eleven recent offline RL al- gorithms spanning a broad range of architec- tural designs and policy extraction paradigms. These include: (1) Gaussian policies: BC, IQL [40], and ReBRAC [37]; (2) diffusion policies: IDQL [6], SRPO [7], and CAC [14]; and (3) flow policies: FAWAC [41], IFQL [9], FQL [19], and DeFlow [23].Baselines for Offline-to-Online Setting.For offline-to-online RL experi- ments, we consider prior offline RL methods (IQL, ReBRAC, IFQL, FQL and DeFlow) that support 7 fine-tuning and achieve strong performance. Meanwhile, we consider two other strong algorithms designed for online fine-tuning: Cal-QL [42] and RLPD [43]."},{"citing_arxiv_id":"2604.14895","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Importance Sampling: Rejection-Gated Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-04-16T11:39:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RGPO replaces importance sampling with a smooth [0,1] acceptance gate in policy gradients, unifying TRPO/PPO/REINFORCE, bounding variance for heavy-tailed ratios, and showing gains in online RLHF experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14333","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Missing Becomes Structure: Intent-Preserving Policy Completion from Financial KOL Discourse","primary_cat":"cs.LG","submitted_at":"2026-04-15T18:39:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KICL completes execution decisions in KOL financial discourse using offline RL, achieving top returns and Sharpe ratios with no unsupported trades or direction changes on YouTube and X data from 2022-2025.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10165","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MoRI: Mixture of RL and IL Experts for Long-Horizon Manipulation Tasks","primary_cat":"cs.RO","submitted_at":"2026-04-11T11:24:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MoRI dynamically mixes RL and IL experts with variance-based switching and IL regularization to reach 97.5% success in four real-world robotic tasks while cutting human intervention by 85.8%.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Other studies extend consistent offline-to-online fine-tuning to Vision-Language-Action (VLA) models and real-world RL [12], or employ residual RL frameworks built upon a Behavioral Cloning (BC) base policy [18] to facilitate learning on physical robots. Despite these advances, these approaches are often not fully validated on long-horizon tasks. While RL is frequently used for post-training VLA models [19], [20] to boost performance, the high compu- tational cost typically hinders practical deployment. Recent efforts have also integrated iterative offline and online RL with diffusion policies [9] to reduce human intervention, yet the convergence time (e.g., 14 hours per task) remains a significant bottleneck. There is still a need for an efficient framework that combines IL and RL to handle long-horizon"},{"citing_arxiv_id":"2604.08958","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WOMBET: World Model-Based Experience Transfer for Robust and Sample-efficient Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-10T04:57:54+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08174","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Value-Guidance MeanFlow for Offline Multi-Agent Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-09T12:31:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VGM²P achieves SOTA-comparable performance in offline MARL via value-guided conditional behavior cloning with MeanFlow, enabling efficient single-step action generation insensitive to regularization coefficients.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08036","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PriPG-RL: Privileged Planner-Guided Reinforcement Learning for Partially Observable Systems with Anytime-Feasible MPC","primary_cat":"cs.LG","submitted_at":"2026-04-09T09:41:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PriPG-RL trains RL policies for POMDPs by distilling knowledge from a privileged anytime-feasible MPC planner into a P2P-SAC policy, improving sample efficiency and performance in partially observable robotic navigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.22801","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unleashing the Potential of Diffusion Models for End-to-End Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-02-26T09:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces Hyper Diffusion Planner (HDP), a diffusion-based E2E AD framework that identifies insights on loss space, trajectory representation and data scaling, adds RL post-training, and reports 10x performance gains over 200 km of real-world testing across 6 scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.09580","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SERNF: Sample-Efficient Real-World Dexterous Policy Fine-Tuning via Action-Chunked Critics and Normalizing Flows","primary_cat":"cs.RO","submitted_at":"2026-02-10T09:28:20+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.18662","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pseudo-Expert Regularized Offline RL for End-to-End Autonomous Driving in Photorealistic Closed-Loop Environments","primary_cat":"cs.RO","submitted_at":"2025-12-21T09:21:04+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pseudo-expert regularized offline RL reduces collisions and improves route completion for camera-based driving models trained on fixed simulator datasets from nuScenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.03828","ref_index":9,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Static Constraints to Dynamic Adaptation: Sample-Level Constraint Relaxation for Offline-to-Online Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2025-11-05T19:48:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DARE performs sample-level constraint relaxation in offline-to-online RL by conditioning on behavioral consistency with a behavior model via posterior-induced exchange, yielding improved fine-tuning stability and performance on D4RL benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.16615","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLM-Guided Task- and Affordance-Level Exploration in Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2025-09-20T10:37:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM-TALE steers RL exploration using LLM-generated plans at task and affordance levels with online suboptimality correction, improving sample efficiency and success rates on pick-and-place tasks without human supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.07969","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcement Learning with Action Chunking","primary_cat":"cs.LG","submitted_at":"2025-07-10T17:48:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Q-chunking improves offline-to-online RL sample efficiency on long-horizon sparse-reward manipulation tasks by applying action chunking to TD learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.00480","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Posterior Inference in Latent Space for Scalable Constrained Black-box Optimization","primary_cat":"cs.LG","submitted_at":"2025-07-01T06:55:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reformulates constrained black-box optimization as posterior inference in latent space of flow-based models amortized by outsourced diffusion models, claiming superior performance on synthetic and real tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.15799","ref_index":88,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Steering Your Diffusion Policy with Latent Space Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2025-06-18T18:35:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DSRL steers pretrained diffusion policies for robotics by applying RL to their latent noise inputs, achieving sample-efficient real-world adaptation with only black-box access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.05762","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BiTrajDiff: Bidirectional Trajectory Generation with Diffusion Models for Offline Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2025-06-06T05:41:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BiTrajDiff augments offline RL datasets by running independent forward and backward diffusion processes from intermediate states, yielding higher performance than prior one-directional data-augmentation baselines on D4RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.18719","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLA-RL: Towards Masterful and General Robotic Manipulation with Scalable Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2025-05-24T14:42:51+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLA-RL applies online RL to pretrained VLAs, yielding a 4.5% gain over strong baselines on 40 LIBERO manipulation tasks and matching commercial models like π₀-FAST.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.13934","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"COLSON: Controllable Learning-Based Social Navigation via Diffusion-Based Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2025-03-18T06:02:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"COLSON applies diffusion models to reinforcement learning for social robot navigation and adds controllability mechanisms that enable zero-shot adaptation to unseen static obstacles and altered objectives.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2409.00588","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Diffusion Policy Policy Optimization","primary_cat":"cs.RO","submitted_at":"2024-09-01T02:47:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DPPO fine-tunes diffusion policies via policy gradients and outperforms prior RL approaches for diffusion policies and PG-tuned alternatives on robot benchmarks while enabling stable training and hardware deployment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.13301","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training Diffusion Models with Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2023-05-22T17:57:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DDPO uses policy gradients on the denoising process to optimize diffusion models for arbitrary rewards like human feedback or compressibility.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2304.10573","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IDQL: Implicit Q-Learning as an Actor-Critic Method with Diffusion Policies","primary_cat":"cs.LG","submitted_at":"2023-04-20T18:04:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IDQL generalizes IQL into an actor-critic framework and uses diffusion policies for robust policy extraction, outperforming prior offline RL methods.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"the critic training is completely separated from this model, leading to IDQL's computational efﬁciency. 6.2 Online Finetuning After ofﬂine training, policies can be improved with online interactions. We test the procedure of freezing the behavior policy and ﬁnetuning the value networks only, as well as ﬁnetuning all networks, as described in Section 4.2. We compare to current state-of-the-art ﬁnetuning methods: Cal-QL [34], RLPD [3], and IQL [ 26]. Results are presented in Table 3. We see large improvement in both pre-training and ﬁnal ﬁne-tuning performance compared to IQL. IDQL also remains competitive with RLPD and Cal-QL in ﬁnetuning, while having stronger pre-training results. Most of the gains come from improvements in the hardest antmaze-large environments. 6."},{"citing_arxiv_id":"2211.15657","ref_index":188,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Is Conditional Generative Modeling all you need for Decision-Making?","primary_cat":"cs.LG","submitted_at":"2022-11-28T18:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Return-conditional diffusion models for policies outperform offline RL on benchmarks by circumventing dynamic programming and enable constraint or skill composition.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2208.06193","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Diffusion Policies as an Expressive Policy Class for Offline Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2022-08-12T09:54:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Diffusion-QL uses conditional diffusion models as expressive policies in offline RL by coupling behavior cloning with Q-value maximization, achieving SOTA on most D4RL tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2106.01345","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decision Transformer: Reinforcement Learning via Sequence Modeling","primary_cat":"cs.LG","submitted_at":"2021-06-02T17:53:39+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Decision Transformer casts RL as autoregressive sequence modeling conditioned on desired returns, past states and actions, matching or exceeding offline RL baselines on Atari, Gym and Key-to-Door tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}