{"total":70,"items":[{"citing_arxiv_id":"2606.27939","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Two-Stage Fine-Tuning for Protein Sequence Generation with Targeted Amino-Acid Composition","primary_cat":"cs.LG","submitted_at":"2026-06-26T10:29:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A domain-adaptive fine-tuning stage followed by reward-weighted RL fine-tuning produces protein sequences whose amino-acid composition matches a specified target while preserving sequence statistics and diversity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26006","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FORCE: Efficient VLA Reinforcement Fine-Tuning via Value-Calibrated Warm-up and Self-Distillation","primary_cat":"cs.RO","submitted_at":"2026-06-24T16:23:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FORCE is a 3-stage RL fine-tuning method for VLA models that stabilizes Q-function via on-policy warm-up and filters high-value actions for updates, claiming 79% success rate gains and 32.5% faster training without human intervention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08414","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PACT: Self-Evolving Physical Safety Alignment for Diffusion Policies in Embodied Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-07T02:27:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PACT is a self-evolving post-training framework that projects diffusion policies onto constraint-feasible regions via reverse-KL distillation and a tightening curriculum, reporting 31% fewer safety violations and 30.7% higher task success on embodied manipulation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31494","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consolidating Rewarded Perturbations for LLM Post-Training","primary_cat":"cs.CL","submitted_at":"2026-05-29T16:16:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoRP consolidates reward-weighted perturbations into a single model via low-rank structure, improving base LLMs by 8.1 points on average while using one-tenth the budget of prior ensembles and one forward pass.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31455","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIFT: Decoupled Rollouts and Importance-Weighted Fine-Tuning for Efficient Multi-Turn Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-29T15:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIFT achieves multi-turn RL performance via offline importance-weighted SFT by leveraging the equivalence of KL-regularized RL to weighted supervised learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30749","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FLAG: Flow Policy MaxEnt-RL by Latent Augmented Guidance","primary_cat":"cs.LG","submitted_at":"2026-05-29T02:25:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FLAG augments state space with flow latent variable to optimize a proxy MaxEnt-RL objective, enabling expressive policies with limited importance samples in high-dimensional control.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23551","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Goal-Conditioned Agents that Learn Everything All at Once","primary_cat":"cs.LG","submitted_at":"2026-05-22T12:17:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LEO enables efficient all-goals learning in goal-conditioned RL by jointly predicting for all goals in one network pass, yielding >250x speedup over relabelling and better performance on Craftax.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22711","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Abstraction for Offline Goal-Conditioned Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-21T16:50:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces relativised options and hierarchical abstraction to reuse experience across similar contexts in offline GCRL, with two algorithms demonstrating performance gains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21195","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RankE: End-to-End Post-Training for Discrete Text-to-Image Generation with Decoder Co-Evolution","primary_cat":"cs.CV","submitted_at":"2026-05-20T13:56:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RankE co-evolves AR policy and decoder via alternating ranking optimization, improving both FID and CLIP scores on LlamaGen-XL and Janus-Pro where policy-only RL degrades FID.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20609","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Compositional Transduction with Latent Analogies for Offline Goal-Conditioned Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-20T01:54:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Proposes latent analogies and analogy transduction to enable compositional generalization to unseen goal-context pairs in offline GCRL, outperforming trajectory-stitching baselines on manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20555","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Complementing reinforcement learning with SFT through logit averaging in the post training of LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-19T23:15:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Logit averaging inside GRPO yields higher or comparable benchmark accuracy to KL-regularized GRPO without using KL terms or a critic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20506","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcing Human Behavior Simulation via Verbal Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-19T21:23:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DITTO uses RL with verbal feedback to train LLMs for human behavior simulation, reporting 36% average gains over base models and outperforming GPT-5.4 on 6 of 10 SOUL benchmark tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18675","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"COOPO: Cyclic Offline-Online Policy Optimization Algorithm","primary_cat":"cs.LG","submitted_at":"2026-05-18T17:15:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"COOPO is a cyclic offline-online RL algorithm that repeatedly anchors the policy to a dataset via KL-regularized updates then fine-tunes online, claiming better sample efficiency and monotonic improvement under coverage assumptions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18320","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ISEP: Implicit Support Expansion for Offline Reinforcement Learning via Stochastic Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-18T12:39:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ISEP expands action support in offline RL via value interpolation between data and policy samples, then uses stochastic policy optimization to avoid mode collapse in the resulting multimodal objective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15603","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Offline Reinforcement Learning with Universal Horizon Models","primary_cat":"cs.LG","submitted_at":"2026-05-15T04:30:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Universal horizon models extend geometric horizon models to arbitrary horizons and apply winsorized distributions for stable offline RL value learning, outperforming baselines on 100 OGBench tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15113","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning from Language Feedback via Variational Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:27:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VPD frames language feedback learning as variational EM so the teacher policy refines itself via trust-region updates on outcomes while the student learns dense token distributions on its own rollouts, outperforming fixed-teacher baselines on reasoning and code tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13435","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Q-Flow: Stable and Expressive Reinforcement Learning with Flow-Based Policy","primary_cat":"cs.LG","submitted_at":"2026-05-13T12:31:02+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13207","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Switching Successor Measures for Hierarchical Zero-shot Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-13T08:58:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Switching successor measures extend classical successor measures to enable hierarchical zero-shot RL via the FB π-Switch algorithm that extracts subgoal-selection and control policies from forward-backward representations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11726","ref_index":42,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Block-R1: Rethinking the Role of Block Size in Multi-domain Reinforcement Learning for Diffusion Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-12T08:09:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces Block-R1 benchmark, Block-R1-41K dataset, and a conflict score to handle domain-specific optimal block sizes in RL post-training of diffusion LLMs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Principled RL for Diffusion LLMs Emerges from a Sequence-Level Perspective. InICLR, 2026. [40] J. Pan, J. Zhang, X. Wang, L. Yuan, H. Peng, and A. Suhr. Tinyzero.GitHub repository https://github.com/Jiayi-Pan/TinyZero, 2025. [41] V . M. Panaretos and Y . Zemel. Statistical aspects of wasserstein distances.Annual Review of Statistics and Its Application, 2019. [42] X. B. Peng, A. Kumar, G. Zhang, and S. Levine. Advantage-Weighted Regression: Simple and Scalable Off-Policy Reinforcement Learning.arXiv preprint arXiv:1910.00177, 2019. [43] R. Rafailov, A. Sharma, E. Mitchell, S. Ermon, C. D. Manning, and C. Finn. Direct Pref- erence Optimization: Your Language Model is Secretly a Reward Model.arXiv preprint arXiv:2305."},{"citing_arxiv_id":"2605.11501","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decaf: Improving Neural Decompilation with Automatic Feedback and Search","primary_cat":"cs.SE","submitted_at":"2026-05-12T04:21:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Decaf uses compiler feedback and search to improve neural decompilation, boosting semantic success rate from 26.0% to 83.9% on ExeBench Real -O2 split.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"compiler configurations and instruction set architectures is an important direction for future work if the reranker is intended to generalize to these domains. Reinforcement Learning and Iterative Refinement . As described in Section 2.2, we sample eight candidates per function during data collection, yielding positive and neg- ative sequences that could be used for offline RL [32], [33], Direct Preference Optimization [34], or online RL algorithms such as PPO [35], [36] or GRPO [37]. In earlier iterations we also explored an iterative refinement model to recursively edit generator outputs, but did not find significant improvements. 5. Related Work 5.1. Traditional Decompilation Decompilation has been studied for over five decades [15], but most modern systems trace their lineage"},{"citing_arxiv_id":"2605.11387","ref_index":65,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Behavioral Mode Discovery for Fine-tuning Multimodal Generative Policies","primary_cat":"cs.LG","submitted_at":"2026-05-12T01:19:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Unsupervised behavioral mode discovery combined with mutual information rewards enables RL fine-tuning of multimodal generative policies that achieves higher success rates without losing action diversity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10654","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Active Learning for Gaussian Process Regression Under Self-Induced Boltzmann Weights","primary_cat":"cs.LG","submitted_at":"2026-05-11T14:38:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AB-SID-iVAR enables Gaussian process active learning for self-induced Boltzmann distributions by closed-form approximation of the target, with high-probability error vanishing guarantees and empirical gains on PES and drug discovery tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07727","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Drifting Field Policy: A One-Step Generative Policy via Wasserstein Gradient Flow","primary_cat":"cs.LG","submitted_at":"2026-05-08T13:34:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DFP is a one-step generative policy using Wasserstein gradient flow on a drifting model backbone, with a top-K behavior cloning surrogate, that reaches SOTA on Robomimic and OGBench manipulation tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Zhai, A. Singh, M. Sobol Mark, Y . Ma, C. Finn, A. Kumar, and S. Levine. Cal-ql: Calibrated offline rl pre-training for efficient online fine-tuning. InNeurIPS, 2023. [44] S. Park, K. Frans, B. Eysenbach, and S. Levine. Ogbench: Benchmarking offline goal- conditioned rl. InICLR, 2025. [45] S. Park, Q. Li, and S. Levine. Flow q-learning. InICML, 2025. [46] X. B. Peng, A. Kumar, G. Zhang, and S. Levine. Advantage-weighted regression: Simple and scalable off-policy reinforcement learning.arXiv preprint arXiv:1910.00177, 2019. [47] J. Peters, K. Mulling, and Y . Altun. Relative entropy policy search. InAAAI, 2010. [48] A. Prasad, K. Lin, J. Wu, L. Zhou, and J. Bohg. Consistency policy: Accelerated visuomotor"},{"citing_arxiv_id":"2605.07545","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Implicit Preference Alignment for Human Image Animation","primary_cat":"cs.CV","submitted_at":"2026-05-08T10:19:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IPA aligns animation models for superior hand quality via implicit reward maximization on self-generated samples plus hand-focused local optimization, avoiding expensive paired data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06156","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Entropy-Regularized Adjoint Matching for Offline Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-07T12:47:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ME-AM adds mirror-descent entropy maximization and a mixture behavior prior to adjoint matching in flow-based policies to mitigate popularity bias and support binding in offline RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06139","ref_index":44,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Listwise Policy Optimization: Group-based RLVR as Target-Projection on the LLM Response Simplex","primary_cat":"cs.LG","submitted_at":"2026-05-07T12:38:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Listwise Policy Optimization explicitly performs target-projection on the LLM response simplex, unifying and improving group-based RLVR methods with monotonic improvement and flexible divergences.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"exact proximal updates πt+1(y)∝π t(y)exp(R(y)/τ) , the iteration satisfies πt(y)∝π 0(y)exp(tR(y)/τ) and Eπt [R]→max y R(y)ast→∞. Proof. By induction: the base case t=0 is trivial. If πt(y)∝π 0(y)exp(tR(y)/τ) , then πt+1(y)∝ πt(y)exp(R(y)/τ)∝π 0(y)exp((t+1)R(y)/τ). For convergence, consider any two responsesy 1,y 2 withR(y 1)>R(y 2): πt(y1) πt(y2) = π0(y1) π0(y2) exp t· R(y1)−R(y 2) τ \u0001 →∞.(44) Sinceπ 0(y)>0for ally, the mass concentrates onarg max y R(y), givingE πt [R]→max y R(y). Connecting global optimality to LPO.Proposition 2 characterizes the ideal full-space proximal operator: if one could exactly apply the Gibbs update over the entire response space, the resulting iteration converges to the global RL optimum. For autoregressive LLMs, however, the required partition function is intractable over the"},{"citing_arxiv_id":"2605.04653","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Threshold-Guided Optimization for Visual Generative Models","primary_cat":"cs.LG","submitted_at":"2026-05-06T08:59:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A threshold-guided alignment method lets visual generative models be optimized directly from scalar human ratings instead of requiring paired preference data.","context_count":1,"top_context_role":"method","top_context_polarity":"unclear","context_text":"∇3Ln(¯θ)[∆,∆] +o p(n−1). (20) Step 3: Take expectations.Since E[∇Ln(θ∗)] = 0 and ∇2Ln(θ∗) p →H , we can replace the empirical Hessian and third derivatives by their population counterpartsHandJup too(n −1)terms: E[∆] =− 1 2 H −1JE[∆⊗∆] +o(n −1).(21) Step 4: Insert asymptotic covariance.From standard M-estimator theory, E[∆⊗∆] = 1 n H −1SH −1 +o(n −1).(22) Substituting Eq. 22 into Eq. 21 gives E[∆] =− 1 2n H −1J H −1SH −1\u0001 +o(n −1), which matches Eq. 18. Remark.If ℓ(θ;z) is the negative log-likelihood of a correctly specified model, then S=H=I(θ ∗) (the Fisher information), which further simplifies the bias term. B.4. Calibration of Threshold-Guided Pseudo-Labels Proposition B.4(Calibration of Threshold-Guided Pseudo-Labels)."},{"citing_arxiv_id":"2605.03065","ref_index":167,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OGPO: Sample Efficient Full-Finetuning of Generative Control Policies","primary_cat":"cs.LG","submitted_at":"2026-05-04T18:36:40+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02469","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reference-Sampled Boltzmann Projection for KL-Regularized RLVR: Target-Matched Weighted SFT, Finite One-Shot Gaps, and Policy Mirror Descent","primary_cat":"cs.LG","submitted_at":"2026-05-04T11:10:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reference-sampled weighted SFT with prompt-normalized Boltzmann weights induces the same policy as fixed-reference KL-regularized RLVR, with BOLT as the estimator and a finite one-shot error decomposition separating coverage, variance, and other terms.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"rollout data inside the optimization loop. A natural response is to decouple generation from optimization: sample verified rollouts once, attach weights, and train with supervised infrastructure. Reward-augmented likelihood [ 32], advantage- Preprint. Correspondence to Yao Shu<yaoshu@hkust-gz.edu.cn>. arXiv:2605.02469v1 [cs.LG] 4 May 2026 weighted regression [ 35], STaR [ 53], ReST [ 13], offline RL on fixed logged data [ 3, 25], and DPO-style supervised reductions of KL-regularized objectives [38] all show that weighted or static likelihood can be a powerful training form. They also expose the ambiguity that matters for RLVR replacement. Weighted SFT is not one objective: the sampler decides which rollouts can appear,"},{"citing_arxiv_id":"2605.01968","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AdamO: A Collapse-Suppressed Optimizer for Offline RL","primary_cat":"cs.LG","submitted_at":"2026-05-03T16:53:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdamO modifies Adam with an orthogonality correction to ensure the spectral radius of the TD update operator stays below one, providing a theoretical stability guarantee for offline RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01862","ref_index":145,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"QHyer: Q-conditioned Hybrid Attention-mamba Transformer for Offline Goal-conditioned RL","primary_cat":"cs.LG","submitted_at":"2026-05-03T13:11:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"QHyer replaces return-to-go with a state-conditioned Q-estimator and adds a gated hybrid attention-mamba backbone to achieve state-of-the-art performance in offline goal-conditioned RL on both Markovian and non-Markovian datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01663","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Efficient and Expressive Offline RL via Flow-Anchored Noise-conditioned Q-Learning","primary_cat":"cs.LG","submitted_at":"2026-05-03T01:32:11+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"for any s∈ S , a∈ A , ϵ′ ∈R d, and Q∈ Q , Q(s, a, ϵ′) converges to Qπ(s, a, ϵ′) if we iterate T π n over this value. (2) Upper Expectile and the Essential Supremum.We now show why our TD learning objective (Eq.(10)) recovers the return distribution converged throughT π n (Eq.(9)). Theorem 4.2(Upper Expectile Converges to the Essential Supremum).Let s∈ S , a∈ A , ϵ∼ N(0, I d), and Q∈ Q . For any κ∈[ 1 2 ,1) , Zκ := arg minq∈R Eϵ \u0002 Lκ 2(Q(s, a, ϵ)−q) \u0003 is bounded by: Z1/2 ≤Z κ ≤lim κ→1− Zκ = ess supϵ Q(s, a, ϵ). (16) Proof. Please refer to the proof stated in Appendix B.2. This implies that the upper expectile Zψ trained through Eq.(11) withκ≈1converges toess supQ ϕ. (3) Validity of Behavior Regularization.We show that minimizing LB (Eq.(14)) controls the deviation between distributions induced by the one-step policy πω and the behavior policyv θ modeling the offline dataset behavior. Theorem 4.3(Flow Anchoring is a Valid Behav- ior Regularization).Let µω(·|s) and µθ(·|s) be the probability distributions induced by the policy πω and the behavior flow vθ respectively (Defini- tion B.5). If vθ satisfies Lipschitzness (Assump- tion B.6), the following holds for alls∈ S: Es∼D h W 2 2 (µω(·|s), µ θ(·|s)) i ≤e 2L LB(ω), (17) where W2 is the Wasserstein-2 distance andL is the Lipschitz constant. Proof. We provide the complete derivation in Appendix B.3. The equality holds when µω(·|s) =µ θ(·|s) and all flow tra- jectories of the vector field vθ are straight. We note that our behavior model vθ is parameterized by standard neural net- works which are Lipschitz, also with Lipschitz-continuous activation functions (e.g., GeLU). Since the composition of Lipschitz functions are Lipschitz, Assumption B.6 is always satisfied. Consequently, minimizing LB (Eq. (14)) directly minimizes the upper bound on the Wasserstein-2 distance between the distributions induced by the training policy πω and the behavior flow policyv θ. 5. Experiments In this section, we demonstrate that FA"},{"citing_arxiv_id":"2605.00416","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning While Deploying: Fleet-Scale Reinforcement Learning for Generalist Robot Policies","primary_cat":"cs.RO","submitted_at":"2026-05-01T05:20:26+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"as stop-gradient when computing the TD target. B. Policy Extraction via QAM Policy extraction in LWD starts from a pretrained flow- matching VLA and aims to improve its action distribution using the DIVL critic. Existing offline RL methods often extract a policy without differentiating throughQ ϕ, for exam- ple by advantage-weighted regression on replay actions [56, 43, 23, 57]. This update is poorly matched to flow-based VLA policies, since it requires evaluating the log likelihood of action chunks under the multi-step denoising process of the flow policy. More generally, the KL-regularized policy improvement target has a Boltzmann form, whose normalizer requires integrating over high-dimensional action chunks."},{"citing_arxiv_id":"2604.27955","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GUI Agents with Reinforcement Learning: Toward Digital Inhabitants","primary_cat":"cs.AI","submitted_at":"2026-04-30T14:51:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper delivers the first comprehensive overview of RL for GUI agents, organizing methods into offline, online, and hybrid strategies while analyzing trends in rewards, efficiency, and deliberation to outline a future roadmap.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22873","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Policies Cannot Be Retrained: A Unified Closed-Form View of Post-Training Steering in Offline Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-23T20:20:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"For diagonal-Gaussian frozen actors, PoE with alpha equals KL adaptation with beta = alpha/(1-alpha); empirically, composition shows an actor-competence ceiling with 4/5/3 HELP/FROZEN/HURT split on D4RL and zero success on AntMaze.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17551","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SVL: Goal-Conditioned Reinforcement Learning as Survival Learning","primary_cat":"cs.LG","submitted_at":"2026-04-19T17:44:13+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14895","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Importance Sampling: Rejection-Gated Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-04-16T11:39:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RGPO replaces importance sampling with a smooth [0,1] acceptance gate in policy gradients, unifying TRPO/PPO/REINFORCE, bounding variance for heavy-tailed ratios, and showing gains in online RLHF experiments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12509","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Whole-Body Mobile Manipulation using Offline Reinforcement Learning on Sub-optimal Controllers","primary_cat":"cs.RO","submitted_at":"2026-04-14T09:32:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WHOLE-MoMa improves whole-body mobile manipulation by applying offline RL with Q-chunking to demonstrations from randomized sub-optimal controllers, outperforming baselines and transferring to real robots without teleoperation or real-world training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10165","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MoRI: Mixture of RL and IL Experts for Long-Horizon Manipulation Tasks","primary_cat":"cs.RO","submitted_at":"2026-04-11T11:24:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MoRI dynamically mixes RL and IL experts with variance-based switching and IL regularization to reach 97.5% success in four real-world robotic tasks while cutting human intervention by 85.8%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10125","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PhyMix: Towards Physically Consistent Single-Image 3D Indoor Scene Generation with Implicit--Explicit Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-11T09:41:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PhyMix unifies a new multi-aspect physics evaluator with implicit policy optimization and explicit test-time correction to produce single-image 3D indoor scenes that are both visually faithful and physically plausible.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"show incremental gains, they remain ad hoc-focusing on isolated constraints without a unified treatment of physical consistency. This motivates the development of a system- atic evaluator that can cover multiple aspects of physical plausibility in a systematic way. Preference-based alignment and test-time optimization. In large language models, preference-based alignment (e.g., DPO [28], ORPO [9], GRPO [30]) has shown strong ability to guide generation without an explicit critic, and Diffusion- DPO extends this paradigm to image synthesis [34]. Con- versely, PPO [29] and AWR [27] assume sequential control with value functions and stepwise rewards, which are absent in single-step 3D scene generation; applying them thus in- troduces unnecessary value estimation and ill-posed credit"},{"citing_arxiv_id":"2604.08960","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient Hierarchical Implicit Flow Q-learning for Offline Goal-conditioned Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-10T05:04:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes mean flow policies and LeJEPA loss to overcome Gaussian policy limits and weak subgoal generation in hierarchical offline GCRL, reporting strong results on OGBench state and pixel tasks.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"flow policies are trained via advantage-weighted regression to generate subgoals and actions, respectively. During inference (right), both policies perform one-step generation from Gaussian noise, enabling efficient hierarchical decision-making without iterative sampling. 0)|x2. For policy learning, both high-level and low-level policies are extracted using advantage-weighted regression (AWR) [22], optimized according to the following ob- jective: Lπh(θh) =E (sh,sh+k)∼D, g∼µg(g) \u0002 exp βAh(sh, sh+k, g) \u0001 ×logπ h θh(sh+k |s h, g) \u0003 , Lπl(θl) =E (sh,ah,sh+1,sh+k)∼D \u0002 exp βAl(sh, sh+1, sh+k) \u0001 ×logπ l θl(ah |s h, sh+k) \u0003 , (2) whereβdenotes the inverse temperature,s h+k represents the optimalk-step sub- goal withk∈N,A h(sh, sh+k, g) =V θV(sh+k, g)−VθV(sh, g)andA l(sh, sh+1, sh+k) ="},{"citing_arxiv_id":"2604.08174","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Value-Guidance MeanFlow for Offline Multi-Agent Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-09T12:31:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VGM²P achieves SOTA-comparable performance in offline MARL via value-guided conditional behavior cloning with MeanFlow, enabling efficient single-step action generation insensitive to regularization coefficients.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06159","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Target Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-04-07T17:55:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TPO constructs a target distribution q proportional to the old policy times exp(utility) and trains the policy to match it via cross-entropy, matching or beating PPO and GRPO especially under sparse rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05808","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hierarchical Reinforcement Learning with Augmented Step-Level Transitions for LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-04-07T12:44:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"STEP-HRL enables step-level learning in LLM agents via hierarchical task structure and local progress modules, outperforming baselines on ScienceWorld and ALFWorld while cutting token usage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.20521","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Delightful Distributed Policy Gradient","primary_cat":"cs.LG","submitted_at":"2026-03-20T21:45:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Delightful Policy Gradient gates updates with advantage times surprisal to suppress rare failures while preserving rare successes in distributed RL with stale or buggy data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.02115","ref_index":145,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robometer: Scaling General-Purpose Robotic Reward Models via Trajectory Comparisons","primary_cat":"cs.RO","submitted_at":"2026-03-02T17:38:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Robometer combines intra-trajectory progress supervision with inter-trajectory preference supervision on a 1M-trajectory dataset to learn more generalizable robotic reward functions than prior methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"imitation learning,\" inConference on Robot Learning (CoRL), 2024. [143] A. Liang, I. Singh, K. Pertsch, and J. Thomason, \"Transformer adapters for robot learning,\" inCoRL 2022 Workshop on Pre-training Robot Learning, 2022. [144] I. Kostrikov, A. Nair, and S. Levine, \"Offline reinforce- ment learning with implicit q-learning,\" inInternational Conference on Learning Representations (ICLR), 2022. [145] X. B. Peng, A. Kumar, G. Zhang, and S. Levine, \"Advantage-weighted regression: Simple and scalable off-policy reinforcement learning,\" inarXiv preprint arXiv:1910.00177, 2019. 16 APPENDIXTABLE OFCONTENTS Appendix A: Dataset Details17 A-1 IndividualRBM-1MTraining Dataset Details . . . . . . . . . . . . . . . . . . 17 A-2 Dataset Filtering and Task End-State"},{"citing_arxiv_id":"2603.00918","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving Text-to-Image Generation with Intrinsic Self-Confidence Rewards","primary_cat":"cs.CV","submitted_at":"2026-03-01T04:39:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SOLACE improves text-to-image generation by using intrinsic self-confidence rewards from noise reconstruction accuracy during reinforcement learning post-training without external supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.22801","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unleashing the Potential of Diffusion Models for End-to-End Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-02-26T09:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces Hyper Diffusion Planner (HDP), a diffusion-based E2E AD framework that identifies insights on loss space, trajectory representation and data scaling, adds RL post-training, and reports 10x performance gains over 200 km of real-world testing across 6 scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.11075","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RISE: Self-Improving Robot Policy with Compositional World Model","primary_cat":"cs.RO","submitted_at":"2026-02-11T17:43:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RISE combines a controllable dynamics model and progress value model into a closed-loop self-improving pipeline that updates robot policies entirely in imagination, reporting over 35% absolute gains on three real-world tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.07389","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Non-decoupling of Supervised Fine-tuning and Reinforcement Learning in Post-training","primary_cat":"cs.LG","submitted_at":"2026-01-12T10:14:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SFT and RL cannot be decoupled in LLM post-training because each step increases the loss or lowers the reward of the prior step under KL and PL analyses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}