{"total":1389,"items":[{"citing_arxiv_id":"2606.27981","ref_index":229,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ToxiREX: A Dataset on Toxic REasoning in ConteXt","primary_cat":"cs.CL","submitted_at":"2026-06-26T11:30:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToxiREX is a new dataset of 128k Reddit comments in six languages with hierarchical annotations for implicit toxicity in conversational context based on an existing reasoning schema.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25978","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Agent Goal Recognition with Team- and Goal-Conditioned Reinforcement Learning and Factorized Branch-and-Bound","primary_cat":"cs.MA","submitted_at":"2026-06-24T15:50:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAGR-BB matches exhaustive search accuracy on multi-agent Blocksworld while reducing hypothesis evaluations by orders of magnitude via RL scoring inside factorized branch-and-bound.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23257","ref_index":80,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic multi-agent deep reinforcement learning-based pricing and incentivization approach in multimodal transportation networks","primary_cat":"cs.LG","submitted_at":"2026-06-22T12:39:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Multi-agent DRL framework shows dynamic incentives and pricing can cut commuter costs ~20%, emissions ~10%, and double public transport profit in simulated morning peak scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21387","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Long-Distance Real-World Navigation of the Legged-Wheeled Robot Go2-W Using Deep Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-06-19T12:53:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A DRL locomotion controller extended from prior quadruped work enabled the Go2-W robot to complete 2.8 km of autonomous real-world navigation including mixed terrain and stairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18625","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SRL: Combining SLIP Model and Reinforcement Learning for Agile Robotic Jumping","primary_cat":"cs.RO","submitted_at":"2026-06-17T02:41:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SRL combines SLIP feedforward with RL feedback to produce stable bipedal and quadrupedal jumps with lower training cost than pure RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11525","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Object Manipulation from Scratch via Contrastive Interaction","primary_cat":"cs.RO","submitted_at":"2026-06-10T00:06:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"IWR improves CRL sample efficiency and performance in interaction-rich manipulation by interaction-aware resampling that preserves mode boundaries, yielding 19.8% average gains and a real-world air-hockey agent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11167","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Faceted Interactivity Alignment in Full-Duplex Speech Models","primary_cat":"cs.CL","submitted_at":"2026-06-09T17:46:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-axis RL alignment technique improves pause handling, turn-taking, backchanneling, and interruption response in full-duplex spoken dialogue models by optimizing axis-specific rewards derived from human audio segments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20658","ref_index":266,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Expected Free Energy-based Planning as Variational Inference","primary_cat":"cs.AI","submitted_at":"2026-06-09T08:09:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EFE-based planning is formulated as variational free energy minimization with epistemic priors, decomposing into expected plan costs plus a complexity term.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09439","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tracking the Effective Surface Area of Non-Convex Satellites","primary_cat":"eess.SY","submitted_at":"2026-06-08T12:48:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Backstepping control tracks effective surface area of non-convex satellites for drag-based orbital control, with asymptotic stability proofs and an extension for solar panel exposure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08816","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Knowledge Graphs and Reasoning LLMs for Finding Simple Yet Effective Transcriptomic Perturbation Predictors","primary_cat":"cs.LG","submitted_at":"2026-06-07T20:09:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"K-nearest neighbor from a knowledge graph beats most methods on out-of-distribution transcriptomic perturbation prediction, and an RL-trained reasoning LLM matches SOTA on Replogle et al. (2022) cell lines while improving downstream differential expression prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08729","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IR-SIM: A Lightweight Skill-Native Simulator for Navigation, Learning, and Benchmarking","primary_cat":"cs.RO","submitted_at":"2026-06-07T16:55:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"IR-SIM is a YAML-defined simulator for mobile robot navigation that supports text-prompt scenario creation, policy training, benchmarking, and bridging to higher-fidelity or real-world settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07513","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentopia: Long-Term Life Simulation and Learning in Agent Societies","primary_cat":"cs.CL","submitted_at":"2026-06-05T17:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agentopia runs decade-scale multi-agent LLM simulations to study emergent social behaviors and trains models with life-reward rejection sampling, yielding +15.6% gains on role-playing benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05882","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Market Informedness and Market-Maker Profitability: The Trade-Off Between Adverse Selection and Price Discovery","primary_cat":"q-fin.TR","submitted_at":"2026-06-04T08:53:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agent-based model with multi-agent RL shows market-maker profitability trends upward overall with rising aggregate market informedness as price-discovery benefits offset adverse-selection costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05800","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SALT: When More Rollouts Don't Help in Group-Based Policy Optimization and How to Make Them Matter","primary_cat":"cs.LG","submitted_at":"2026-06-04T07:29:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SALT is a subspace-adaptive plug-in for GRPO that decomposes group-relative coefficients into shared and residual channels using mini-batch Gram geometry and amplifies residuals to mitigate signed cancellation in RLVR.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05722","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AISC deployment in dynamic UAV-assisted MEC network: a reinforcement learning method based on heterogeneous graph attention neural network","primary_cat":"cs.NI","submitted_at":"2026-06-04T05:26:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A heterogeneous graph attention Q-network is introduced for AISC deployment that reduces completion time while improving load balance and energy use in dynamic UMEC networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04935","ref_index":290,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Type of Inference is Active Inference?","primary_cat":"cs.AI","submitted_at":"2026-06-03T14:24:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EFE-based active inference planning is characterized as VFE on an augmented model plus entropy and planning corrections, with a derived message-passing implementation and grid-world validation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04735","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Trace-Mediated Peak Bias: Bridging Temporal Credit Assignment and Cognitive Heuristics in Deep Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-06-03T11:19:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Eligibility traces in deep RL create a peak bias by amplifying distal TD errors into gradient shocks that fixed-step SGD cannot normalize, leading to overestimation of peak-reward trajectories and a mechanistic account of the peak-end rule.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04574","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Multi-Pair Trading Strategy in Cryptocurrency Markets with Deep Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-06-03T08:10:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A hybrid DRL system for multi-pair crypto trading with deterministic risk shielding outperforms a heuristic baseline at 10% significance on Binance futures data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04471","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Optimizing Control of Continuous Processes Based on Reinforcement Learning","primary_cat":"eess.SY","submitted_at":"2026-06-03T05:33:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Reinforcement learning optimizes controlled variable selection for self-optimizing control by embedding the structure in an actor network and using economic rewards, showing better dynamic performance than a steady-state baseline in a CSTR simulation under disturbances.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01565","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hierarchical Semantic-Augmented Navigation: Optimal Transport and Graph-Driven Reasoning for Vision-Language Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-01T02:11:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"HSAN integrates hierarchical semantic graphs, optimal transport-based goal selection, and graph-aware RL to claim SOTA results on VLN-CE tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01332","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"S2M-Trek: From Single to Multi-Sphere Transport via Per-Frame Deep Sets on a Wheel-Legged Robot","primary_cat":"cs.RO","submitted_at":"2026-05-31T16:35:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Per-Frame Deep Sets enables scaling single-sphere to five-sphere transport on a quadruped by performing permutation-invariant pooling within each history frame, reaching 100% no-drop success in simulation where standard encoders plateau.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01028","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MedGym:A Unified Continuous-Time Benchmark for Dynamic Medical Treatment Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-31T05:36:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MedGym introduces a continuous-time RL benchmark for medical treatment derived from clinical data via PINNs, supporting offline/online evaluation on personalization, safety, and discrete vs continuous methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00950","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"COLLIE: Guiding Skill Discovery in Semantically Coherent Latent Space","primary_cat":"cs.LG","submitted_at":"2026-05-31T02:04:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"COLLIE constructs a semantically coherent skill latent space from unsupervised data to enable training-free guidance with sparse online feedback in guided skill discovery.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02636","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Too Much of a Good Thing: When sim2real Efforts Impede Policy Learning (And What to Do About It)","primary_cat":"cs.RO","submitted_at":"2026-05-30T22:17:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Excessive sim2real focus impedes robotics policy learning via simulator lock-in; a kinematics-only sim2sim2real paradigm is proposed to restore exploration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00880","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Task diversity produces systematic transfer but inhibits continual reinforcement learning","primary_cat":"cs.LG","submitted_at":"2026-05-30T20:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Task diversity along map, object, and hierarchy axes produces local transfer across shifts in a new continual RL benchmark but fails to sustain learning as the number of shifts grows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00840","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Certificate-Guided Evaluation of Reinforcement Learning Generalization","primary_cat":"cs.AI","submitted_at":"2026-05-30T18:31:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A logic-driven framework defines inductive reach-avoid tasks and uses neural certificates to certify RL generalization, with empirical results linking fewer violations to more solved test tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00702","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Shape Your Body: Value Gradients for Multi-Embodiment Robot Design","primary_cat":"cs.RO","submitted_at":"2026-05-30T12:21:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Trains embodiment-aware value functions on up to 50 robots and applies their gradients as differentiable surrogates to optimize held-out robot designs with over 1100 parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00674","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Paradox of Outcome Optimization: A Causal Information-Theoretic Bound on Reasoning Shortcuts in LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-30T11:06:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Outcome optimization induces reward-induced manifold collapse in LLMs by favoring low-complexity spurious correlations over high-complexity causal reasoning, with process reward models acting as topological filters to block shortcuts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00651","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MESA: Improving MoE Safety Alignment via Decentralized Expertise","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MESA decentralizes safety duties in MoE LLMs via expert capacity reallocation and dynamic routing refinement based on optimal transport theory, yielding robust defense on harmful benchmarks while preserving helpfulness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00637","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Global-Local Attention Decomposition for Terrain Encoding in Humanoid Perceptive Locomotion","primary_cat":"cs.RO","submitted_at":"2026-05-30T09:23:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GLAD decomposes terrain encoding via coarse-to-fine attention on elevation maps to separate broad awareness from precise foothold selection in perceptive humanoid locomotion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00595","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Elastohydrodynamic coupling enhances flow generation by coordinated ciliary beating","primary_cat":"physics.bio-ph","submitted_at":"2026-05-30T07:52:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reinforcement learning on a bead-spring cilia model identifies antiplectic coordination as flow-maximizing, with a tilted-slider reduced model showing that a time-averaged position shift opposite the effective stroke enhances transport via elastic restoring force coupling, and that symplectic coordi","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00593","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPADER: Step-wise Peer Advantage with Diversity-Aware Exploration Rewards for Multi-Answer Question Answering","primary_cat":"cs.CL","submitted_at":"2026-05-30T07:47:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPADER proposes step-wise peer advantage and diversity-aware exploration rewards in RL for multi-answer QA, reporting improved recall and F1 on QAMPARI, Mintaka, WebQSP, and QUEST.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00583","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving Visual Representation Alignment Generation with GRPO","primary_cat":"cs.CV","submitted_at":"2026-05-30T07:21:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VRPO applies generative representation policy optimization to dynamically align diffusion features with pretrained visual encoders, claiming +1.8 FID gains and 2.3x faster training versus REPA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00440","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SDR: Set-Distance Rewards for Radiology Report Generation","primary_cat":"cs.AI","submitted_at":"2026-05-30T00:10:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Set-to-set distances on sentence embeddings provide a permutation-invariant reward signal that improves GRPO training and enables efficient test-time scaling for vision-language models generating chest X-ray reports.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00400","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Proxy-Mixing: Transferring Replay Controllers from Small to Large Models for Continual Instruction Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-29T22:32:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PROXYMIX learns a dynamic replay controller on a small proxy model and transfers it to a large target model, improving accuracy by 3.4 points and reducing forgetting by 3.5 points on LLaMA-3-8B continual tuning sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00270","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Robust Shielding for Safe Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-29T19:01:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A sound and optimal shielding method for robust MDPs ensures LTL safety under worst-case transitions and combines with PAC sampling to produce minimally restrictive shields for learned models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31494","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consolidating Rewarded Perturbations for LLM Post-Training","primary_cat":"cs.CL","submitted_at":"2026-05-29T16:16:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoRP consolidates reward-weighted perturbations into a single model via low-rank structure, improving base LLMs by 8.1 points on average while using one-tenth the budget of prior ensembles and one forward pass.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31455","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIFT: Decoupled Rollouts and Importance-Weighted Fine-Tuning for Efficient Multi-Turn Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-29T15:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIFT achieves multi-turn RL performance via offline importance-weighted SFT by leveraging the equivalence of KL-regularized RL to weighted supervised learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31312","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning from Fine-Grained Visual Discrepancies: Mitigating Multimodal Hallucinations via In-Context Visual Contrastive Optimization","primary_cat":"cs.CV","submitted_at":"2026-05-29T13:44:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IC-VCO places contrastive images in one context for a consistent DPO-style objective, adds Visual Contrast Distillation, and uses semantic perturbation for hard negatives, reporting best results on five benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31261","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Linear Recurrent Memory Works in Partially Observable Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-29T12:56:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Linear recurrent filters exactly reproduce HMM belief logits under deterministic transitions and achieve near-zero decoding error under nearly deterministic ones, extending to action-controlled cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07602","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sample-Efficient Post-Training for LEGO Spatial-Physics Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PVPO is a sample-efficient RL method that improves semantic, geometric, and physical quality in LLM LEGO assembly generation by mitigating the PhysHack failure mode where validity alone fails to ensure fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31023","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HADT: A Heterogeneous Multi-Agent Differential Transformer for Autonomous Earth Observation Satellite Cluster","primary_cat":"cs.AI","submitted_at":"2026-05-29T08:54:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes HADT, a heterogeneous multi-agent differential transformer with relational observations-actions tokenization for model-free RL-based autonomous resource management in EO satellite clusters, claiming gains over baselines and adaptability to cluster size changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30957","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RDGen: Demonstration Generation for High-Quality Robot Learning via Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-05-29T07:53:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"RDGen uses sim-to-real RL policies to generate smoother robot demonstrations that improve downstream VLA performance over human-collected data on pick-and-place tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30919","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"De-attribute to Forget for LLM Unlearning","primary_cat":"cs.LG","submitted_at":"2026-05-29T07:03:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DareU reframes LLM unlearning as zeroing data attribution via RL rewards from an LLM classifier approximation, claiming better balance of forget quality and model utility than loss-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30916","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Welfare, Improvability, and Variance: A Principal-Agent Approach to Optimal Benchmark Item Aggregation","primary_cat":"cs.LG","submitted_at":"2026-05-29T07:01:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Models benchmarking as principal-agent game, derives welfare loss from welfare alignment, improvability and variance, and applies an audit framework to OLMES items.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30914","ref_index":100,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Automating Formal Verification with Reinforcement Learning and Recursive Inference","primary_cat":"cs.LG","submitted_at":"2026-05-29T06:59:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RLVR training raises verified Dafny pass rates from 9.7% to 31.1% on a filtered benchmark while a Lean proof scaffold lifts success from 46.2% to 69.2% on a pilot set and solves 7 of 42 prior unsolved tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30896","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Zero Collapse: A Failure Mode of Policy Gradient Methods in Discontinuous Reward Environments","primary_cat":"cs.LG","submitted_at":"2026-05-29T06:29:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Policy gradient methods suffer from zero collapse in discontinuous reward environments such as first-price auctions, where exploration causes policies to enter flat zero-reward regions from which recovery is sample-inefficient due to absent gradient signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30873","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Federated Variational Preference Alignment with Gumbel-Softmax Prior for Personalized User Preferences","primary_cat":"cs.LG","submitted_at":"2026-05-29T05:52:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FedVPA-GP applies variational preference learning in a federated setting with a mixture prior and orthogonal loss to disentangle user preferences on the HH-RLHF dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30859","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DARTS: Distribution-Aware Active Rollout Trajectory Shaping for Accelerating LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-05-29T05:31:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DARTS accelerates LLM RL training up to 1.77x by distribution-aware trajectory sampling and adaptive redundancy allocation that shapes rollouts toward conciseness without performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30795","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Feat2Go: Visual Feature-Grounded Value Estimation for Embodied Reinforcement Learning","primary_cat":"cs.RO","submitted_at":"2026-05-29T03:36:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Feat2Go uses patch-level similarity from a visual world model and trend-based clustering to create progress targets for training value models that improve reward shaping in embodied RL for VLA policies, yielding large gains on ManiSkill3 and RoboTwin benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}