{"total":13,"items":[{"citing_arxiv_id":"2606.32034","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"QVal: Cheaply Evaluating Dense Supervision Signals for Long-Horizon LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-06-30T17:58:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QVal is a new evaluation framework that directly measures dense supervision quality via Q-alignment to a reference policy, showing simple prompting baselines outperform 21 other methods across environments and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32027","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Freeform Preference Learning for Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-30T17:54:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Freeform Preference Learning trains language-conditioned multi-axis reward models from human pairwise preferences to produce steerable and compositional robot policies that outperform sparse and binary-preference baselines by 38 percentage points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28570","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Digitizing Coaching Intelligence: An Agentic Framework for Holistic Athlete Profiling using VLM and RAG","primary_cat":"cs.CV","submitted_at":"2026-06-26T19:53:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Presents a hybrid agentic framework using MediaPipe, Llama-4-scout VLM, LangGraph orchestration, and RAG for holistic athlete profiling aligned with SAI protocols.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23640","ref_index":89,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning Process Rewards via Success Visitation Matching for Efficient RL","primary_cat":"cs.LG","submitted_at":"2026-06-22T17:30:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Success Visitation Matching uses a discriminator to turn sparse outcome rewards into dense process rewards by matching visitations of successful episodes, provably preserving the optimal policy and speeding up robotic RL finetuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12072","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"World Model Self-Distillation: Training World Models to Solve General Tasks","primary_cat":"cs.CV","submitted_at":"2026-06-10T13:40:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-distillation from a caption-conditioned video diffusion model to an image-and-prompt-conditioned executor, enhanced by RL from VLM feedback, enables task solving in world models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20641","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MAGNIFIED: RL Fine-tuning of Multimodal Large Language Models for Motion Planning","primary_cat":"cs.RO","submitted_at":"2026-06-02T16:07:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MAGNIFIED applies RL fine-tuning to MLLMs for autonomous driving motion planning, yielding over 10.5% lower overlap rate and 38.9% lower off-road rate than SFT baseline on Waymo Open Motion Dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00931","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CV-Arena: An Open Benchmark for Instructional Computer Vision Problem Solving with Human-AI Collaborative Preferences","primary_cat":"cs.CV","submitted_at":"2026-05-30T23:37:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CV-Arena is a new 12K-pair benchmark for instruction-guided real-image editing with 16 task types, CogRetriever curation, and Active Elo mixed human-AI evaluation that finds gaps in 21 models and presents CV-Agent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22123","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Pixels: Learning Invariant Rewards for Real-World Robotics From a Few Demonstrations","primary_cat":"cs.RO","submitted_at":"2026-05-21T07:55:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A framework learns invariant symbolic reward functions from few demonstrations that generalize zero-shot to variations in robotic manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17933","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AtlasVA: Self-Evolving Visual Skill Memory for Teacher-Free VLM Agents","primary_cat":"cs.CV","submitted_at":"2026-05-18T06:41:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AtlasVA organizes VLM agent memory into spatial heatmaps, visual exemplars, and symbolic skills, evolving atlases from trajectories to act as potential-based shaping rewards in teacher-free reinforcement learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18903","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reasoning Portability: Guiding Continual Learning for MLLMs in the RLVR Era","primary_cat":"cs.LG","submitted_at":"2026-05-17T13:26:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Formalizes Reasoning Portability (RP) and proposes RDB-CL to modulate per-sample KL regularization in RLVR for MLLM continual learning, achieving +12.0% Last accuracy over vanilla RLVR baseline by preserving reusable reasoning on high-RP samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.19837","ref_index":182,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Meta-Learning and Meta-Reinforcement Learning -- Tracing the Path towards DeepMind's Adaptive Agent","primary_cat":"cs.AI","submitted_at":"2026-02-23T13:39:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A survey provides a task-based formalization of meta-learning and meta-RL while chronicling algorithms that lead to DeepMind's Adaptive Agent.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"In2025 IEEE 6th International Seminar on Artificial Intelligence, Networking and Information Technology (AINIT), pages 1-6, April 2025. [181] Zifan Xu, Yulin Zhang, Shahaf S. Shperberg, Reuth Mirsky, Yuqian Jiang, Bo Liu, and Peter Stone. Model-Based Meta Automatic Curriculum Learning. InProceedings of The 2nd Conference on Lifelong Learning Agents, pages 846-860. PMLR, November 2023. ISSN: 2640-3498. [182] Weirui Ye, Yunsheng Zhang, Haoyang Weng, Xianfan Gu, Shengjie Wang, Tong Zhang, Mengchen Wang, Pieter Abbeel, and Yang Gao. Reinforcement Learning with Foundation Priors: Let the Embodied Agent Efficiently Learn on Its Own, October 2024. arXiv:2310.02635 [cs]. [183] Shuolei Yin, Yejing Xi, Xun Zhang, Chengnuo Sun, and Qirong Mao. Foundation Models in"},{"citing_arxiv_id":"2510.12710","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reflection-Based Task Adaptation for Self-Improving VLA","primary_cat":"cs.RO","submitted_at":"2025-10-14T16:44:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Reflective Self-Adaptation combines failure-reflective reinforcement learning with success-guided imitation learning to enable faster and more reliable task adaptation for pre-trained Vision-Language-Action models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.11196","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UAV-VL-R1: Generalizing Vision-Language Models via Supervised Fine-Tuning and Multi-Stage GRPO for UAV Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2025-08-15T04:06:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UAV-VL-R1 combines SFT and multi-stage GRPO reinforcement learning on a new 50,019-sample HRVQA-VL dataset to deliver substantially higher zero-shot accuracy on UAV visual reasoning tasks than both its 2B baseline and a 72B-scale model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}