{"total":33,"items":[{"citing_arxiv_id":"2606.27295","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LA4VLA: Learning to Act without Seeing via Language-Action Pretraining","primary_cat":"cs.RO","submitted_at":"2026-06-25T17:13:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LA4VLA creates a 33K language-action dataset from existing demos and shows that pretraining on language-action pairs before or alongside vision-language-action training boosts success rates in sim and real robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25813","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Extending Embodied Question Answering from Perception to Decision","primary_cat":"cs.RO","submitted_at":"2026-05-25T13:08:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces EQA-Decision dataset with 4M+ QA pairs across four embodied reasoning dimensions and RoboDecision baseline for joint perception-reasoning-decision evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22816","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AwareVLN: Reasoning with Self-awareness for Vision-Language Navigation","primary_cat":"cs.RO","submitted_at":"2026-05-21T17:58:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AwareVLN introduces a structural reasoning module and automatic data engine with progress division to equip VLN agents with self-awareness of agent state and task progress, outperforming prior methods on Habitat datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22812","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GesVLA: Gesture-Aware Vision-Language-Action Model Embedded Representations","primary_cat":"cs.RO","submitted_at":"2026-05-21T17:57:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GesVLA encodes gesture features directly into the latent space of VLA models using a dual-VLM architecture and a rendering-based data pipeline, yielding improved target grounding in real robotic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22183","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Action with Visual Primitives","primary_cat":"cs.RO","submitted_at":"2026-05-21T08:52:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AVP architecture has VLM emit visual-primitive tokens to condition flow-matching action expert, yielding 27.61% higher success rate than pi_0.5 on real-robot pick-and-place tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17522","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RoboFlow4D: A Lightweight Flow World Model Toward Real-Time Flow-Guided Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-17T16:11:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RoboFlow4D is an end-to-end lightweight flow world model that predicts multi-frame 3D flows from visual observations and textual instructions to provide explicit planning for real-time robotic manipulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17486","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DyGRO-VLA: Cross-Task Scaling of Vision-Language-Action Models via Dynamic Grouped Residual Optimization","primary_cat":"cs.RO","submitted_at":"2026-05-17T14:55:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DyGRO-VLA is a two-stage optimization framework for cross-task scaling of Vision-Language-Action models via dynamic grouped residual optimization in RL.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13119","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Long-horizon Embodied Agents with Tool-Aligned Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-13T07:40:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLAs-as-Tools pairs a VLM planner with specialized VLA executors via a new interface and Tool-Aligned Post-Training to raise long-horizon robot success rates on LIBERO-Long and RoboTwin benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09959","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"G-Zero: Self-Play for Open-Ended Generation from Zero Data","primary_cat":"cs.LG","submitted_at":"2026-05-11T04:12:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"G-Zero uses the Hint-δ intrinsic reward to drive co-evolution between a Proposer and Generator via GRPO and DPO, providing a theoretical suboptimality guarantee for self-improvement from internal dynamics alone.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"However, this paradigm relies crucially on the existence of programmatic oracles. In domains like mathematics or code generation, deterministic signals, such as numerical correctness or functional execution, provide the ground truth required for Reinforcement Learning from Verifiable Rewards (RLVR) [23, 7]. Conversely, a broad class of real-world scenarios, including open-ended instruction following [24], multi-turn dialogue [34], and creative writing, lack such objective oracles. To navigate these settings, existing methods frequently rely on LLM-as-a-judge [6] mechanisms for surrogate reward signals. This workflow introduces two critical limitations. First, the evolving model's perfor- mance ceiling is fundamentally bottlenecked by the judge's capabilities."},{"citing_arxiv_id":"2605.07465","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SEIF: Self-Evolving Reinforcement Learning for Instruction Following","primary_cat":"cs.CL","submitted_at":"2026-05-08T09:13:12+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SEIF creates a self-reinforcing loop in which an LLM alternately generates increasingly difficult instructions and learns to follow them better using reinforcement learning signals from its own judgments.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Dirani, Julian Michael, and Samuel R Bowman. Gpqa: A graduate-level google-proof q&a benchmark.arXiv preprint arXiv:2311.12022, 2023. [28] Qingyu Ren, Qianyu He, Powei Chang, Jie Zeng, Zeye Sun, Fei Yu, Jiaqing Liang, and Yanghua Xiao. Instructions are all you need: Self-supervised reinforcement learning for instruction following.arXiv preprint arXiv:2510.14420, 2025. 11 [29] Lucy Xiaoyang Shi, Brian Ichter, Michael Equi, Liyiming Ke, Karl Pertsch, Quan Vuong, James Tanner, Anna Walling, Haohuan Wang, Niccolo Fusai, et al. Hi robot: Open-ended instruction following with hierarchical vision-language-action models.arXiv preprint arXiv:2502.19417, 2025. [30] Haoran Sun, Lixin Liu, Junjie Li, Fengyu Wang, Baohua Dong, Ran Lin, and Ruohui Huang."},{"citing_arxiv_id":"2604.24447","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Characterizing Vision-Language-Action Models across XPUs: Constraints and Acceleration for On-Robot Deployment","primary_cat":"cs.RO","submitted_at":"2026-04-27T13:12:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VLA models exhibit a compute-bound VLM phase followed by a memory-bound action phase on edge hardware; DP-Cache and V-AEFusion reduce redundancy and enable pipeline parallelism for up to 6x speedup on NPUs with marginal task degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15483","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"${\\pi}_{0.7}$: a Steerable Generalist Robotic Foundation Model with Emergent Capabilities","primary_cat":"cs.LG","submitted_at":"2026-04-16T19:18:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"π₀.₇ is a steerable generalist robotic model that uses rich multimodal prompts including language, subgoal images, and performance metadata to achieve out-of-the-box generalization across tasks and robot bodies.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Do as i can, not as i say: Grounding language in robotic affordances.Conference on Robot Learning (CoRL), 2022. 3 [39] Jacky Liang, Wenlong Huang, Fei Xia, Peng Xu, Karol Hausman, Brian Ichter, Pete Florence, and Andy Zeng. Code as policies: Language model programs for embod- ied control.IEEE International Conference on Robotics and Automation (ICRA), 2023. [40] Lucy Xiaoyang Shi, Brian Ichter, Michael Equi, Liy- iming Ke, Karl Pertsch, Quan Vuong, James Tanner, Anna Walling, Haohuan Wang, Niccolo Fusai, et al. Hi robot: Open-ended instruction following with hier- archical vision-language-action models.arXiv preprint arXiv:2502.19417, 2025. 3 [41] Qingqing Zhao, Yao Lu, Moo Jin Kim, Zipeng Fu, Zhuoyang Zhang, Yecheng Wu, Zhaoshuo Li, Qianli"},{"citing_arxiv_id":"2604.14125","ref_index":32,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HiVLA: A Visual-Grounded-Centric Hierarchical Embodied Manipulation System","primary_cat":"cs.CV","submitted_at":"2026-04-15T17:50:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HiVLA decouples VLM-based semantic planning with visual grounding from a cascaded cross-attention DiT action expert, outperforming end-to-end VLAs on long-horizon and fine-grained manipulation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"To circumvent this limitation, hierarchical models explicitly decouple high- level task planning from low-level policy execution via interpretable interme- diate representations. This modularity retains the VLM's zero-shot reasoning power while allowing the action expert to specialize in precise motor control. These intermediate bridges take various forms, including textual subtasks in Hi- Robot [32]and MemER [34] or spatial keypoints in HAMSTER [22]. By isolating cognitive processes from high-frequency control, hierarchical systems provide a robust and scalable foundation for advancing embodied intelligence. 2.2 Visual-Grounded-Centric VLA A critical challenge in manipulation is precise visual grounding, which accurately maps high-level instructions to specific spatial regions within the visual input."},{"citing_arxiv_id":"2604.10432","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AnySlot: Goal-Conditioned Vision-Language-Action Policies for Zero-Shot Slot-Level Placement","primary_cat":"cs.RO","submitted_at":"2026-04-12T03:09:44+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07774","ref_index":89,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RoboAgent: Chaining Basic Capabilities for Embodied Task Planning","primary_cat":"cs.RO","submitted_at":"2026-04-09T04:01:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RoboAgent chains basic vision-language capabilities inside a single VLM via a scheduler and trains it in three stages (behavior cloning, DAgger, RL) to improve embodied task planning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Hugginggpt: Solv- ing ai tasks with chatgpt and its friends in hugging face. Advances in Neural Information Processing Systems, 36: 38154-38180, 2023. 2 [88] Junhao Shi, Zhaoye Fei, Siyin Wang, Qipeng Guo, Jingjing Gong, and Xipeng QIu. World-aware planning narra- tives enhance large vision-language model planner.arXiv preprint arXiv:2506.21230, 2025. 2, 7 [89] Lucy Xiaoyang Shi, Brian Ichter, Michael Equi, Liyim- ing Ke, Karl Pertsch, Quan Vuong, James Tanner, Anna Walling, Haohuan Wang, Niccolo Fusai, et al. Hi robot: Open-ended instruction following with hierarchical vision- language-action models.arXiv preprint arXiv:2502.19417, 2025. 1 [90] Suyeon Shin, Sujin Jeon, Junghyun Kim, Gi-Cheon Kang, and Byoung-Tak Zhang."},{"citing_arxiv_id":"2604.05320","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ExpressMM: Expressive Mobile Manipulation Behaviors in Human-Robot Interactions","primary_cat":"cs.RO","submitted_at":"2026-04-07T01:46:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ExpressMM integrates high-level language-guided planning with low-level vision-language-action policies to enable expressive and interruptible mobile manipulation behaviors in human-robot collaboration, shown effective in an assembly task via audience evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02786","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"QuadAgent: A Responsive Agent System for Vision-Language Guided Quadrotor Agile Flight","primary_cat":"cs.RO","submitted_at":"2026-04-03T06:50:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QuadAgent uses an asynchronous multi-agent architecture with an Impression Graph for scene memory and vision-based avoidance to enable training-free vision-language guided agile quadrotor flight, outperforming baselines in simulations and achieving real-world speeds up to 5 m/s.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"by enabling a high-level VLM to asynchronously update natural language instructions for a low-level VLA, achiev- ing seamless instruction integration. However, this approach necessitates a robust VLA model and the fine-tuning of the upper-level VLM. Recently, several approaches have explored spatial reason- ing using 3D scene graphs. For example, ConceptGraphs [24] construct object-centric 3D graphs using open-vocabulary segmentation and dense 3D mapping, and leverage LLMs to infer spatial relationships. Beyond object-level reasoning, subsequent works organize objects into manually defined hierarchical structures [25], [26], [27]. However, constructing such 3D scene graphs typically requires dense reconstruction"},{"citing_arxiv_id":"2603.25044","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ThermoAct:Thermal-Aware Vision-Language-Action Models for Robotic Perception and Decision-Making","primary_cat":"cs.RO","submitted_at":"2026-03-26T05:26:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ThermoAct integrates thermal imaging into VLA models via a VLM planner to enable robots to perceive physical properties like heat and improve safety over vision-only systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.22003","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VP-VLA: Visual Prompting as an Interface for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-03-23T14:08:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VP-VLA decouples high-level reasoning from low-level control in VLA models by rendering spatial anchors as visual prompts directly in the RGB observation space, outperforming end-to-end baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.14371","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OxyGen: Unified KV Cache Management for VLA Inference under Multi-Task Parallelism","primary_cat":"cs.RO","submitted_at":"2026-03-15T13:23:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OxyGen unifies KV cache management in MoT VLAs to enable cross-task KV sharing and cross-frame continuous batching, delivering up to 3.7x speedup with 200+ tokens/s language and 70 Hz action on on-device platforms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.20231","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UniLACT: Depth-Aware RGB Latent Action Learning for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-02-23T18:41:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniLACT improves VLA models by adding depth-aware unified latent action pretraining that outperforms RGB-only baselines on seen and unseen manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.15922","ref_index":74,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"World Action Models are Zero-shot Policies","primary_cat":"cs.RO","submitted_at":"2026-02-17T15:04:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DreamZero uses a 14B video diffusion model as a World Action Model to achieve over 2x better zero-shot generalization on real robots than state-of-the-art VLAs, real-time 7Hz closed-loop control, and cross-embodiment transfer with 10-30 minutes of data.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"5: a vision-language-action model with open-world generalization.arXiv preprint arXiv:2504.16054, 2025. 4, 5, 10 [73] Lucy Xiaoyang Shi, Brian Ichter, Michael Equi, Liyiming Ke, Karl Pertsch, Quan Vuong, James Tanner, Anna Walling, Haohuan Wang, Niccolo Fusai, et al. Hi robot: Open-ended instruction following with hierarchical vision-language-action models.arXiv preprint arXiv:2502.19417, 2025. 19 [74] Ishika Singh, Valts Blukis, Arsalan Mousavian, Ankit Goyal, Danfei Xu, Jonathan Tremblay, Dieter Fox, Jesse Thomason, and Animesh Garg. Progprompt: Generating situated robot task plans using large language models. In2023 IEEE International Conference on Robotics and Automation (ICRA), 2023. 4 [75] Generalist AI Team. Gen-0: Embodied foundation models that scale with physical interaction."},{"citing_arxiv_id":"2602.13193","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Steerable Vision-Language-Action Policies for Embodied Reasoning and Hierarchical Control","primary_cat":"cs.RO","submitted_at":"2026-02-13T18:57:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Steerable VLAs trained on rich synthetic commands at subtask, motion, and pixel levels enable VLMs to steer robot behavior more effectively, outperforming prior hierarchical baselines on real-world manipulation and generalization tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.20857","ref_index":242,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Evo-Memory: Benchmarking LLM Agent Test-time Learning with Self-Evolving Memory","primary_cat":"cs.CL","submitted_at":"2025-11-25T21:08:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Evo-Memory is a new streaming benchmark and evaluation framework for self-evolving memory in LLM agents, unifying over ten memory modules and introducing the ReMem pipeline for continual improvement on multi-turn and reasoning datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.14759","ref_index":87,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"$\\pi^{*}_{0.6}$: a VLA That Learns From Experience","primary_cat":"cs.LG","submitted_at":"2025-11-18T18:58:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RECAP enables a generalist VLA to self-improve via advantage-conditioned RL on mixed real-world data, more than doubling throughput and halving failure rates on hard manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.13778","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InternVLA-M1: A Spatially Guided Vision-Language-Action Framework for Generalist Robot Policy","primary_cat":"cs.RO","submitted_at":"2025-10-15T17:30:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVLA-M1 uses spatially guided pre-training on 2.3M examples followed by action post-training to deliver up to 17% gains on robot manipulation benchmarks and 20.6% on unseen objects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.10125","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ctrl-World: A Controllable Generative World Model for Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2025-10-11T09:13:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A controllable world model trained on the DROID dataset generates consistent multi-view robot trajectories for over 20 seconds and improves generalist policy success rates by 44.7% via imagined trajectory fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.16815","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ThinkAct: Vision-Language-Action Reasoning via Reinforced Visual Latent Planning","primary_cat":"cs.CV","submitted_at":"2025-07-22T17:59:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ThinkAct introduces reinforced visual latent planning in a dual VLA system to enable better long-horizon reasoning and adaptation for embodied tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.04447","ref_index":100,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DreamVLA: A Vision-Language-Action Model Dreamed with Comprehensive World Knowledge","primary_cat":"cs.CV","submitted_at":"2025-07-06T16:14:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DreamVLA uses dynamic-region-guided world knowledge prediction, block-wise attention to disentangle information types, and a diffusion transformer for actions, reaching 76.7% success on real robot tasks and 4.44 average length on CALVIN ABC-D.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Ran Cheng, Yaxin Peng, Chaomin Shen, et al. Chatvla: Unified multimodal understanding and robot control with vision-language-action model. arXiv preprint arXiv:2502.14420, 2025. 3 [99] Fanqi Lin, Ruiqian Nai, Yingdong Hu, Jiacheng You, Junming Zhao, and Yang Gao. Onet- wovla: A unified vision-language-action model with adaptive reasoning. arXiv preprint arXiv:2505.11917, 2025. [100] Lucy Xiaoyang Shi, Brian Ichter, Michael Equi, Liyiming Ke, Karl Pertsch, Quan Vuong, James Tanner, Anna Walling, Haohuan Wang, Niccolo Fusai, et al. Hi robot: Open- ended instruction following with hierarchical vision-language-action models. arXiv preprint arXiv:2502.19417, 2025. 3 [101] Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal,"},{"citing_arxiv_id":"2507.01925","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey on Vision-Language-Action Models: An Action Tokenization Perspective","primary_cat":"cs.RO","submitted_at":"2025-07-02T17:34:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The survey frames VLA models as pipelines that generate progressively grounded action tokens and classifies those tokens into eight types to guide future development.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"In addition, parallel efforts have sought to extend the scaling laws [19, 20] observed in vision and language domains to the embodied setting, collecting large-scale embodied datasets and training generalist agents end-to-end on top of vision-language foundation models [21, 22, 23]. These diverse approaches have led to a rapid proliferation of VLA models in robotic manipulation [24, 25], navigation [26, 27], and autonomous driving [28, 29, 30], demonstrating promising capabilities in multitask learning [31], long-horizon task completion [22], and strong generalization [32]. By leveraging foundation model intelligence, they offer new directions for addressing long-standing challenges in embodied AI, such as data scarcity and poor cross-embodiment transferability, and pave the way foragents"},{"citing_arxiv_id":"2504.16054","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","primary_cat":"cs.LG","submitted_at":"2025-04-22T17:31:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"π_{0.5} is a VLA model that achieves long-horizon dexterous manipulation in entirely new homes through co-training on heterogeneous tasks and multi-source data including web and semantic predictions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.15558","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cosmos-Reason1: From Physical Common Sense To Embodied Reasoning","primary_cat":"cs.AI","submitted_at":"2025-03-18T22:06:58+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Cosmos-Reason1-7B and 56B models are trained with physical common sense and embodied reasoning ontologies via supervised fine-tuning and reinforcement learning to produce next-step physical actions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.03480","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SafeVLA: Towards Safety Alignment of Vision-Language-Action Model via Constrained Learning","primary_cat":"cs.RO","submitted_at":"2025-03-05T13:16:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SafeVLA applies constrained reinforcement learning via CMDP min-max optimization to VLAs, cutting safety violation costs by 83.58% while preserving task success on long-horizon mobile manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}