{"total":18,"items":[{"citing_arxiv_id":"2606.29089","ref_index":31,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TAP-VLA: Tactile Annotation Prompting for Vision Language Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-27T21:06:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TAP-VLA improves VLA performance in contact-rich manipulation by visually annotating tactile shear fields onto input images, reaching 78% success versus under 50% for vision-only and other tactile methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27871","ref_index":12,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LocalNav: Distilling Frontier VLMs and Embodied RL for On-Device Object Goal Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-26T09:11:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Distillation from frontier VLMs plus E-RLVR regularization produces a 4B local model that achieves 34.5% SR on OVON while cutting inference latency by 82.8%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27355","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"RouterVLA: Turning Smoke Tests into Supervision for Heterogeneous VLA Selection","primary_cat":"cs.RO","submitted_at":"2026-06-25T17:56:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"RouterVLA reports that a simple probe-success rule from outcome-separated smoke tests raises held-out VLA success by 14.64pp on 34,752 LIBERO-Plus records, with learned scorers adding no further gain.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26423","ref_index":11,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CoStream: Composing Simple Behaviors for Generalizable Complex Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-24T22:25:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoStream composes semantic, predictive, and reactive behaviors on an SE(3) interface to enable precise, generalizable performance on eight real-world contact-rich manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25215","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Reflective VLA: In-Context Action Consequences Make VLAs Generalize","primary_cat":"cs.CV","submitted_at":"2026-06-23T22:23:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reflective VLA improves VLA generalization on LIBERO-Plus and LIBERO-Plus-Hard by 5.4 and 4.2 percentage points by conditioning on action consequences instead of reactive single-frame inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23623","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"dVLA-RL: Reinforcement Learning over Denoising Trajectories for Discrete Diffusion Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-22T17:19:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dVLA-RL models denoising as an MDP to enable RL on dVLAs via trajectory probabilities, reporting 99.7% success on LIBERO and 30.6% gains over SFT on RoboTwin 2.0.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17200","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ACE-Ego-0: Unifying Egocentric Human and Robotic Data for VLA Pretraining","primary_cat":"cs.RO","submitted_at":"2026-06-15T18:40:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ACE-Ego-0 is a VLA pretraining framework that turns egocentric human videos into robot-format pseudo-actions via a video-to-action pipeline and trains jointly with robot data under a reliability-aware objective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13279","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"See Selectively, Act Adaptively: Dual-Level Structural Decomposition for Bimanual Robot Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-11T12:33:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A VLA policy using view-selective visual routing and interaction-aware action MoE improves average success by 27.7% in simulation and 43.3% in real-world bimanual tasks over monolithic baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12402","ref_index":61,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"DIRECT: When and Where Should You Allocate Test-Time Compute in Embodied Planners?","primary_cat":"cs.RO","submitted_at":"2026-06-10T17:58:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DIRECT is a multimodal-context router that allocates test-time compute across chain-of-thought depth, model size, and memory history for VLM embodied planners, improving the success-cost Pareto frontier and matching stronger models at up to 65% lower latency on benchmarks and a physical Franka arm.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12497","ref_index":3,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"$\\mu$VLA: On Recurrent Memory for Partially Observable Manipulation in VLA Models","primary_cat":"cs.LG","submitted_at":"2026-06-10T13:26:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adding recurrent memory tokens to VLA models raises success rates on partially observable manipulation tasks from 0.42 to 0.84 on training and 0.07 to 0.23 on held-out tasks while preserving performance under full observability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09572","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CT-VAM: A Cerebello-Thalamic-Inspired Vision-Action Model for Efficient Visuomotor Control","primary_cat":"cs.RO","submitted_at":"2026-06-08T14:46:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CT-VAM is a 68M-parameter cerebello-thalamic-inspired model that achieves competitive LIBERO success rates with lower inference latency than larger VLA models by using a stream-separated attention decoder called TARS.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08414","ref_index":2,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"PACT: Self-Evolving Physical Safety Alignment for Diffusion Policies in Embodied Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-07T02:27:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PACT is a self-evolving post-training framework that projects diffusion policies onto constraint-feasible regions via reverse-KL distillation and a tightening curriculum, reporting 31% fewer safety violations and 30.7% higher task success on embodied manipulation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00020","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"EmbodimentSemantic: A Spatial Scene-Graph Dataset and Benchmark for Vision-Language Models on Embodied Manipulation Trajectories","primary_cat":"cs.RO","submitted_at":"2026-06-06T18:58:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EmbodimentSemantic is a spatial scene-graph dataset and benchmark for evaluating relational grounding in vision-language models on embodied manipulation trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05737","ref_index":9,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Let It Be Simple: One-Step Action Generation for Vision-Language-Action Models","primary_cat":"cs.CV","submitted_at":"2026-06-04T05:58:30+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Biasing the training time distribution toward high-noise states enables one-step action generation in VLA models that matches or exceeds ten-step decoding on LIBERO benchmarks and real-robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01865","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Set-Supervised Diffusion Policy: Learning Action-Chunking Diffusion through Corrections","primary_cat":"cs.RO","submitted_at":"2026-06-01T08:14:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SDP constructs sets of desired action-chunks from human correction pairs and trains diffusion policies to align with those sets, yielding better performance and robustness than standard behavior cloning on robotic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00229","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Continuous Reasoning for Vision-Language-Action","primary_cat":"cs.RO","submitted_at":"2026-05-29T18:02:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Continuous Reasoning for VLA introduces a shared Gaussian latent for continuous thoughts, trained with self-verification to improve action prediction on LIBERO-PRO and real robots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27817","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Turning Video Models into Generalist Robot Policies","primary_cat":"cs.RO","submitted_at":"2026-05-27T01:21:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Decouples action-free video world models from embodiment-specific IDMs using Jacobian-based translation to achieve zero-shot cross-embodiment robot policies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.08167","ref_index":7,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Self-Supervised Bootstrapping of Action-Predictive Embodied Reasoning","primary_cat":"cs.RO","submitted_at":"2026-02-09T00:10:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"R&B-EnCoRe uses self-supervised importance-weighted variational inference to distill action-predictive reasoning datasets that improve VLA performance on manipulation, navigation, and driving tasks without external verifiers.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"In Pulkit Agrawal, Oliver Kroemer, and Wol- fram Burgard, editors,Proceedings of The 8th Confer- ence on Robot Learning, volume 270 ofProceedings of Machine Learning Research, pages 2679-2713. PMLR, 06-09 Nov 2025. [6] Suneel Belkhale and Dorsa Sadigh. Minivla: A better vla with a smaller footprint, 2024. URL https://github. com/Stanford-ILIAD/openvla-mini. [7] Kevin Black, Noah Brown, Danny Driess, Adnan Es- mail, Michael Robert Equi, Chelsea Finn, Niccolo Fusai, Lachy Groom, Karol Hausman, Brian Ichter, Szymon Jakubczak, Tim Jones, Liyiming Ke, Sergey Levine, Adrian Li-Bell, Mohith Mothukuri, Suraj Nair, Karl Pertsch, Lucy Xiaoyang Shi, Laura Smith, James Tanner, Quan Vuong, Anna Walling, Haohuan Wang,"}],"limit":50,"offset":0}