{"total":14,"items":[{"citing_arxiv_id":"2606.30613","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sequential Planning via Anchored Robotic Keypoints","primary_cat":"cs.RO","submitted_at":"2026-06-29T17:48:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPARK reaches 43.7% success on six LIBERO-PRO cells by LLM-generated typed behavior trees plus multi-prompt perception and recovery, more than doubling CaP-Agent0 and VLA baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30552","ref_index":23,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training Vision-Language-Action Models with Dense Embodied Chain-of-Thought Supervision","primary_cat":"cs.RO","submitted_at":"2026-06-29T16:48:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZR-0 is a dual-stream VLA model trained with dense ECoT supervision on 60M frames from 400K trajectories to enable cross-embodiment transfer in simulation and real-world settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27375","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scalable Behavior Cloning with Open Data, Training, and Evaluation","primary_cat":"cs.RO","submitted_at":"2026-06-25T17:59:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Releases the largest open teleoperation dataset for robot manipulation together with hardware, simulation, and training infrastructure to support scalable behavior cloning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27295","ref_index":13,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LA4VLA: Learning to Act without Seeing via Language-Action Pretraining","primary_cat":"cs.RO","submitted_at":"2026-06-25T17:13:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LA4VLA creates a 33K language-action dataset from existing demos and shows that pretraining on language-action pairs before or alongside vision-language-action training boosts success rates in sim and real robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21386","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLA-FAIL: Efficient Task Failure Detection for Finetuned Vision-Language-Action Models","primary_cat":"cs.LG","submitted_at":"2026-06-19T12:51:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLA-FAIL introduces last-layer Mahalanobis distance and action chunk consistency detectors that together enable early, reliable failure detection in finetuned VLAs without failure data or expensive sampling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21372","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NAC: Neural Action Codec for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-19T12:24:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NAC adapts multi-scale RVQGAN audio codecs with kinematic-specific losses to produce ordered action tokens that yield lower reconstruction error and higher task success than prior tokenizers in VLA models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20521","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HumanScale: Egocentric Human Video Can Outperform Real-Robot Data for Embodied Pretraining","primary_cat":"cs.CV","submitted_at":"2026-06-18T17:37:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Processed egocentric human video outperforms teleoperated real-robot trajectories as pretraining data for embodied foundation models, delivering 24% lower validation loss and 52.5-90% higher task success rates under matched post-training protocols.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18558","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MolmoMotion: Forecasting Point Trajectories in 3D with Language Instruction","primary_cat":"cs.CV","submitted_at":"2026-06-17T00:19:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a new task of goal-conditioned 3D point motion forecasting along with a 1.16M-video dataset, a 111-category benchmark, and a model that outperforms baselines while transferring to robotics and video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18363","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Guava: An Effective and Universal Harness for Embodied Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-16T18:09:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Guava harness enables 4B open-source models to achieve performance comparable to frontier models on embodied manipulation tasks by distilling capabilities from under 2K simulation trajectories using three identified design principles.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17846","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Qwen-RobotManip Technical Report: Alignment Unlocks Scale for Robotic Manipulation Foundation Models","primary_cat":"cs.RO","submitted_at":"2026-06-16T12:14:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Qwen-RobotManip applies unified alignment across representation, motion, and behavior to enable large-scale training on heterogeneous manipulation data, yielding emergent generalization on out-of-distribution robotic benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13497","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPARC: Reliable Spatial Annotations from Robot Demonstrations at Scale","primary_cat":"cs.RO","submitted_at":"2026-06-11T15:46:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SPARC generates reliable spatial annotations for robot demonstrations by leveraging spatio-temporal task structure, outperforming detection baselines on localization accuracy while retaining more samples and enabling competitive model performance without manual annotations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12299","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning What to Say to Your VLA: Mostly Harmless Vision Language Action Model Steering","primary_cat":"cs.RO","submitted_at":"2026-06-10T16:34:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A search-and-distill framework with conformalized improvement head produces a language feedback policy that boosts frozen VLA performance by 24.7% in simulation and 65% on hardware while guaranteeing harmlessness on perturbations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07723","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VoLo: A Physical Orchestrator for Open-Vocabulary Long-Horizon Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-05T16:21:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VoLoAgent uses a VLM to steer heterogeneous robot capabilities as interruptible tools for long-horizon manipulation and introduces the RoboVoLo benchmark, claiming substantial outperformance over single VLA/VLM or tool-based systems with real-robot validation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07107","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Coarse-to-Control: Action-Token Planning for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-06-05T10:01:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Coarse-to-Control adds planning via coarse action tokens in the same vocabulary as control actions, improving VLA performance on long-horizon manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}