{"total":14,"items":[{"citing_arxiv_id":"2606.15753","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RoboPIN: Grounded Embodied Reasoning via Pinned Chain-of-Thought","primary_cat":"cs.AI","submitted_at":"2026-06-14T11:30:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces PinCoT paradigm with visual reasoning anchors, builds PIN-170K dataset via automated pipeline, and trains 4B RoboPIN model via three-stage post-training to outperform 7B baselines by 12% on embodied reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13497","ref_index":69,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SPARC: Reliable Spatial Annotations from Robot Demonstrations at Scale","primary_cat":"cs.RO","submitted_at":"2026-06-11T15:46:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SPARC generates reliable spatial annotations for robot demonstrations by leveraging spatio-temporal task structure, outperforming detection baselines on localization accuracy while retaining more samples and enabling competitive model performance without manual annotations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13040","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RoboProcessBench: Benchmarking Process-Aware Understanding in Vision-Language Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-11T08:20:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RoboProcessBench is a new benchmark decomposing process-aware understanding into static monitoring and dynamic reasoning across 12 question families, with evaluations showing VLM limitations but post-training gains on the provided data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11324","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Embodied-R1.5: Evolving Physical Intelligence via Embodied Foundation Models","primary_cat":"cs.RO","submitted_at":"2026-06-09T18:07:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Embodied-R1.5 is an 8B EFM achieving SOTA on 16 of 24 embodied VLM benchmarks, fine-tunable to outperform leading VLAs, with claimed zero-shot real-robot generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03890","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OVO-S-Bench: A Hierarchical Benchmark for Streaming Spatial Intelligence in Multimodal LLMs","primary_cat":"cs.CV","submitted_at":"2026-06-02T16:51:32+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OVO-S-Bench provides 1680 human-annotated questions on 348 videos to measure streaming spatial intelligence in MLLMs across instantaneous perception, spatiotemporal tracking, spatial simulation, and allocentric mapping.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28548","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GEM: Generative Supervision Helps Embodied Intelligence","primary_cat":"cs.CV","submitted_at":"2026-05-27T14:39:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GEM adds generative depth supervision to VLM pre-training and reports improved results on embodied benchmarks plus real-world robot execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25813","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Extending Embodied Question Answering from Perception to Decision","primary_cat":"cs.RO","submitted_at":"2026-05-25T13:08:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces EQA-Decision dataset with 4M+ QA pairs across four embodied reasoning dimensions and RoboDecision baseline for joint perception-reasoning-decision evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22536","ref_index":5,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SpaceDG: Benchmarking Spatial Intelligence under Visual Degradation","primary_cat":"cs.CV","submitted_at":"2026-05-21T14:25:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SpaceDG is the first large-scale benchmark dataset (~1M QA pairs) simulating nine visual degradations in 3DGS-rendered scenes to measure and improve spatial intelligence robustness in MLLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16713","ref_index":30,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GeoWorld-VLM: Geometry from World Models for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-15T23:52:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoWorld-VLM aligns VLM image features with intermediate representations from camera-conditioned world models via fine-tuning only the encoder and projector, yielding ~4% gains on What'sUp and VSR spatial benchmarks across two VLM backbones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08645","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"3D-VCD: Hallucination Mitigation in 3D-LLM Embodied Agents through Visual Contrastive Decoding","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:57:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"3D-VCD reduces hallucinations in 3D-LLM embodied agents by contrasting predictions from original and distorted 3D scene representations at inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02870","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Token Warping Helps MLLMs Look from Nearby Viewpoints","primary_cat":"cs.CV","submitted_at":"2026-04-03T08:37:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Backward token warping in ViT-based MLLMs enables reliable reasoning from nearby viewpoints by preserving semantic coherence better than pixel-wise warping or fine-tuning baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[56] HaotianLiu, ChunyuanLi, QingyangWu, andYongJaeLee. Visual instruction tuning. InNeurIPS, 2023. 3 [57] YuechengLiu,DafengChi,ShiguangWu,ZhanguangZhang, Yaochen Hu, Lingfeng Zhang, Yingxue Zhang, Shuang Wu, Tongtong Cao, Guowei Huang, et al. Spatialcot: Advancing spatial reasoning through coordinate alignment and chain- of-thought for embodied task planning.arXiv preprint arXiv:2501.10074, 2025. 2 [58] Gen Luo, Ganlin Yang, Ziyang Gong, Guanzhou Chen, Hao- nan Duan, Erfei Cui, Ronglei Tong, Zhi Hou, Tianyi Zhang, Zhe Chen, et al. Visual embodied brain: Let multimodal large language models see, think, and control in spaces. arXiv preprint arXiv:2506.00123, 2025. 15, 16 [59] Chenyang Ma, Kai Lu, Ta-Ying Cheng, Niki Trigoni, and Andrew Markham. Spatialpin: Enhancing spatial reasoning"},{"citing_arxiv_id":"2511.16518","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MiMo-Embodied: X-Embodied Foundation Model Technical Report","primary_cat":"cs.RO","submitted_at":"2025-11-20T16:34:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MiMo-Embodied is a single foundation model that achieves state-of-the-art results on 17 embodied AI benchmarks and 12 autonomous driving benchmarks through multi-stage learning, curated data, and CoT/RL fine-tuning that produces positive cross-domain transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.13778","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InternVLA-M1: A Spatially Guided Vision-Language-Action Framework for Generalist Robot Policy","primary_cat":"cs.RO","submitted_at":"2025-10-15T17:30:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVLA-M1 uses spatially guided pre-training on 2.3M examples followed by action post-training to deliver up to 17% gains on robot manipulation benchmarks and 20.6% on unseen objects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.12043","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Talk Less, Fly Lighter: Autonomous Semantic Compression for UAV Swarm Communication via LLMs","primary_cat":"cs.RO","submitted_at":"2025-08-16T13:37:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM-based autonomous semantic compression in four 2D UAV swarm simulations shows potential for efficient collaborative communication under bandwidth constraints.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}