{"total":38,"items":[{"citing_arxiv_id":"2606.31830","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PriorEye: Geospatial Visual Priors for End-to-End Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-06-30T15:36:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PriorEye augments end-to-end driving models with a dual-memory architecture that stores and gates geospatial visual priors to improve performance and robustness to sensor corruption on NAVSIM-v2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03159","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NVIDIA OmniDreams: Real-Time Generative World Model for Closed-Loop Autonomous Vehicle Simulation","primary_cat":"cs.CV","submitted_at":"2026-06-02T05:11:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"OmniDreams is a real-time generative world model mid- and post-trained from the Cosmos diffusion model on 21k hours of driving data to autoregressively generate action-conditioned videos for closed-loop AV simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02774","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoDrive-Bench: Benchmarking Region-Specific Multimodal Reasoning in Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-06-01T18:36:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GeoDrive-Bench is a new multimodal benchmark and distillation method for testing and improving VLMs on region-specific traffic-rule reasoning in autonomous driving across six countries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01036","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position: Good Embodied Reward Models Need Bad Behavior Data","primary_cat":"cs.RO","submitted_at":"2026-05-31T05:56:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Embodied reward models systematically over-reward unsafe, suboptimal, and shortcut robot behaviors due to training on successful data only, and modest inclusion of bad behavior data improves alignment with human preferences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00267","ref_index":125,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StressDream: Steering Video World Models for Robust Policy Evaluation and Improvement","primary_cat":"cs.CV","submitted_at":"2026-05-29T18:57:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StressDream optimizes initial noise in diffusion video world models using VLM semantic and plausibility objectives to steer generations toward specified high-impact outcomes for improved policy evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31572","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"nuReasoning: A Reasoning-Centric Dataset and Benchmark for Long-Tail Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:40:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"nuReasoning is a new real-world dataset and benchmark extending nuScenes/nuPlan with 20k clips and multi-type reasoning annotations to evaluate and improve reasoning in long-tail autonomous driving.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31041","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Does Visual Information Play a Decisive Role in Vision-Language-Action Model Driving Behavior?","primary_cat":"cs.CV","submitted_at":"2026-05-29T09:18:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A structured perturbation framework applied to VLA driving models reveals evaluation-dependent visual grounding patterns and uneven dependency across abstraction levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29114","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReasonBreak: Probing Vulnerabilities in Reasoning-Enabled Vision-Language-Action Models for Autonomous Driving","primary_cat":"cs.CR","submitted_at":"2026-05-27T21:21:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReasonBreak demonstrates up to 89% attack success on reasoning and 72% on trajectories in NVIDIA Alpamayo VLA models via black-box textual perturbations, introducing a reasoning-aware evaluation framework and benchmark for autonomous driving.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28544","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DriveWAM: Video Generative Priors Enable Scalable World-Action Modeling for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-27T14:36:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DriveWAM converts video generative priors into a unified video-action policy for driving, reporting strong benchmark performance and positive scaling from 4k to 100k clips.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27589","ref_index":65,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What-If World: A Causal Benchmark for General World Models in Embodied Scenarios","primary_cat":"cs.CV","submitted_at":"2026-05-26T19:02:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"What-If World is a new paired-prompt benchmark showing that nine state-of-the-art video generation models achieve at most 52% on causal intervention tests and cluster near 28% for open-source systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00104","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PEACE: A Planner-Executor Agent with Constraint Enforcement for UAVs","primary_cat":"cs.RO","submitted_at":"2026-05-26T10:03:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PEACE decouples single-pass LLM planning from PX4 execution via ROS 2 and a constraint layer, with modular 3D perception, and shows feasibility in Gazebo SITL with improved explainability and fewer LLM calls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22504","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LACO: Adaptive Latent Communication for Collaborative Driving","primary_cat":"cs.AI","submitted_at":"2026-05-21T13:54:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LACO introduces Iterative Latent Deliberation, Cross-Horizon Saliency Attribution, and Structured Semantic Knowledge Distillation to enable low-latency latent communication in collaborative driving while preserving performance in CARLA simulations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22089","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LVDrive: Latent Visual Representation Enhanced Vision-Language-Action Autonomous Driving Model","primary_cat":"cs.CV","submitted_at":"2026-05-21T07:31:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LVDrive improves closed-loop driving on Bench2Drive by adding latent future scene prediction to VLA models via unified embedding space processing and two-stage trajectory decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21917","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MAVEN: A Multi-stage Agentic Annotation Pipeline for Video Reasoning Tasks","primary_cat":"cs.CV","submitted_at":"2026-05-21T02:44:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MAVEN pipeline generates multi-scale spatio-temporal event descriptions from videos using agentic adaptation and refinement, then produces training data that lets a fine-tuned 8B model outperform Gemini baselines on private CCTV and AccidentBench tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21139","ref_index":51,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Distill to Think, Foresee to Act: Cognitive-Physical Reinforcement Learning for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-20T13:14:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoPhy is a new RL framework that distills VLM cognition into BEV encoders, adds an auto-regressive BEV world model for action-conditioned future prediction, and optimizes policies via GRPO with dual physical-cognitive rewards, claiming SOTA on NAVSIM v1/v2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21061","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grounding Driving VLA via Inverse Kinematics","primary_cat":"cs.CV","submitted_at":"2026-05-20T11:45:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"By adding future visual state prediction and a dedicated inverse kinematics diffusion network that uses only visual boundary conditions, a 0.5B driving VLA recovers visual grounding and matches 7-8B models on NAVSIM-v2 and nuScenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19120","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CosFly: Plan in the Matrix, Fly in the World","primary_cat":"cs.RO","submitted_at":"2026-05-18T21:11:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CosFly introduces a box-structured planning and multimodal simulation pipeline for aerial target tracking in CARLA, paired with the public CosFly-Track dataset containing 250 trajectories and approximately 100,000 rendered multi-modal images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17284","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CLAP: Contrastive Latent-space Prompt Optimization for End-to-end Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-17T06:45:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLAP reduces planning error on challenging driving scenarios by 24% on NAVSIM using contrastive latent-space prompt optimization on frozen VLA models with no regression on normal frames.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17268","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Is VLA Reasoning Faithful? Probing Safety of Chain-of-Causation in Autonomous Driving Models","primary_cat":"cs.AI","submitted_at":"2026-05-17T05:29:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLA driving models show 42.5% reasoning fidelity and 48.3% reasoning-action consistency, with 97.7% trajectory fragility under perturbations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16737","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DriveSafer: End-to-End Autonomous Driving with Safety Guidance","primary_cat":"cs.RO","submitted_at":"2026-05-16T01:21:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DriveSafer reduces catastrophic failures (PDMS=0) by 48% and drivable-area compliance failures by over 65% versus DiffusionDrive on the NAVSIM benchmark by combining training-time safety constraints with inference-time guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13646","ref_index":51,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Causality-Aware End-to-End Autonomous Driving via Ego-Centric Joint Scene Modeling","primary_cat":"cs.RO","submitted_at":"2026-05-13T15:06:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CaAD adds ego-centric joint-causal modeling and causality-aware policy alignment to end-to-end driving, reporting Driving Score 87.53 and PDMS 91.1 on Bench2Drive and NAVSIM.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[49] Royden Wagner, Omer Sahin Tas, Felix Hauser, Marlon Steiner, Dominik Strutz, Abhishek Vivekanandan, Carlos Fernandez, and Christoph Stiller. Retromotion: Retrocausal motion forecasting models are instructable.arXiv preprint arXiv:2505.20414, 2025. [50] Royden Wagner, Omer Sahin Tas, Marvin Klemp, and Carlos Fernandez. Jointmotion: joint self-supervision for joint motion prediction.arXiv preprint arXiv:2403.05489, 2024. [51] Yan Wang, Wenjie Luo, Junjie Bai, Yulong Cao, Tong Che, Ke Chen, Yuxiao Chen, Jenna Dia- mond, Yifan Ding, Wenhao Ding, et al. Alpamayo-r1: Bridging reasoning and action prediction for generalizable autonomous driving in the long tail.arXiv preprint arXiv:2511.00088, 2025. [52] Xinshuo Weng, Boris Ivanovic, Yan Wang, Yue Wang, and Marco Pavone. Para-drive: Par-"},{"citing_arxiv_id":"2605.12624","ref_index":29,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MindVLA-U1: VLA Beats VA with Unified Streaming Architecture for Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-05-12T18:09:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MindVLA-U1 is the first unified streaming VLA architecture that surpasses human drivers on WOD-E2E planning metrics while matching VA latency and preserving language interfaces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08975","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Latency Analysis and Optimization of Alpamayo 1 via Efficient Trajectory Generation","primary_cat":"cs.AI","submitted_at":"2026-05-09T14:34:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Redesigning Alpamayo 1 to single-reasoning and optimizing diffusion action generation cuts inference latency by 69.23% while preserving trajectory diversity and prediction quality.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"autonomous driving,\" arXiv preprint, 2025. [Online], Available: https: //arxiv.org/abs/2411.15139 [Accessed: March 25, 2026]. [7] Y . Wanget al., \"Alpamayo-R1: Bridging reasoning and action prediction for generalizable autonomous driving in the long tail,\" arXiv preprint, 2026. [Online], Available: https://arxiv.org/abs/2511.00088 [Accessed: March 25, 2026]. [8] J.-J. Hwanget al., \"EMMA: End-to-end multimodal model for au- tonomous driving,\" arXiv preprint, 2025. [Online], Available: https: //arxiv.org/abs/2410.2326 [Accessed: March 25, 2026]. [9] Z. Xuet al., \"DriveGPT4: Interpretable end-to-end autonomous driving via large language model,\"IEEE Robotics and Automation Letters, 2024. [10] J. Maoet al., \"GPT-Driver: Learning to drive with gpt,\" arXiv preprint,"},{"citing_arxiv_id":"2605.07514","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Is the Future Compatible? Diagnosing Dynamic Consistency in World Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-08T09:44:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Action-state consistency in World Action Models distinguishes successful from failed imagined futures and supports value-free selection of better rollouts via consensus among predictions.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"Hanlin Wang, Yinghao Xu, Shuailei Ma, et al. Advancing open-source world models.arXiv preprint arXiv:2601.20540, 2026. [34] Yalcin Tur, Jalal Naghiyev, Haoquan Fang, Wei-Chuan Tsai, Jiafei Duan, Dieter Fox, and Ranjay Krishna. Recurrent-depth vla: Implicit test-time compute scaling of vision-language-action models via latent iterative reasoning.arXiv preprint arXiv:2602.07845, 2026. [35] Yan Wang, Wenjie Luo, Junjie Bai, Yulong Cao, Tong Che, Ke Chen, Yuxiao Chen, Jenna Diamond, Yifan Ding, Wenhao Ding, et al. Alpamayo-r1: Bridging reasoning and action prediction for generalizable autonomous driving in the long tail.arXiv preprint arXiv:2511.00088, 2025. [36] Jialong Wu, Shaofeng Yin, Ningya Feng, and Mingsheng Long. RLVR-world: Training world models with"},{"citing_arxiv_id":"2604.22260","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Safe Mobility: A Unified Transportation Foundation Model enabled by Open-Ended Vision-Language Dataset","primary_cat":"cs.CV","submitted_at":"2026-04-24T06:09:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Creates LTD dataset for open-ended traffic VQA and trains UniVLT model to achieve SOTA on unified microscopic AD and macroscopic traffic reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21249","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reasoning About Traversability: Language-Guided Off-Road 3D Trajectory Planning","primary_cat":"cs.RO","submitted_at":"2026-04-23T03:26:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A language refinement framework with geometry-aware preference optimization lets VLMs generate more traversable 3D trajectories for off-road vehicles, yielding modest gains in error, traversability compliance, and elevation consistency on the ORAD-3D benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19710","ref_index":69,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SpanVLA: Efficient Action Bridging and Learning from Negative-Recovery Samples for Vision-Language-Action Model","primary_cat":"cs.CV","submitted_at":"2026-04-21T17:34:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SpanVLA reduces action generation latency via flow-matching conditioned on history and improves robustness by training on negative-recovery samples with GRPO and a dedicated reasoning dataset.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"action bridging and learned from real-world negative-recovery samples, as illus- trated in Fig. 1. To overcome the linearly increasing latency of autoregressive decoding with respect to action length, we introduce an efficient action bridging with a flow-matching action expert. First, unlike prior designs that rely solely on the final-layer [15,40] or dense full-layer features [68,69], our efficient action bridging aggregates multi-granular features from multiple sparse layers of the VLM, capturing different levels of information from raw vision to final reason- ing. Then, based on the extracted feature, we introduce a flow-matching-based action expert to generate high-frequency, multi-modal trajectories. Instead of learning the flow directly from random noise [40,68], our formulation condi-"},{"citing_arxiv_id":"2604.18486","ref_index":104,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Xiaomi OneVL: One-Step Latent Reasoning and Planning with Vision-Language Explanation","primary_cat":"cs.CV","submitted_at":"2026-04-20T16:37:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OneVL achieves superior accuracy to explicit chain-of-thought reasoning at answer-only latency by supervising latent tokens with a visual world model decoder that predicts future frames.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Closer to trajectory prediction, recent VLA models pair language reasoning with waypoint or action outputs [68, 69, 115]. DriveVLA-W0 [61] employs world modeling to generate dense self-supervised signals that amplify data scaling laws in VLA-based driving. AdaThinkDrive [78] introduces adaptive CoT for driving decisions, LaST-VLA [77] trains a large vision-language-action model on driving data, and Alpamayo-R1 [104] explicitly bridges reasoning traces with long-tail action prediction. OneVL builds on these foundations by addressing the latency cost of explicit CoT through dual-modal latent supervision and prefill inference, delivering competitive performance without sacrificing interpretability. 2.3 World Modeling for Autonomous Driving The concept of the world model originates from model-based reinforcement learning, where it seeks to"},{"citing_arxiv_id":"2604.17176","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Intent-aligned Autonomous Spacecraft Guidance via Reasoning Models","primary_cat":"eess.SY","submitted_at":"2026-04-19T00:25:54+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10856","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BridgeSim: Unveiling the OL-CL Gap in End-to-End Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-04-12T23:37:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The primary OL-CL gap in end-to-end autonomous driving arises from objective mismatch creating structural inability to model reactive behaviors, which a test-time adaptation method can mitigate.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Autonomous Intelligent Systems, 1(1): 11, 2021. [58] R. C. Coulter. Implementation of the pure pursuit path tracking algorithm. Technical report, 1992. [59] K. Li, Z. Li, S. Lan, Y . Xie, Z. Zhang, J. Liu, Z. Wu, Z. Yu, and J. M. Alvarez. Hydra- mdp++: Advancing end-to-end driving via expert-guided hydra-distillation.arXiv preprint arXiv:2503.12820, 2025. [60] Y . Wang, W. Luo, J. Bai, Y . Cao, T. Che, K. Chen, Y . Chen, J. Diamond, Y . Ding, W. Ding, et al. Alpamayo-r1: Bridging reasoning and action prediction for generalizable autonomous driving in the long tail.arXiv preprint arXiv:2511.00088, 2025. [61] comma.ai. openpilot.https://github.com/commaai/openpilot, 2026. [62] W. Peebles and S. Xie. Scalable diffusion models with transformers."},{"citing_arxiv_id":"2604.08266","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Orion-Lite: Distilling LLM Reasoning into Efficient Vision-Only Driving Models","primary_cat":"cs.CV","submitted_at":"2026-04-09T13:51:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Orion-Lite uses latent feature distillation and trajectory supervision to create a vision-only model that surpasses its LLM-based teacher on closed-loop Bench2Drive evaluation, achieving a new SOTA driving score of 80.6.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08031","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Open-Ended Instruction Realization with LLM-Enabled Multi-Planner Scheduling in Autonomous Vehicles","primary_cat":"cs.RO","submitted_at":"2026-04-09T09:32:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM-driven multi-planner scheduling framework turns open-ended passenger instructions into safe, traceable control signals for autonomous vehicles while cutting query costs and matching specialized safety levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03497","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Sim2Real-AD: A Modular Sim-to-Real Framework for Deploying VLM-Guided Reinforcement Learning in Real-World Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-04-03T22:41:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sim2Real-AD enables zero-shot transfer of CARLA-trained VLM-guided RL policies to full-scale vehicles, reporting 75-90% success rates in car-following, obstacle avoidance, and stop-sign scenarios without real-world RL training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02714","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ExploreVLA: Dense World Modeling and Exploration for End-to-End Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-04-03T04:14:13+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"detection and planning within a single VLM, and DiffVLA [21] combines diffusion- based trajectory sampling with language-conditioned embeddings. Most recently, reasoning-augmented VLA models have pushed the frontier further. ORION [11] incorporates a transformer memory module for long-horizon reasoning, and Au- toVLA [58] fuses CoT reasoning and trajectory planning in a single autoregressive transformer. Alpamayo-R1 [39] introduces causally grounded Chain-of-Causation reasoning tightly integrated with trajectory prediction, enhancing reasoning- action consistency and long-tail safety performance. Despite this rapid progress, most existing VLA models rely on textual descriptions and action trajectories as the primary supervisory signal, which are inherently sparse."},{"citing_arxiv_id":"2603.00696","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DRIV-EX: Counterfactual Explanations for Driving LLMs","primary_cat":"cs.CL","submitted_at":"2026-02-28T15:12:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIV-EX generates fluent counterfactual scene descriptions by using gradient-optimized embeddings only as a guide for controlled text decoding, producing more reliable explanations than baselines on transcribed highD driving data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.22801","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unleashing the Potential of Diffusion Models for End-to-End Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-02-26T09:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces Hyper Diffusion Planner (HDP), a diffusion-based E2E AD framework that identifies insights on loss space, trajectory representation and data scaling, adds RL post-training, and reports 10x performance gains over 200 km of real-world testing across 6 scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.10226","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Latent Chain-of-Thought World Modeling for End-to-End Driving","primary_cat":"cs.CV","submitted_at":"2025-12-11T02:22:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LCDrive unifies chain-of-thought reasoning and action selection for end-to-end driving by interleaving action-proposal tokens and latent world-model tokens that predict action outcomes, yielding faster inference and better trajectories than text-based or non-reasoning baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.15925","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VERDI: VLM-Embedded Reasoning for Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2025-05-21T18:24:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VERDI aligns perception, prediction, and planning outputs of end-to-end AD models with VLM-generated text features at training time to embed structured reasoning, yielding up to 11% better l2 distance and 10% higher non-collision rate in closed-loop tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}