{"total":36,"items":[{"citing_arxiv_id":"2605.31041","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Does Visual Information Play a Decisive Role in Vision-Language-Action Model Driving Behavior?","primary_cat":"cs.CV","submitted_at":"2026-05-29T09:18:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A structured perturbation framework applied to VLA driving models reveals evaluation-dependent visual grounding patterns and uneven dependency across abstraction levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27038","ref_index":82,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TPS-Drive: Task-Guided Representation Purification for VLM-based Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-05-26T13:56:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TPS-Drive uses an agent-centric tokenizer supervised by a frozen 3D detection head to purify VLM spatial representations, enabling better scene forecasting and lower collision rates on nuScenes and NAVSIM benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26113","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AnyScene: Towards Highly Controllable Driving Scene Generation at Anywhere and Beyond","primary_cat":"cs.RO","submitted_at":"2026-05-25T17:59:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AnyScene is an occupancy-centric framework using a Spatial-Temporal Occupancy Diffusion Transformer and Geometry-Grounded View Expansion to generate controllable driving scenes and videos from BEV layouts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22504","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LACO: Adaptive Latent Communication for Collaborative Driving","primary_cat":"cs.AI","submitted_at":"2026-05-21T13:54:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LACO introduces Iterative Latent Deliberation, Cross-Horizon Saliency Attribution, and Structured Semantic Knowledge Distillation to enable low-latency latent communication in collaborative driving while preserving performance in CARLA simulations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22089","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LVDrive: Latent Visual Representation Enhanced Vision-Language-Action Autonomous Driving Model","primary_cat":"cs.CV","submitted_at":"2026-05-21T07:31:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LVDrive improves closed-loop driving on Bench2Drive by adding latent future scene prediction to VLA models via unified embedding space processing and two-stage trajectory decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21061","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grounding Driving VLA via Inverse Kinematics","primary_cat":"cs.CV","submitted_at":"2026-05-20T11:45:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"By adding future visual state prediction and a dedicated inverse kinematics diffusion network that uses only visual boundary conditions, a 0.5B driving VLA recovers visual grounding and matches 7-8B models on NAVSIM-v2 and nuScenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19524","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SafeAlign-VLA: A Negative-Enhanced Safe Alignment Framework for Risk-Aware Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-05-19T08:26:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SafeAlign-VLA uses counterfactual safety pairing and anchor-based group relative policy optimization to incorporate negative data for safer VLA-based autonomous driving.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17284","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CLAP: Contrastive Latent-space Prompt Optimization for End-to-end Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-17T06:45:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLAP reduces planning error on challenging driving scenarios by 24% on NAVSIM using contrastive latent-space prompt optimization on frozen VLA models with no regression on normal frames.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15120","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CLOVER: Closed-Loop Value Estimation and Ranking for End-to-End Autonomous Driving Planning","primary_cat":"cs.RO","submitted_at":"2026-05-14T17:32:18+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLOVER is a closed-loop generator-scorer framework that expands proposal coverage with pseudo-expert trajectories and performs conservative self-distillation to achieve state-of-the-art planning scores on NAVSIM and nuScenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14696","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"EponaV2: Driving World Model with Comprehensive Future Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-14T11:12:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EponaV2 advances perception-free driving world models by forecasting comprehensive future 3D geometry and semantic representations, achieving SOTA planning performance on NAVSIM benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"aware register tokens, and both RAP [14] and SimScale [69] propose scalable data augmentation pipelines. Recently, the success of VLMs has proven their abilities for different tasks [ 2], and fine-tuning VLMs to Vision-Language-Action (VLA) models for autonomous driving tasks might be a possible way [ 70, 15, 7, 34, 86, 64, 51, 31, 87, 60, 19, 47, 55, 32, 33, 93, 74, 52, 82, 79, 66]. Notable examples include AutoVLA [ 98], ReCogDrive [38], and AutoDrive-P3 [84], which perceive environments via a question-and-answer format and improve planning through DPPO [57]. Furthermore, VGGDrive [71] integrates 3D-aware models to enhance the geometric perception of VLA frameworks. Despite these advancements, perception-based models rely heavily on labor- intensive annotations, which hinders their ability to scale with massive datasets."},{"citing_arxiv_id":"2605.13646","ref_index":25,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Causality-Aware End-to-End Autonomous Driving via Ego-Centric Joint Scene Modeling","primary_cat":"cs.RO","submitted_at":"2026-05-13T15:06:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CaAD adds ego-centric joint-causal modeling and causality-aware policy alignment to end-to-end driving, reporting Driving Score 87.53 and PDMS 91.1 on Bench2Drive and NAVSIM.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This enables the planner to reason over interactive futures while maintaining the computational efficiency of planning-oriented E2E architectures. Supervised imitation alone does not directly optimize closed-loop driving quality, since pointwise trajectory losses may underweight key objectives such as safety, map compliance, comfort, and long- horizon progress [14, 25]. Accordingly, although some methods [25, 39, 43] further refine the ego policy with reward or preference signals, such refinement remains causality-limited, as it is still built on marginal or ego-focused planning representations. In contrast, as shown in Step 2 of Fig. 1(b), our method performs causality-aware policy alignment on ego-centric joint-mode embeddings, allowing reinforcement learning to refine an ego policy that has already been shaped by joint-causal modeling."},{"citing_arxiv_id":"2605.12624","ref_index":27,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MindVLA-U1: VLA Beats VA with Unified Streaming Architecture for Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-05-12T18:09:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MindVLA-U1 is the first unified streaming VLA architecture that surpasses human drivers on WOD-E2E planning metrics while matching VA latency and preserving language interfaces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10426","ref_index":50,"ref_count":4,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CoWorld-VLA: Thinking in a Multi-Expert World Model for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-11T12:01:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoWorld-VLA extracts semantic, geometric, dynamic, and trajectory expert tokens from multi-source supervision and feeds them into a diffusion-based hierarchical planner, achieving competitive collision avoidance and trajectory accuracy on the NAVSIM v1 benchmark.","context_count":2,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"prediction as textual reasoning or generation. However, recent studies show that lengthy textual reasoning may increase inference latency and weaken critical visual information [ 22, 23, 50-52]. To address these issues, recent work explores latent reasoning in continuous latent spaces [24-27]. Representative methods include DriveMoE [53], ReCogDrive [50], and LaST-VLA [54]. Neverthe- less, existing latent reasoning approaches still lack sufficient physical and semantic constraints for structured planning representations. World models for autonomous driving.World models are widely used in autonomous driving to capture spatio-temporal dynamics and predict future scene evolution [ 31, 40, 55, 56]. Early"},{"citing_arxiv_id":"2605.09701","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DriveFuture: Future-Aware Latent World Models for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-10T18:45:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DriveFuture achieves SOTA results on NAVSIM by conditioning latent world model states on future predictions to directly inform trajectory planning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"costly pixel-level generation. Second, latent representations can abstract away low-level visual details and focus on planning-relevant scene structure, interactions, and action consequences. Third, their temporally structured representations are naturally suited for long-horizon planning. This line of work has rapidly evolved from latent future prediction [17] to intention-aware latent planning [18], planning-oriented representation refinement with reinforcement fine-tuning [19], and more recent unifications with VLA/planners and policy scaling [20-23]. These developments indicate that latent world models are no longer merely an efficient substitute for observation-space simulation, but are increasingly becoming a general representation substrate for scalable autonomous driving."},{"citing_arxiv_id":"2605.08830","ref_index":29,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VECTOR-Drive: Tightly Coupled Vision-Language and Trajectory Expert Routing for End-to-End Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-09T09:34:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VECTOR-DRIVE uses shared self-attention with semantic-aware expert routing of tokens to VL and trajectory experts plus flow-matching action decoding to reach 88.91 driving score on Bench2Drive.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"sual observations, navigation cues, ego states, and language prompts, followed by a trajectory or control head [13], [15]- [17]. Recent works further improve driving-oriented semantic representations. FLARE learns future-aware latent representa- tions from VLM features, while ReCogDrive combines VLM- based driving cognition with a diffusion planner for reasoning- guided planning [28], [29]. These studies demonstrate the value of VLM priors for long-tail scene understanding and interpretable driving decisions. However, most VLA driving models still rely on largely shared computation for language reasoning and motion predic- tion. Although shared backbones preserve multimodal interac- tion, the same transformer and FFN parameters must support"},{"citing_arxiv_id":"2605.04647","ref_index":112,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ReflectDrive-2: Reinforcement-Learning-Aligned Self-Editing for Discrete Diffusion Driving","primary_cat":"cs.RO","submitted_at":"2026-05-06T08:52:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReflectDrive-2 combines masked discrete diffusion with RL-aligned self-editing to generate and refine driving trajectories, reaching 91.0 PDMS on NAVSIM camera-only and 94.8 in best-of-6.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04470","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CRAFT: Counterfactual-to-Interactive Reinforcement Fine-Tuning for Driving Policies","primary_cat":"cs.LG","submitted_at":"2026-05-06T03:49:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CRAFT is an on-policy RL fine-tuning framework that decomposes closed-loop policy gradients into a group-normalized counterfactual proxy plus residual correction from interaction events, achieving top closed-loop performance on Bench2Drive across multiple driving architectures.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"proxy, preserving grounded feedback without estimating full advantages from sparse rollouts. Counterfactual Post-Training in Autonomous Driving.Counterfactual post-training improves learning efficiency through dense local comparisons on real visited states [32], making preference- based and group-relative objectives attractive [8, 33]. Recent methods combine PDM Score [34, 35, 36, 37, 38, 39] with GRPO in NA VSIM [40] for policy fine-tuning, or use synthetic counterfactual data [41, 42] for offline post-training. A key limitation lies in the rollout condition, since counterfac- tual futures generated offline or in a non-reactive simulator do not capture true closed-loop interaction with the environment. CRAFT uses counterfactual futures as a dense proxy on real visited states,"},{"citing_arxiv_id":"2604.28111","ref_index":12,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GSDrive: Reinforcing Driving Policies by Multi-mode Future Trajectory Probing with 3D Gaussian Splatting Environment","primary_cat":"cs.RO","submitted_at":"2026-04-30T16:59:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GSDrive combines IL priors with RL feedback by probing multi-mode futures inside a 3D Gaussian Splatting simulator to supply dense rewards for closed-loop driving policy improvement on nuScenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22260","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards Safe Mobility: A Unified Transportation Foundation Model enabled by Open-Ended Vision-Language Dataset","primary_cat":"cs.CV","submitted_at":"2026-04-24T06:09:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Creates LTD dataset for open-ended traffic VQA and trains UniVLT model to achieve SOTA on unified microscopic AD and macroscopic traffic reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19710","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SpanVLA: Efficient Action Bridging and Learning from Negative-Recovery Samples for Vision-Language-Action Model","primary_cat":"cs.CV","submitted_at":"2026-04-21T17:34:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SpanVLA reduces action generation latency via flow-matching conditioned on history and improves robustness by training on negative-recovery samples with GRPO and a dedicated reasoning dataset.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"T able 1:Comparison with SOTA methods on theNA VSIM v1(navtest). PDMS (Predictive Driver Model Score), NC (No Collision), DAC (Drivable Area Compliance), EP (Ego Process), TTC (Time-To-Collision), Comf. (Comfort), Methods Cam. Lid. PDMS↑ NC↑DAC↑EP↑TTC↑Comf.↑ Conventional End-to-end-based Methods TransFuser [6] ✓ ✓ 84.0 97.8 92.6 78.9 92.9100.0 DRAMA [76] ✓ ✓ 86.9 98.2 95.2 81.3 94.2100.0 Hydra-MDP [41] ✓ ✓ 86.5 98.3 96.0 78.7 94.6100.0 DiffusionDrive [42] ✓ ✓ 88.1 98.2 96.2 82.2 94.7100.0 WoTE [39] ✓ ✓ 88.3 98.5 96.8 81.9 94.4 99.9 VLA-based Methods ReCogDrive [38] ✓- 89.6 98.2 97.8 83.5 95.2 99.8 DriveVLA-W0 [38] ✓- 90.2 98.799.183.3 95.3 99.3 AutoVLA [38] ✓- 89.1 98.4 95.6 81.998.099.9 Ours SpanVLA (One-shot) ✓- 82.1 97.5 90.8 76.9 93.7 99.5 SpanVLA (Post-RFT) ✓- 90."},{"citing_arxiv_id":"2604.18486","ref_index":62,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Xiaomi OneVL: One-Step Latent Reasoning and Planning with Vision-Language Explanation","primary_cat":"cs.CV","submitted_at":"2026-04-20T16:37:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OneVL achieves superior accuracy to explicit chain-of-thought reasoning at answer-only latency by supervising latent tokens with a visual world model decoder that predicts future frames.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"InIEEE/CVF International Conference on Computer Vision, pages 27137-27146, 2025. 26 [61] Yingyan Li, Shuyao Shang, Weisong Liu, Bing Zhan, Haochen Wang, Yuqi Wang, Yuntao Chen, Xiaoman Wang, Yasong An, Chufeng Tang, Lu Hou, Lue Fan, and Zhaoxiang Zhang. DriveVLA-W0: World models amplify data scaling law in autonomous driving. InInternational Conference on Learning Representations, 2026. [62] Yongkang Li, Kaixin Xiong, Xiangyu Guo, Fang Li, Sixu Yan, Gangwei Xu, Lijun Zhou, Long Chen, Haiyang Sun, Bing Wang, Kun Ma, Guang Chen, Hangjun Ye, Wenyu Liu, and Xinggang Wang. ReCogDrive: A reinforced cognitive framework for end-to-end autonomous driving.arXiv preprint arXiv:2506.08052, 2025. [63] Alan Liang, Youquan Liu, Yu Yang, Dongyue Lu, Linfeng Li, Lingdong Kong, Huaici Zhao, and Wei Tsang"},{"citing_arxiv_id":"2604.17915","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OneDrive: Unified Multi-Paradigm Driving with Vision-Language-Action Models","primary_cat":"cs.CV","submitted_at":"2026-04-20T07:50:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OneDrive unifies heterogeneous decoding in a single VLM transformer decoder for end-to-end driving, achieving 0.28 L2 error and 0.18 collision rate on nuScenes plus 86.8 PDMS on NAVSIM.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Keywords:End-to-end Autonomous Driving·Vision Language Model arXiv:2604.17915v1 [cs.CV] 20 Apr 2026 2 Y.Zhang et al. 1 Introduction Recent breakthroughs in Vision Language Models (VLMs) [1,10,45,61] highlight their extraordinary multimodal reasoning capabilities. This success naturally in- spires the pursuit of Vision Language Action (VLA) models for autonomous driv- ing [28,60]. Yet integrating these foundational models into driving systems typ- ically demands intricate 3D structural modifications like Bird Eye View (BEV) modeling [18,24]. Such severe architectural deviations from native VLMs hinder the possibility of joint training with massive general domain data, fundamentally bottlenecking model scalability. Conversely, attempting to preserve the original"},{"citing_arxiv_id":"2604.15308","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RAD-2: Scaling Reinforcement Learning in a Generator-Discriminator Framework","primary_cat":"cs.CV","submitted_at":"2026-04-16T17:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RAD-2 uses a diffusion generator and RL discriminator to cut collision rates by 56% in closed-loop autonomous driving planning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"uating a more expressive manifold of future possibilities. 2.2 RL for Autonomous Driving Reinforcement learning (RL) [42, 44, 55, 59] has been widely explored to mitigate the causal confusion and poor generalization issues of imitation learning. While recent works integrate RL with 3DGS-based digital twins [7], reasoning-oriented fine-tuning [18], or GRPO-based gener- ation [29, 34, 66], optimizing high-dimensional driving out- puts (e.g., raw trajectories) under sparse rewards remains notoriously difficult due to severe credit assignment chal- lenges [13, 29, 35, 38, 39, 66]. Unlike these direct opti- mization approaches, we utilize RL rewards to train a low- dimensional trajectory discriminator, effectively reformu- lating the complex planning task into a tractable preference"},{"citing_arxiv_id":"2604.11734","ref_index":14,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SCORP: Scene-Consistent Multi-agent Diffusion Planning with Stable Online Reinforcement Post-Training for Cooperative Driving","primary_cat":"cs.RO","submitted_at":"2026-04-13T17:13:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SCORP delivers 10-28% gains in safety and 2-7% in efficiency metrics on WOMD by using dual-path scene conditioning in diffusion planning plus variance-gated group-relative policy optimization for closed-loop stability.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"based multi-agent planners in real-world deployment [9]. Reinforcement learning (RL) offers a compelling alter- native for enhancing pretrained planning models by cou- pling sampling-based exploration with reward-driven policy optimization [10], [11]. Recent studies show that RL post- training can improve human-likeness, planning quality, and trajectory stability [12]-[14]. In this paradigm, a pretrained trajectory planner serves as the actor, sampling diverse candidate futures that are scored by a reward function and iteratively refined through RL algorithms [15], [16]. However, most existing work focuses onofflinepost-training, arXiv:2604.11734v2 [cs.RO] 14 Apr 2026 2 which is closer in spirit to reward-augmented supervised"},{"citing_arxiv_id":"2604.09159","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Truncated Rectified Flow Policy for Reinforcement Learning with One-Step Sampling","primary_cat":"cs.LG","submitted_at":"2026-04-10T09:44:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TRFP combines rectified flow models with truncation to support multimodal policies in MaxEnt RL while allowing fast one-step sampling and stable training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04857","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Blind Spot of Adaptation: Quantifying and Mitigating Forgetting in Fine-tuned Driving Models","primary_cat":"cs.CV","submitted_at":"2026-04-06T17:02:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Fine-tuning VLMs for driving erodes pre-trained world knowledge, but shifting adaptation to prompt space via the Drive Expert Adapter preserves generalization while improving task performance.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"enhance generalization in unseen long-tail scenarios. But strikingly, this is rarely explored in VLM-centric methods, and existing benchmarks are ill-equipped to even detect such degradation. To demonstrate the real-world con- sequences of this neglect, we conducted a visual analysis on long-tail scenarios. As shown in Fig. 2, catastrophic for- getting causes RecogDrive [24] to overlook obstacles like 1 arXiv:2604.04857v1 [cs.CV] 6 Apr 2026 curbs and rocks that its base model, InternVL3-8B [63], pre- viously recognized, leading to unsafe trajectories. Our anal- ysis further reveals this is a widespread issue (see appendix), posing a substantial safety risk. This brings us to the central question of our work:how can we quantitatively evaluate"},{"citing_arxiv_id":"2604.02714","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ExploreVLA: Dense World Modeling and Exploration for End-to-End Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-04-03T04:14:13+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"source of intrinsic reward signals that encourage the policy to explore diverse and informative driving behaviors, thereby improving generalization beyond the coverage of the training data. 2.2 VLA Models for Autonomous Driving The integration of vision, language, and action within a unified framework has emerged as a promising paradigm for autonomous driving [23,29]. Early efforts, such as DriveGPT-4 [43], use frozen VLMs to narrate driving scenes but do not directly output control signals, serving only as passive explainers. Subsequent modular VLA approaches began embedding language into the planning loop. For example, OpenDriveVLA [57] fuses multimodal sensor inputs with textual route instructions to generate interpretable waypoints, while RAG-Driver [51]"},{"citing_arxiv_id":"2604.00813","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DVGT-2: Vision-Geometry-Action Model for Autonomous Driving at Scale","primary_cat":"cs.CV","submitted_at":"2026-04-01T12:21:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DVGT-2 is a streaming vision-geometry-action model that jointly reconstructs dense 3D geometry and plans trajectories online, achieving better reconstruction than prior batch methods while transferring directly to planning benchmarks without fine-tuning.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"ARTEMIS [8] C & L Map & Box 98.3 95.1 94.3 100 81.4 87.0 DiffusionDrive [41]C & L Map & Box 98.2 96.2 94.7 100 82.2 88.1 WoTE [35] C & L Map & Box 98.5 96.8 94.9 99.9 81.9 88.3 DriveSuprim [82] C & L Map & Box 97.8 97.3 93.6 100 86.7 89.9 AutoVLA [90] C Language 96.9 92.4 88.1 99.9 75.8 80.5 AdaThinkDrive [53]C Language 98.5 94.4 94.9 100 79.9 86.2 ReCogDrive [37] C Language 98.3 95.1 94.3 100 81.1 86.8 DriveVLA-W0 [34] C Future States 98.7 99.1 95.3 99.3 83.3 90.2 AutoVLA† [90] C Language & RL 98.4 95.6 98.0 99.9 85.9 89.1 ReCogDrive† [37] C Language & RL 98.2 97.8 95.2 99.8 83.5 89.6 DVGT-2 C Dense Geometry 97.8 97.2 93.9 100 83.4 88.6 DVGT-2-NAVSIMC Dense Geometry 98.7 97.9 95.8 100 84.3 90.3 Table 6: Closed-loop planning results on NAVSIM v2navtestsplit."},{"citing_arxiv_id":"2603.19675","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DynFlowDrive: Flow-Based Dynamic World Modeling for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-03-20T06:19:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DynFlowDrive models action-conditioned scene transitions via rectified flow in latent space and adds stability-aware trajectory selection, showing gains on nuScenes and NavSim without added inference cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.13842","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fine-tuning is Not Enough: A Parallel Framework for Collaborative Imitation and Reinforcement Learning in End-to-end Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-03-14T08:53:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PaIR-Drive runs IL and RL in parallel branches with a tree-structured sampler to reach 91.2 PDMS and 87.9 EPDMS on NAVSIM benchmarks while outperforming sequential RL fine-tuning and correcting some human errors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.22801","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unleashing the Potential of Diffusion Models for End-to-End Autonomous Driving","primary_cat":"cs.RO","submitted_at":"2026-02-26T09:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces Hyper Diffusion Planner (HDP), a diffusion-based E2E AD framework that identifies insights on loss space, trajectory representation and data scaling, adds RL post-training, and reports 10x performance gains over 200 km of real-world testing across 6 scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.23421","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DriveLaW:Unifying Planning and Video Generation in a Latent Driving World","primary_cat":"cs.CV","submitted_at":"2025-12-29T12:32:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DriveLaW unifies video world modeling and trajectory planning by injecting video-generator latents into a diffusion planner, achieving SOTA video prediction and a new record on the NAVSIM planning benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.18662","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pseudo-Expert Regularized Offline RL for End-to-End Autonomous Driving in Photorealistic Closed-Loop Environments","primary_cat":"cs.RO","submitted_at":"2025-12-21T09:21:04+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pseudo-expert regularized offline RL reduces collisions and improves route completion for camera-based driving models trained on fixed simulator datasets from nuScenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.23369","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SimScale: Learning to Drive via Real-World Simulation at Scale","primary_cat":"cs.CV","submitted_at":"2025-11-28T17:17:38+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SimScale synthesizes unseen driving states from real logs via neural rendering and reactive environments, generates pseudo-expert trajectories, and shows that co-training on real plus simulated data improves planning robustness and generalization on real benchmarks, with gains scaling by simulation ","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.16518","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MiMo-Embodied: X-Embodied Foundation Model Technical Report","primary_cat":"cs.RO","submitted_at":"2025-11-20T16:34:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MiMo-Embodied is a single foundation model that achieves state-of-the-art results on 17 embodied AI benchmarks and 12 autonomous driving benchmarks through multi-stage learning, curated data, and CoT/RL fine-tuning that produces positive cross-domain transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.00088","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Alpamayo-R1: Bridging Reasoning and Action Prediction for Generalizable Autonomous Driving in the Long Tail","primary_cat":"cs.RO","submitted_at":"2025-10-30T01:25:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Alpamayo-R1 introduces a VLA model with a Chain of Causation dataset and multi-stage SFT-plus-RL training that reports 12% better planning accuracy and 35% fewer close encounters versus trajectory-only baselines in driving tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}