{"work":{"id":"a0e2e7ab-9d0a-4fdb-9309-3c8cf7ed0226","openalex_id":null,"doi":null,"arxiv_id":"2402.12289","raw_key":null,"title":"DriveVLM: The Convergence of Autonomous Driving and Large Vision-Language Models","authors":null,"authors_text":"Xiaoyu Tian, Junru Gu, Bailin Li, Yicheng Liu, Yang Wang, Zhiyong Zhao","year":2024,"venue":"cs.CV","abstract":"A primary hurdle of autonomous driving in urban environments is understanding complex and long-tail scenarios, such as challenging road conditions and delicate human behaviors. We introduce DriveVLM, an autonomous driving system leveraging Vision-Language Models (VLMs) for enhanced scene understanding and planning capabilities. DriveVLM integrates a unique combination of reasoning modules for scene description, scene analysis, and hierarchical planning. Furthermore, recognizing the limitations of VLMs in spatial reasoning and heavy computational requirements, we propose DriveVLM-Dual, a hybrid system that synergizes the strengths of DriveVLM with the traditional autonomous driving pipeline. Experiments on both the nuScenes dataset and our SUP-AD dataset demonstrate the efficacy of DriveVLM and DriveVLM-Dual in handling complex and unpredictable driving conditions. Finally, we deploy the DriveVLM-Dual on a production vehicle, verifying it is effective in real-world autonomous driving environments.","external_url":"https://arxiv.org/abs/2402.12289","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T05:06:38.502115+00:00","pith_arxiv_id":"2402.12289","created_at":"2026-05-10T00:19:47.231506+00:00","updated_at":"2026-05-25T05:06:38.502115+00:00","title_quality_ok":true,"display_title":"DriveVLM: The Convergence of Autonomous Driving and Large Vision-Language Models","render_title":"DriveVLM: The Convergence of Autonomous Driving and Large Vision-Language Models"},"hub":{"state":{"work_id":"a0e2e7ab-9d0a-4fdb-9309-3c8cf7ed0226","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":52,"external_cited_by_count":null,"distinct_field_count":7,"first_pith_cited_at":"2024-06-12T17:59:21+00:00","last_pith_cited_at":"2026-05-22T02:31:32+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-31T12:32:41.025335+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":19},{"context_role":"baseline","n":1},{"context_role":"dataset","n":1}],"polarity_counts":[{"context_polarity":"background","n":19},{"context_polarity":"baseline","n":1},{"context_polarity":"use_dataset","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}