{"work":{"id":"c3fa58f6-0efb-4d29-a673-a2f26228ba6d","openalex_id":null,"doi":null,"arxiv_id":"2503.19755","raw_key":null,"title":"ORION: A Holistic End-to-End Autonomous Driving Framework by Vision-Language Instructed Action Generation","authors":null,"authors_text":"Haoyu Fu, Diankun Zhang, Zongchuang Zhao, Jianfeng Cui, Dingkang Liang, Chong Zhang","year":2025,"venue":"cs.CV","abstract":"End-to-end (E2E) autonomous driving methods still struggle to make correct decisions in interactive closed-loop evaluation due to limited causal reasoning capability. Current methods attempt to leverage the powerful understanding and reasoning abilities of Vision-Language Models (VLMs) to resolve this dilemma. However, the problem is still open that few VLMs for E2E methods perform well in the closed-loop evaluation due to the gap between the semantic reasoning space and the purely numerical trajectory output in the action space. To tackle this issue, we propose ORION, a holistic E2E autonomous driving framework by vision-language instructed action generation. ORION uniquely combines a QT-Former to aggregate long-term history context, a Large Language Model (LLM) for driving scenario reasoning, and a generative planner for precision trajectory prediction. ORION further aligns the reasoning space and the action space to implement a unified E2E optimization for both visual question-answering (VQA) and planning tasks. \nOur method achieves an impressive closed-loop performance of 77.74 Driving Score (DS) and 54.62% Success Rate (SR) on the challenge Bench2Drive datasets, which outperforms state-of-the-art (SOTA) methods by a large margin of 14.28 DS and 19.61% SR.","external_url":"https://arxiv.org/abs/2503.19755","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-22T12:21:31.302259+00:00","pith_arxiv_id":"2503.19755","created_at":"2026-05-09T06:40:40.067940+00:00","updated_at":"2026-05-22T12:21:31.302259+00:00","title_quality_ok":true,"display_title":"ORION: A Holistic End-to-End Autonomous Driving Framework by Vision-Language Instructed Action Generation","render_title":"ORION: A Holistic End-to-End Autonomous Driving Framework by Vision-Language Instructed Action Generation"},"hub":{"state":{"work_id":"c3fa58f6-0efb-4d29-a673-a2f26228ba6d","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":20,"external_cited_by_count":null,"distinct_field_count":3,"first_pith_cited_at":"2025-06-09T03:14:04+00:00","last_pith_cited_at":"2026-05-18T08:55:32+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-02T13:54:46.490037+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":4},{"context_role":"baseline","n":1},{"context_role":"dataset","n":1}],"polarity_counts":[{"context_polarity":"background","n":4},{"context_polarity":"baseline","n":1},{"context_polarity":"use_dataset","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}