{"work":{"id":"a650e873-dbbf-4963-af68-3896dcd4975b","openalex_id":null,"doi":null,"arxiv_id":"2411.10440","raw_key":null,"title":"LLaVA-CoT: Let Vision Language Models Reason Step-by-Step","authors":null,"authors_text":"Guowei Xu, Peng Jin, Ziang Wu, Hao Li, Yibing Song, Lichao Sun","year":2024,"venue":"cs.CV","abstract":"Large language models have demonstrated substantial advancements in reasoning capabilities. However, current Vision-Language Models (VLMs) often struggle to perform systematic and structured reasoning, especially when handling complex visual question-answering tasks. In this work, we introduce LLaVA-CoT, a large VLM designed to conduct autonomous multistage reasoning. Unlike chain-of-thought prompting, LLaVA-CoT independently engages in sequential stages of summarization, visual interpretation, logical reasoning, and conclusion generation. This structured approach enables LLaVA-CoT to achieve marked improvements on reasoning-intensive tasks. To accomplish this, we construct the LLaVA-CoT-100k dataset, integrating samples from various visual question answering sources and providing structured reasoning annotations. Besides, we propose a test-time stage-wise retracing search method (SWIRES), which enables effective and efficient test-time scaling. Remarkably, with only 100k training samples and test-time scaling, LLaVA-CoT not only outperforms its base model by 9.4% on a wide range of multimodal reasoning benchmarks, but also surpasses the performance of larger and even closed-source models, such as Gemini-1.5-pro, GPT-4o-mini, and Llama-3.2-90B-Vision-Instruct. \nThe code, dataset, and pre-trained weights are publicly available at https://github.com/PKU-YuanGroup/LLaVA-CoT.","external_url":"https://arxiv.org/abs/2411.10440","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-23T04:47:33.580821+00:00","pith_arxiv_id":"2411.10440","created_at":"2026-05-09T05:45:21.972299+00:00","updated_at":"2026-05-23T04:47:33.580821+00:00","title_quality_ok":true,"display_title":"LLaVA-CoT: Let Vision Language Models Reason Step-by-Step","render_title":"LLaVA-CoT: Let Vision Language Models Reason Step-by-Step"},"hub":{"state":{"work_id":"a650e873-dbbf-4963-af68-3896dcd4975b","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":39,"external_cited_by_count":null,"distinct_field_count":6,"first_pith_cited_at":"2024-12-25T15:12:34+00:00","last_pith_cited_at":"2026-05-16T15:16:00+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-02T18:54:56.010971+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":9},{"context_role":"baseline","n":1}],"polarity_counts":[{"context_polarity":"background","n":9},{"context_polarity":"baseline","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}