{"work":{"id":"6dc43db8-227d-438e-8658-0c8acecba08a","openalex_id":null,"doi":null,"arxiv_id":"2303.11381","raw_key":null,"title":"MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action","authors":null,"authors_text":"Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Ehsan Azarnasab, Faisal Ahmed","year":2023,"venue":"cs.CV","abstract":"We propose MM-REACT, a system paradigm that integrates ChatGPT with a pool of vision experts to achieve multimodal reasoning and action. In this paper, we define and explore a comprehensive list of advanced vision tasks that are intriguing to solve, but may exceed the capabilities of existing vision and vision-language models. To achieve such advanced visual intelligence, MM-REACT introduces a textual prompt design that can represent text descriptions, textualized spatial coordinates, and aligned file names for dense visual signals such as images and videos. MM-REACT's prompt design allows language models to accept, associate, and process multimodal information, thereby facilitating the synergetic combination of ChatGPT and various vision experts. Zero-shot experiments demonstrate MM-REACT's effectiveness in addressing the specified capabilities of interests and its wide application in different scenarios that require advanced visual understanding. Furthermore, we discuss and compare MM-REACT's system paradigm with an alternative approach that extends language models for multimodal scenarios through joint finetuning. \nCode, demo, video, and visualization are available at https://multimodal-react.github.io/","external_url":"https://arxiv.org/abs/2303.11381","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T07:20:28.818635+00:00","pith_arxiv_id":"2303.11381","created_at":"2026-05-08T17:08:34.351861+00:00","updated_at":"2026-06-05T21:23:00.469572+00:00","title_quality_ok":true,"display_title":"MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action","render_title":"MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action"},"hub":{"state":{"work_id":"6dc43db8-227d-438e-8658-0c8acecba08a","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":41,"external_cited_by_count":null,"distinct_field_count":6,"first_pith_cited_at":"2023-04-11T17:41:13+00:00","last_pith_cited_at":"2026-05-16T01:37:10+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-06T04:10:01.090255+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":14},{"context_role":"baseline","n":1},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"background","n":14},{"context_polarity":"baseline","n":1},{"context_polarity":"use_method","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}