{"work":{"id":"d9e035c7-9e23-4cc2-ad3e-be080fbbf2d9","openalex_id":null,"doi":null,"arxiv_id":"2312.14238","raw_key":null,"title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","authors":null,"authors_text":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing","year":2023,"venue":"cs.CV","abstract":"The exponential growth of large language models (LLMs) has opened up numerous possibilities for multimodal AGI systems. However, the progress in vision and vision-language foundation models, which are also critical elements of multi-modal AGI, has not kept pace with LLMs. In this work, we design a large-scale vision-language foundation model (InternVL), which scales up the vision foundation model to 6 billion parameters and progressively aligns it with the LLM, using web-scale image-text data from various sources. This model can be broadly applied to and achieve state-of-the-art performance on 32 generic visual-linguistic benchmarks including visual perception tasks such as image-level or pixel-level recognition, vision-language tasks such as zero-shot image/video classification, zero-shot image/video-text retrieval, and link with LLMs to create multi-modal dialogue systems. It has powerful visual capabilities and can be a good alternative to the ViT-22B. We hope that our research could contribute to the development of multi-modal large models. \nCode and models are available at https://github.com/OpenGVLab/InternVL.","external_url":"https://arxiv.org/abs/2312.14238","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-17T01:09:30.455995+00:00","pith_arxiv_id":"2312.14238","created_at":"2026-05-09T22:34:07.578705+00:00","updated_at":"2026-05-17T01:09:30.455995+00:00","title_quality_ok":true,"display_title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","render_title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks"},"hub":{"state":{"work_id":"d9e035c7-9e23-4cc2-ad3e-be080fbbf2d9","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":32,"external_cited_by_count":null,"distinct_field_count":5,"first_pith_cited_at":"2023-11-27T17:33:21+00:00","last_pith_cited_at":"2026-05-09T23:47:46+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-17T06:09:25.628587+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":5},{"context_role":"baseline","n":3}],"polarity_counts":[{"context_polarity":"background","n":5},{"context_polarity":"baseline","n":3}],"runs":{},"summary":{},"graph":{},"authors":[]}}