{"work":{"id":"4cd1db02-57ee-4017-9bf2-b9df24e0f9a9","openalex_id":null,"doi":null,"arxiv_id":"2408.16500","raw_key":null,"title":"CogVLM2: Visual Language Models for Image and Video Understanding","authors":null,"authors_text":"Wenyi Hong, Weihan Wang, Ming Ding, Wenmeng Yu, Qingsong Lv, Yan Wang","year":2024,"venue":"cs.CV","abstract":"Beginning with VisualGLM and CogVLM, we are continuously exploring VLMs in pursuit of enhanced vision-language fusion, efficient higher-resolution architecture, and broader modalities and applications. Here we propose the CogVLM2 family, a new generation of visual language models for image and video understanding including CogVLM2, CogVLM2-Video and GLM-4V. As an image understanding model, CogVLM2 inherits the visual expert architecture with improved training recipes in both pre-training and post-training stages, supporting input resolution up to $1344 \\times 1344$ pixels. As a video understanding model, CogVLM2-Video integrates multi-frame input with timestamps and proposes automated temporal grounding data construction. Notably, CogVLM2 family has achieved state-of-the-art results on benchmarks like MMBench, MM-Vet, TextVQA, MVBench and VCGBench. All models are open-sourced in https://github.com/THUDM/CogVLM2 and https://github.com/THUDM/GLM-4, contributing to the advancement of the field.","external_url":"https://arxiv.org/abs/2408.16500","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-23T19:43:23.731164+00:00","pith_arxiv_id":"2408.16500","created_at":"2026-05-09T06:50:40.217773+00:00","updated_at":"2026-05-23T19:43:23.731164+00:00","title_quality_ok":true,"display_title":"CogVLM2: Visual Language Models for Image and Video Understanding","render_title":"CogVLM2: Visual Language Models for Image and Video Understanding"},"hub":{"state":{"work_id":"4cd1db02-57ee-4017-9bf2-b9df24e0f9a9","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":32,"external_cited_by_count":null,"distinct_field_count":5,"first_pith_cited_at":"2024-06-12T09:36:52+00:00","last_pith_cited_at":"2026-05-20T15:04:56+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-01T20:13:44.976224+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":6}],"polarity_counts":[{"context_polarity":"background","n":5},{"context_polarity":"unclear","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}