{"total":47,"items":[{"citing_arxiv_id":"2605.19307","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MetaRA: Metamorphic Robustness Assessment for Multimodal Large Language Model-based Visual Question Answering Systems","primary_cat":"cs.CV","submitted_at":"2026-05-19T03:37:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MetaRA applies metamorphic testing to VQA tasks and shows that MLLM models exhibit sensitivity to linguistic perturbations and superficial visual cues not detected by conventional accuracy benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18018","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"See What I Mean: Aligning Vision and Language Representations for Video Fine-grained Object Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-18T08:09:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SWIM aligns cross-attention maps from object nouns to ground-truth masks during training on the new NL-Refer dataset to enable text-only fine-grained video object understanding in MLLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17776","ref_index":20,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CosFly-Track: A Large-Scale Multi-Modal Dataset for UAV Visual Tracking via Multi-Constraint Trajectory Optimization","primary_cat":"cs.RO","submitted_at":"2026-05-18T02:49:58+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CosFlyTrack provides 12,000 expert UAV trajectories with aligned RGB, depth, segmentation, pose, target state, and bilingual instructions to train visual tracking agents, yielding 53-69 point gains in success rate after fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17447","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FastOCR: Dynamic Visual Fixation via KV Cache Pruning for Efficient Document Parsing","primary_cat":"cs.CV","submitted_at":"2026-05-17T13:39:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FastOCR dynamically selects a small subset of visual tokens per decoding step using focal-guided pruning and cross-step reuse, retaining 98% accuracy on Qwen2.5-VL while attending to only 5% of tokens and cutting attention latency by 3x.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15542","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRS-GUI: Dynamic Region Search for Training-Free GUI Grounding","primary_cat":"cs.AI","submitted_at":"2026-05-15T02:27:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DRS-GUI introduces a dynamic region search method with Focus/Shift/Scatter actions and MCTS-based planning that improves GUI grounding accuracy by 14% on ScreenSpot-Pro for both general and GUI-specific MLLMs without any training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09223","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CREST: Curvature-Regulated Event-Centric Sampling for Efficient Long-Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-09T23:47:46+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07825","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Anisotropic Modality Align","primary_cat":"cs.MM","submitted_at":"2026-05-08T14:53:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Modality representations share dominant semantic geometry but have an anisotropic residual gap; AnisoAlign corrects source representations boundedly using target geometry for unpaired alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05045","ref_index":1,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Relations Break: Analyzing Relation Hallucination in Vision-Language Model Under Rotation and Noise","primary_cat":"cs.CV","submitted_at":"2026-05-06T15:41:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Mild rotations and noise significantly increase relation hallucinations in VLMs across models and datasets, with prompt and preprocessing fixes providing only partial relief.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24919","ref_index":18,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic AI for Remote Sensing: Technical Challenges and Research Directions","primary_cat":"cs.CV","submitted_at":"2026-04-27T18:59:49+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[3], Prithvi-2 [106], Copernicus-FM, T erraFM [27]... Datasets:Sentinel-1/2 [30] archives, GeoBench [55], Coperni- cus data [30]... Capability:Transferable representa- tions, Multi-sensor learning Limitations:Predictive inference, No analytical decision process Methods:General:GPT 5 [84], LLaVAOneVision [56], Flamingo [1], Kosmos-2 [86], Qwen 3 [127], InternVL [18]... EO-VLMs:Sky- SenseGPT [73], RemoteCLIP [66], GeoChat [54], EarthDial [102], TEOChat [48]... Datasets:RSVQA [69], EarthDial [102], GeoChat [54], UCM-Captions [72], GeoBench-VLM [26]... Capability:Natural language in- teraction, Query-able geospatial imagery, Multimodal grounding Limitations:No persistent reason- ing, No tool execution Methods:General:T oolOrchestra"},{"citing_arxiv_id":"2604.21786","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Codebooks to VLMs: Evaluating Automated Visual Discourse Analysis for Climate Change on Social Media","primary_cat":"cs.CV","submitted_at":"2026-04-23T15:44:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VLMs recover reliable population-level trends in climate change visual discourse on social media even when per-image accuracy is only moderate.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21409","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"S1-VL: Scientific Multimodal Reasoning Model with Thinking-with-Images","primary_cat":"cs.CV","submitted_at":"2026-04-23T08:23:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"S1-VL combines structured scientific reasoning with iterative image manipulation via code execution to reach state-of-the-art results on visual and scientific reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19844","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"If you're waiting for a sign... that might not be it! Mitigating Trust Boundary Confusion from Visual Injections on Vision-Language Agentic Systems","primary_cat":"cs.CV","submitted_at":"2026-04-21T11:27:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LVLM-based agents exhibit trust boundary confusion with visual injections and a multi-agent defense separating perception from decision-making reduces misleading responses while preserving correct ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12358","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Why and When Visual Token Pruning Fails? A Study on Relevant Visual Information Shift in MLLMs Decoding","primary_cat":"cs.CV","submitted_at":"2026-04-14T06:48:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Visual token pruning in MLLMs fails on complex reasoning due to Relevant Visual Information Shift during decoding, but the DSTP framework fixes it training-free across models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"reasoning tasks, while consistently yielding performance gains even across visual understanding benchmarks. Furthermore,DSTPdemonstrates ef- fectiveness across diverse state-of-the-art architectures, highlighting its generalizability and efficiency with minimal computational overhead. Keywords:MultimodalLLMs·VisualTokenPruning·VisualReasoning 1 Introduction Multimodal Large Language Models (MLLMs) [3,11,29] have demonstrated strong capabilities in bothvisual understanding(e.g., visual question answering (VQA)) andvisual reasoning, including visual-centric math, logical puzzles, and STEM-related tasks.1 However, these models typically rely on a massive number of visual tokens generated by a vision encoder [35,51]. Such a large number of visual tokens incur prohibitive computational and memory overhead"},{"citing_arxiv_id":"2604.09253","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Mosaic: Multimodal Jailbreak against Closed-Source VLMs via Multi-View Ensemble Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-10T12:09:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Mosaic combines text perturbation, multi-view image optimization, and surrogate model ensembles to reduce reliance on any single open-source model and achieve higher attack success rates on commercial closed-source VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07282","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Are Face Embeddings Compatible Across Deep Neural Network Models?","primary_cat":"cs.CV","submitted_at":"2026-04-08T16:44:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Simple affine transformations align face embeddings across different DNN models, substantially improving cross-model identification and verification performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06036","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CodecSight: Leveraging Video Codec Signals for Efficient Streaming VLM Inference","primary_cat":"cs.DC","submitted_at":"2026-04-07T16:31:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CodecSight reuses video codec signals for online patch pruning before the vision transformer and selective KV-cache refresh in the LLM, delivering up to 3x higher throughput and 87% lower GPU compute than prior baselines with 0-8% F1 drop.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"ent VLMs adopt different ViT architectures and tokeniza- tion pipelines, we implement model-specific adaptations for each supported model family. These components use codec- derived motion information to identify redundant visual to- kens and prune them before feature extraction. We implement theKVC ReuserandKVC Refresher by extending LMCache v0.3.9 [13, 61] with 2,500 lines of Python code. Built on LMCache's cache-management prim- itives and chunk-based indexing, our implementation sup- ports selective KVC refresh for sliding-window video infer- ence. We extend its indexing and cache management logic to handle overlapping clips and GOP-aligned anchor selection for KVC refresh. We further add model-specific integration"},{"citing_arxiv_id":"2604.05900","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AICA-Bench: Holistically Examining the Capabilities of VLMs in Affective Image Content Analysis","primary_cat":"cs.CV","submitted_at":"2026-04-07T14:05:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AICA-Bench evaluates 23 VLMs on affective image analysis, identifies weak intensity calibration and shallow descriptions as limitations, and proposes training-free Grounded Affective Tree Prompting to improve performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04579","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Firebolt-VL: Efficient Vision-Language Understanding with Cross-Modality Modulation","primary_cat":"cs.CV","submitted_at":"2026-04-06T10:25:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Firebolt-VL introduces an LFM-based decoder and token-grid correlation to achieve linear-time vision-language inference with improved fine-grained grounding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14168","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SAGE Celer 2.6 Technical Card","primary_cat":"cs.CL","submitted_at":"2026-03-24T09:03:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"SAGE Celer 2.6 is a new line of language models with inverse reasoning training, integrated vision, and strong performance on math, coding, and South Asian language benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.13779","ref_index":77,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AD-Copilot: A Vision-Language Assistant for Industrial Anomaly Detection via Visual In-context Comparison","primary_cat":"cs.CV","submitted_at":"2026-03-14T06:14:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AD-Copilot trains an MLLM on a new curated industrial dataset Chat-AD with a Comparison Encoder that uses cross-attention on image pairs, reaching 82.3% accuracy on MMAD and 3.35x gains on MMAD-BBox while generalizing and exceeding human experts on some tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.01785","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CodeOCR: On the Effectiveness of Vision Language Models in Code Understanding","primary_cat":"cs.CL","submitted_at":"2026-02-02T08:10:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Multimodal LLMs process code as images to achieve up to 8x token compression, with visual cues like syntax highlighting aiding tasks and clone detection remaining resilient or even improving under compression.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"To ensure the generalizability of our findings, we evaluate seven state-of-the-art LLMs with mul- timodal capability spanning both proprietary and open-weight categories. Table 2 summarizes model details and official pricing as of January 30, 2026 [ 66]. The proprietary models include GPT-5-mini and GPT-5.1 [63, 64] from OpenAI, and Gemini-2.5-Pro, Gemini-3-Flash, and Gemini-3- Pro [26, 34, 35] from Google. For open-weight models, we include Qwen-3-VL with 235B parame- ters [11] and GLM-4.6v with 108B parameters [84], enabling reproducible research and architectural analysis. Importantly, these proprietary models have multimodal capability natively integrated, , Vol. 1, No. 1, Article . Publication date: April 2026. 8 Shi et al."},{"citing_arxiv_id":"2512.21815","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"High-Entropy Tokens as Multimodal Failure Points in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2025-12-26T01:01:25+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.05442","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Structured Labeling Enables Faster Vision-Language Models for End-to-End Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2025-06-05T12:59:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Introduces structured NuScenes-S dataset and 0.9B FastDrive VLM claiming 20% higher decision accuracy and over 10x inference speedup versus larger unstructured VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.21472","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mitigating Hallucination in Large Vision-Language Models via Adaptive Attention Calibration","primary_cat":"cs.CV","submitted_at":"2025-05-27T17:45:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CAAC mitigates hallucinations in LVLMs via Visual-Token Calibration and Adaptive Attention Re-Scaling guided by model confidence, showing gains on CHAIR, AMBER, and POPE especially in long-form generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.09925","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FLARE: Fully Integration of Vision-Language Representations for Deep Cross-Modal Understanding","primary_cat":"cs.CV","submitted_at":"2025-04-14T06:33:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FLARE is a vision-language model family using text-guided vision encoding, context-aware alignment decoding, dual-semantic mapping loss, and text-driven VQA synthesis to achieve deep cross-modal integration, outperforming larger models with only 630 vision tokens at 3B scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.13923","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Qwen2.5-VL Technical Report","primary_cat":"cs.CV","submitted_at":"2025-02-19T18:00:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Qwen2.5-VL reports a vision-language model family using native dynamic-resolution ViT and absolute time encoding that matches GPT-4o on document and diagram tasks while supporting hour-long videos with second-level localization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.13106","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding","primary_cat":"cs.CV","submitted_at":"2025-01-22T18:59:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VideoLLaMA3 uses a vision-centric training paradigm and token-reduction design to reach competitive results on image and video benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"From each cluster, we select a fixed number of images. This approach ensures diversity within the dataset while maintaining a balanced distribution of semantic categories, improving the model's ability to generalize across various visual content. 5) Image Re-caption. After filtering and clustering the images, we proceed with detailed re-captioning. Brief captions are generated using InternVL2-8B [31, 53], while the detailed captions are produced with InternVL2- 26B [31, 53]. These two types of captions (VL3-Syn7M-short and VL3-Syn7-detailed) are employed at different stages of training to address varying needs. Through the aforementioned cleaning and re-caption process, we created the VL3-Syn7M dataset, which consists of 7 million image-caption pairs."},{"citing_arxiv_id":"2501.05067","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLaVA-Octopus: Unlocking Instruction-Driven Adaptive Projector Fusion for Video Understanding","primary_cat":"cs.CV","submitted_at":"2025-01-09T08:43:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLaVA-Octopus introduces instruction-driven adaptive fusion of multiple visual projectors in a multimodal LLM to improve video understanding performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.02955","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MotionBench: Benchmarking and Improving Fine-grained Video Motion Understanding for Vision Language Models","primary_cat":"cs.CV","submitted_at":"2025-01-06T11:57:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MotionBench is a new benchmark showing poor fine-grained motion understanding in VLMs and proposes TE Fusion to improve performance with higher frame rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.01957","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction","primary_cat":"cs.CV","submitted_at":"2025-01-03T18:59:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VITA-1.5 integrates vision and speech into a single LLM through multi-stage training, delivering competitive benchmark results on image, video, and speech tasks with near real-time response speed.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.10302","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding","primary_cat":"cs.CV","submitted_at":"2024-12-13T17:37:48+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DeepSeek-VL2 is a series of MoE vision-language models using dynamic tiling and latent attention that reach competitive or state-of-the-art results on VQA, OCR, document understanding and grounding with 1.0B to 4.5B activated parameters.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"10 5 Evaluation 11 5.1 Multimodal Performance . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11 5.2 Qualitative Study . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 12 6 Conclusion 20 2 1. Introduction Large Vision-Language Models (VLMs) have emerged as a transformative force in artificial intelligence [15, 54, 59, 63, 83, 88, 94], extending the remarkable capabilities of Large Language Models (LLMs) to seamlessly process both visual and textual information. This advancement has dramatically expanded the potential for AI systems to tackle complex real-world applications that require multimodal understanding. In this technical report, we present DeepSeek-VL2, a new series of open-source Vision-"},{"citing_arxiv_id":"2411.10442","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization","primary_cat":"cs.CL","submitted_at":"2024-11-15T18:59:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mixed Preference Optimization with the MMPR dataset boosts multimodal CoT reasoning, lifting InternVL2-8B to 67.0 accuracy on MathVista (+8.7 points) and matching the 76B model.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"0 on MathVista [62], outperforming InternVL2-8B by 8.7 points and achieving performance comparable to the 10× larger InternVL2-76B. 2. Related Work Multimodal Large Language Models. With advance- ments in LLMs, significant progress has also been made in 2 MLLMs. To leverage the abilities of pre-trained LLMs [5, 11, 27] and Vision Foundation Models (VFMs) [19, 80], a series of works [20, 46, 47, 54, 57, 99, 102, 103] employ a connector to align their latent space, achieving promising performance at a controllable cost. Besides, another series of works [2, 27, 94, 100] extend pre-trained LLMs with ad- ditional fusion layers for vision features, reducing the num- ber of visual tokens required by LLMs while introducing"},{"citing_arxiv_id":"2410.17434","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding","primary_cat":"cs.CV","submitted_at":"2024-10-22T21:21:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LongVU adaptively compresses long video tokens using DINOv2-based frame deduplication, text-guided cross-modal selection, and temporal spatial reduction to improve video-language understanding in MLLMs with minimal detail loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.04509","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ErrorRadar: Benchmarking Complex Mathematical Reasoning of Multimodal Large Language Models Via Error Detection","primary_cat":"cs.CL","submitted_at":"2024-10-06T14:59:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ErrorRadar is a new benchmark of 2,500 multimodal K-12 math problems for MLLM error step identification and categorization, where GPT-4o trails human experts by ~10%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.16500","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CogVLM2: Visual Language Models for Image and Video Understanding","primary_cat":"cs.CV","submitted_at":"2024-08-29T12:59:12+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CogVLM2 family achieves state-of-the-art results on image and video understanding benchmarks through improved visual expert architecture, higher resolution inputs, and automated temporal grounding for videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.13257","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans?","primary_cat":"cs.CV","submitted_at":"2024-08-23T17:59:51+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MME-RealWorld is the largest manually annotated high-resolution benchmark for MLLMs, where even the best models achieve less than 60% accuracy on challenging real-world tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.03326","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLaVA-OneVision: Easy Visual Task Transfer","primary_cat":"cs.CV","submitted_at":"2024-08-06T17:59:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLaVA-OneVision is the first single open LMM to simultaneously achieve strong performance in single-image, multi-image, and video scenarios with cross-scenario transfer capabilities.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"To validate the performance for single-image tasks in real-world scenories, we consider a comprehen- sive set of image benchmarks in Table 3. It can be categorized into three classes: (1) Chart, Diagram, and Document Understanding. As the main visual formats for structured OCR data, we evaluate the results on AI2D [ 54], ChartQA [101], DocVQA [103], and InfoVQA [102] benchmarks. Though current open-source models such as InternVL [22] and Cambrian [133] achieve performance comparable to commercial models, LLaV A-OneVision goes a step further, surpassing GPT-4V [109] and approaching the performance level of GPT-4o [110]. (2) Perception and Multi-discipline Reasoning. Including visual perception scenarios, we reveal the potentials of our model for more complex and challenging reasoning tasks."},{"citing_arxiv_id":"2407.07726","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PaliGemma: A versatile 3B VLM for transfer","primary_cat":"cs.CV","submitted_at":"2024-07-10T14:57:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"PaliGemma is an open 3B VLM based on SigLIP and Gemma that achieves strong performance on nearly 40 diverse open-world tasks including benchmarks, remote-sensing, and segmentation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Similarly, transferring the 224px checkpoint at 11 PaliGemma: A versatile 3B VLM for transfer resolution 448px (third bar, orange), while im- proving the results significantly, still lags far be- hind transferring the checkpoint whose native resolution is 448px (last bar, green). Thus, in the absence of flexible-resolution mod- eling tricks such as FlexiViT [13] or NaViT [30], we recommend running extended pretraining for increasing resolution (Stage2) and providing sep- arate checkpoints for all supported resolutions. 5.7.3. To resize or to window? Another recently common way of increasing in- putresolutionisby\"windowing\"themodels[ 114, 121, 134], i.e. applying the same model on win- dows of the model's native resolution from the"},{"citing_arxiv_id":"2407.03320","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"InternLM-XComposer-2.5: A Versatile Large Vision Language Model Supporting Long-Contextual Input and Output","primary_cat":"cs.CV","submitted_at":"2024-07-03T17:59:21+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InternLM-XComposer-2.5 is a 7B vision-language model supporting up to 96K context that reaches GPT-4V-level performance on image, video, and multi-turn tasks and adds LoRA-driven text-image composition capabilities.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"used to make the chosen or rejected prediction on the re- maining prompt-response pairs. These selected responses 8 MVBench MLVU MME MMB ∗1 Temp∗2 Doc Chart Info Text OCR WTQ Deep Visual Tab Video Video Compass VQA QA VQA VQA Bench Form MRC Fact Open-Source VideoChat InternVL LIV A InternVL Qwen-VL InternVL InternVL InternVL InternVL GLM-4v DocOwl DocOwl DocOwl DocOwl Previous SOTA 2-7B[71] 1.5-26B[26] 34B[78] 1.5-26B[26] 7B[6]1.5-26B[26] 1.5-26B[26] 1.5-26B[26] 1.5-26B[26] 9B[43] 1.5-8B[50] 1.5-8B[50] 1.5-8B[50] 1.5-8B[50] Performance 60.4 50.4 59.0 42.0 58.4 90.9 83.8 72.5 80.6 77.6 40.6 68.8 246.4 80.2 Closed-source API GPT-4V [112] 43.5 49.2 59.9 56.0 - 88.4 78.5 75.1 78.0 51.6 - - - - Gemini-Pro [142] - - 75.0 49.3 70.6 88.1 74.1 75.2 74.6 68."},{"citing_arxiv_id":"2407.01284","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"We-Math: Does Your Large Multimodal Model Achieve Human-like Mathematical Reasoning?","primary_cat":"cs.AI","submitted_at":"2024-07-01T13:39:08+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WE-MATH benchmark reveals most LMMs rely on rote memorization for visual math while GPT-4o has shifted toward knowledge generalization.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"We examine the performance of foundation models across two distinct cate- gories on WE-M ATH: (a) Closed-source LMMs: GPT-4o [38], GPT-4V [26], Gemini 1.5 Pro [40], Qwen-VL-Max [13], (b) Open-source LMMs: LLaV A-NeXT-110B, LLaV A-NeXT-70B [39], LLaV A- 1.6-13B, LLaV A-1.6-7B [41], DeepSeek-VL-1.3B, DeepSeek-VL-7B [42], Phi3-Vision-4.2B [43], MiniCPM-Llama3-V 2.5 [44], InternLM-XComposer2-VL-7B [45], InternVL-Chat-V1.5 [46], GLM- 4V-9B [47], LongV A [48], G-LLaV A-13B [29]. 3.1 Main Result Table 1 shows the overall performance of different LMMs on One-Step / Two-Step / Three-Step problems and different problem domains. We have the following observations: The Nums of Knowledge Concepts are negatively correlated with LMMs' Performance.Regard- ing problems of varying complexities (one-step vs."},{"citing_arxiv_id":"2406.16852","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Long Context Transfer from Language to Vision","primary_cat":"cs.CV","submitted_at":"2024-06-24T17:58:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Extending language model context length enables LMMs to process over 200K visual tokens from long videos without video training, achieving SOTA on Video-MME via dense frame sampling.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"long context transfer, we trained LLaV A-Next-Qwen2, a baseline model based on Qwen2-7B-Instruct using the LLaV A-NeXT [46, 35] training recipe. Additionally, we trained LongV A ( AnyRes) to showcase the advantages of our UniRes encoding scheme. The difference between LongV A and our baselines can be found in Table 5. 6 Model LLM Params FramesShort Medium Long Overall InternVL-Chat-V1.5 [14] 20B 010 60.2 46.4 45.6 50.7 LLaV A-NeXT-Video-34B [89]34B 032 61.7 50.1 44.3 52.0 VILA-1.5 [43] 34B 008 68.1 58.1 50.8 59.0 Qwen-VL-Chat [68] 07B 004 46.9 38.7 37.8 41.1 Video-LLaV A [42] 07B 008 45.3 38.0 36.2 39.9 ST-LLM [49] 07B 064 45.7 36.8 31.3 37.9 VideoChat2-Mistral [39] 07B 016 48.3 37.0 33.2 39.5 Chat-UniVi-V1.5 [30] 07B 064 45.7 40.3 35."},{"citing_arxiv_id":"2406.09411","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MuirBench: A Comprehensive Benchmark for Robust Multi-image Understanding","primary_cat":"cs.CV","submitted_at":"2024-06-13T17:59:52+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MuirBench is a new benchmark showing that top multimodal LLMs struggle with robust multi-image understanding, with GPT-4o at 68% and open-source models below 33% accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.16821","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites","primary_cat":"cs.CV","submitted_at":"2024-04-25T17:59:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"InternVL 1.5 narrows the performance gap to proprietary multimodal models via a stronger transferable vision encoder, dynamic high-resolution tiling, and curated English-Chinese training data.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"monly employ a 300 million parameter vision foundation model (VFM), which is integrated with either a 7 billion or 13 billion LLMs. (2) Image Resolution : Proprietary commercial models typically employ a dynamic resolution approach, preserving the original aspect ratio to facilitate detailed scene and document understanding. In contrast, open-source models generally train with fixed resolutions [18, 23, 62, 71, 117, 142], such as 336 ×336 and 448×448, leading to a considerable gap in capabilities relative to com- mercial counterparts. (3) Multilingual Capability: Propri- etary models often leverage extensive multilingual datasets for training, enhancing their performance across diverse languages. However, open-source models predominantly utilize English data, relying on the zero-shot capabilities of"},{"citing_arxiv_id":"2403.20330","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","primary_cat":"cs.CV","submitted_at":"2024-03-29T17:59:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Current LVLM benchmarks overestimate capabilities because many questions can be answered without images due to design flaws or data leakage; MMStar is a human-curated set of 1,500 vision-indispensable samples across 6 capabilities and 18 axes with new metrics for leakage and true multi-modal gain.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"6 52.5 36.7 25.0 35.7CogVLM-Chat[45] (Vicuna-v1.5-7B[8]) 17B LVLM 34.2 63.4 66.3 63.3 68.7 34.7 55.1 LLM 37.1 10.5 53.6 57.3 37.3 21.7 36.3 LVLM-text 37.3 23.2 68.6 59.9 41.0 22.7 42.1Yi-VL[49] (Yi-34B[49]) 34B LVLM 43.2 71.5 75.3 65.9 68.1 25.6 58.3 LLM 37.6 20.1 69.4 60.2 35.0 17.9 40.0 LVLM-text 41.7 23.9 70.3 65.0 40.5 24.0 44.2InternVL-Chat-v1.2[6] (NH2-Yi-34B[33]) 40B LVLM 49.1 82.4 82.5 78.5 75.4 47.7 69.3 LLM 25.7 8.6 57.2 48.7 13.5 23.4 29.5 LVLM-text 43.6 20.5 68.4 61.1 39.9 28.4 43.7Sphinx-X-MoE[15] (Mixtral-8x7B[19]) 57B LVLM 44.8 69.2 72.2 65.0 71.1 38.1 60.1 Second issue: unintentional data leaking exists in LLM and LVLM training. Although the community has the trend towards developing new multi-modal benchmarks to assess LVLMs' ca-"},{"citing_arxiv_id":"2402.11684","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ALLaVA: Harnessing GPT4V-Synthesized Data for Lite Vision-Language Models","primary_cat":"cs.CL","submitted_at":"2024-02-18T19:26:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ALLaVA creates 1.3M GPT4V-synthesized samples enabling 4B VLMs to achieve competitive results on 17 benchmarks and match 7B/13B models on some tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.00253","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Survey on Hallucination in Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2024-02-01T00:33:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"This survey reviews the definition, symptoms, evaluation benchmarks, root causes, and mitigation methods for hallucinations in large vision-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.16502","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI","primary_cat":"cs.CL","submitted_at":"2023-11-27T17:33:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MMMU provides 11.5K heterogeneous college-level multimodal questions that current models solve at 56-59% accuracy, establishing a new standard for expert multimodal evaluation.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"6 33.6 45.3 30.2 InstructBLIP-T5-XXL [16] 35.7 33.8 48.5 30.6 27.6 33.6 49.8 29.4 BLIP-2 FLAN-T5-XXL [35] 35.4 34.0 49.2 28.6 27.3 33.7 51.5 30.4 InternLM-XComposer2-VL* [17] 43.0 38.2 56.8 32.8 30.1 39.8 60.7 31.8 Yi-VL-34B* [84] 45.9 41.6 56.1 33.3 32.9 45.9 66.5 36.0 LLaV A-1.6-34B* [46] 51.1 44.7 58.6 39.9 36.0 51.2 70.2 36.3 InternVL-Chat-V1.2* [11] 51.6 46.2 62.5 37.6 37.9 49.7 70.1 40.8 VILA1.5* [39] 51.9 46.9 62.1 40.6 37.7 51.7 74.0 39.5 Qwen-VL-MAX* [65] 51.4 46.8 64.2 39.8 36.3 52.5 70.4 40.7 SenseChat-Vision-0423-Preview* [68] 54.6 50.3 62.7 44.1 42.3 55.7 74.7 43.5 GPT-4V(ision) (Playground) [60] 56.8 55.7 65.3 64.3 48.4 63.5 76.3 41.7 Claude 3 Opus* [72] 59.4 - - - - - - - Gemini 1.5 Pro* [23] 62."}],"limit":50,"offset":0}