{"total":88,"items":[{"citing_arxiv_id":"2605.22158","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ST-SimDiff: Balancing Spatiotemporal Similarity and Difference for Efficient Video Understanding with MLLMs","primary_cat":"cs.AI","submitted_at":"2026-05-21T08:27:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ST-SimDiff is a training-free method using a spatio-temporal graph and dual similarity-difference selection to compress video tokens for MLLMs while retaining static and dynamic content.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22078","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Visual Token Representations for Video Large Language Models via Training-Free Spatial-Temporal Pooling and Gridding","primary_cat":"cs.AI","submitted_at":"2026-05-21T07:16:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ST-GridPool improves video LLM performance via hierarchical temporal gridding and norm-based spatial pooling on visual tokens without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19846","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FineBench: Benchmarking and Enhancing Vision-Language Models for Fine-grained Human Activity Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-19T13:40:26+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19726","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient Long-Context Modeling in Diffusion Language Models via Block Approximate Sparse Attention","primary_cat":"cs.CV","submitted_at":"2026-05-19T12:01:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BA-Att introduces pre-downsampled block selection with norm-sorting and diagonal covariance correction to approximate sparse attention, yielding up to 6.95x speedup at 50% sparsity across language, multimodal, and video models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18018","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"See What I Mean: Aligning Vision and Language Representations for Video Fine-grained Object Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-18T08:09:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SWIM aligns cross-attention maps from object nouns to ground-truth masks during training on the new NL-Refer dataset to enable text-only fine-grained video object understanding in MLLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17921","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Efficient Streaming Video Understanding Framework with Agentic Control","primary_cat":"cs.CV","submitted_at":"2026-05-18T06:29:44+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17065","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PyraVid: Hierarchical Multimodal Memory for Long-Horizon Video Reasoning","primary_cat":"cs.MA","submitted_at":"2026-05-16T16:15:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PyraVid is a hierarchical multimodal memory system that structures long videos into pyramids to improve long-horizon reasoning and evidence aggregation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16740","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRACE: Evidence Grounding-Guided Multi-Video Event Understanding and Claim Generation","primary_cat":"cs.CV","submitted_at":"2026-05-16T01:37:10+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15342","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Minerva-Ego: Spatiotemporal Hints for Egocentric Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-14T19:12:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Minerva-Ego is a new benchmark for egocentric visual reasoning with dense human-annotated traces and masks, showing that spatiotemporal hints substantially improve frontier model performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14310","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoRDS: Coreset-based Representative and Diverse Selection for Streaming Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-14T03:22:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CoRDS selects a compact KV-cache subset via joint-space coreset coverage and log-det diversity to outperform token-wise heuristics on long-video VLM benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13803","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoGround: Self-Evolving Video Agents for Video Temporal Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-13T17:25:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A proposer-solver agent pair achieves supervised-level video temporal grounding and fine-grained captioning from 2.5K unlabeled videos via self-reinforcing evolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12034","ref_index":52,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Boosting Omni-Modal Language Models: Staged Post-Training with Visually Debiased Evaluation","primary_cat":"cs.MM","submitted_at":"2026-05-12T12:16:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Visual debiasing of omni-modal benchmarks combined with staged post-training lets a 3B model match or exceed a 30B model without a stronger teacher.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"InFindings of the Association for Computational Linguistics: EMNLP 2021, pages 3296-3315, 2021. URLhttps://aclanthology.org/2021.findings-emnlp.281/. [51] Dingdong Wang, Junan Li, Jincenzi Wu, Dongchao Yang, Xueyuan Chen, Tianhua Zhang, and Helen Meng. Mmsu: A massive multi-task spoken language understanding and reasoning benchmark, 2025. URLhttps://arxiv.org/abs/2506.04779. [52] Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. Can a suit of armor conduct electricity? a new dataset for open book question answering, 2018. URLhttps://arxiv.org/abs/1809.02789. [53] Jeffrey Zhou, Tianjian Lu, Swaroop Mishra, Siddhartha Brahma, Sujoy Basu, Yi Luan, Denny Zhou, and Le Hou. Instruction- following evaluation for large language models, 2023."},{"citing_arxiv_id":"2605.10936","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Personal Visual Context Learning in Large Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-05-11T17:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces Personal VCL formalization and benchmark revealing LMM context gaps, plus an Agentic Context Bank baseline that boosts personalized visual reasoning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"address inference-time reasoning over complex, dynamic personal context that unfolds across visual observations. Second, while visual in-context learning [ 41, 24, 17, 85, 79, 38, 19] shares our inference-time setting, its use of visual demonstrations serves to elicit task formatting from pre- trained knowledge, not to supply novel, private knowledge about a particular user. Finally, long- form [23, 76, 46, 84] and egocentric [29, 21, 37, 14, 50, 78, 10] video question answering (VQA) targetcontext acquisition: how to effectively search a long visual history to identify query-relevant evidence. Personal VCL targets the orthogonal subsequent stage: assuming relevant context has been mined from history, it investigates how to best leverage this visual context for personalized queries."},{"citing_arxiv_id":"2605.10966","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MMTB: Evaluating Terminal Agents on Multimedia-File Tasks","primary_cat":"cs.MM","submitted_at":"2026-05-08T10:57:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MMTB is a new benchmark with 105 multimedia terminal tasks that shows how audio and video access changes agent performance and evidence use in executable workflows.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"However, many real-world workflows require practitioners to work directly with multimedia files such as audio and video recordings. For instance, users may need to prepare media for broadcast or social platforms [24], provide feedback on music or acting performances [3], process meetings or compliance-sensitive recordings [ 15], or annotate audio- visual data for research [12]. Supporting such workflows requires terminal agents to move beyond multimedia understanding alone. They must ground decisions in auditory and visual evidence across files and execute the corresponding actions in a terminal environment. However, existing benchmarks lack multimedia-file tasks designed to evaluate terminal agents [16, 20, 21]. To this end, we introduce MultiMedia-TerminalBench (MMTB), a benchmark centered on multimedia-"},{"citing_arxiv_id":"2605.07355","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TTF: Temporal Token Fusion for Efficient Video-Language Model","primary_cat":"cs.CV","submitted_at":"2026-05-08T07:08:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TTF fuses temporally redundant visual tokens via local similarity search in a plug-and-play way, cutting ~67% tokens on Qwen3-VL-8B while retaining 99.5% accuracy with minimal overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03351","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLMaxxing through FrameMogging Training-Free Anti-Recomputation for Video Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-05T04:13:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Training-free adaptive reuse of stable visual state in video VLMs reduces follow-up latency by 15-36x on Qwen2.5-VL while preserving correctness on VideoMME, with smaller first-query speedups via pruning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Arman Cohan. TOMATO: Assessing Visual Temporal Reasoning Capabilities in Multimodal Foundation Models. ICLR 2025 poster; arXiv:2410.23266. https://openreview.net/forum? id=fCi4o83Mfs. [38] Daniel Bolya, Cheng-Yang Fu, Xiaoliang Dai, Peizhao Zhang, Christoph Feichtenhofer, and Judy Hoﬀman. Token Merging: Your ViT But Faster . ICLR 2023; arXiv:2210.09461. [39] Chaoyou Fu, Yuhan Dai, Yongdong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, Peixian Chen, Yanwei Li, Shaohui Lin, Sirui Zhao, Ke Li, Tong Xu, Xiawu Zheng, Enhong Chen, Caifeng Shan, Ran He, and Xing Sun. Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis."},{"citing_arxiv_id":"2605.03276","ref_index":10,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VEBench:Benchmarking Large Multimodal Models for Real-World Video Editing","primary_cat":"cs.CV","submitted_at":"2026-05-05T02:05:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VEBENCH is the first benchmark with 3.9K videos and 3,080 human-verified QA pairs that measures LMMs on video editing technique recognition and operation simulation, revealing a large gap to human performance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[8] Michael Frierson.Film and Video Editing Theory. Rout- ledge, 2018. 2, 10 [9] Chaoyou Fu, Yuhan Dai, Yondong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, et al. Video-mme: The first-ever compre- hensive evaluation benchmark of multi-modal llms in video analysis.arXiv preprint arXiv:2405.21075, 2024. 2, 3, 6 [10] Chaoyou Fu, Haojia Lin, Xiong Wang, Yi-Fan Zhang, Yun- hang Shen, Xiaoyu Liu, Yangze Li, Zuwei Long, Het- ing Gao, Ke Li, et al. Vita-1.5: Towards gpt-4o level real-time vision and speech interaction.arXiv preprint arXiv:2501.01957, 2025. 17, 18 [11] Xin Gu, Libo Zhang, Fan Chen, Longyin Wen, Yufei Wang, Tiejian Luo, and Sijie Zhu. Edit3k: Universal representa-"},{"citing_arxiv_id":"2605.01662","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Video Active Perception: Effective Inference-Time Long-Form Video Understanding with Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-03T01:30:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VAP is a training-free active-perception method that improves zero-shot long-form video QA performance and frame efficiency up to 5.6x in VLMs by selecting keyframes that differ from priors generated by a text-conditioned video model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25186","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FCMBench-Video: Benchmarking Document Video Intelligence","primary_cat":"cs.CV","submitted_at":"2026-04-28T03:45:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FCMBench-Video is a new benchmark with 1,200 videos and 11k QA instances for evaluating Video-MLLMs on document video understanding across 28 document types.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20473","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Video-ToC: Video Tree-of-Cue Reasoning","primary_cat":"cs.CV","submitted_at":"2026-04-22T12:02:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video-ToC adds tree-guided cue localization, demand-based RL rewards, and automated datasets to video LLMs, reporting better results than prior methods on six understanding benchmarks plus a hallucination test.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"for drawing lines on a shirt is the center......Oh, I see. The point B is indeed on the center of the shirt......</think>\\n<answer>A</answer> Question: Where is the point B when drawing imaginary lines? A. On the center of the shirt. B. On the bottom of the shirt. C. On the shoulder. Fig. 1: Reasoning strategy comparison between Video-R1 and our Video-ToC. TempCompass [13], VideoMME [14], and VideoHallucer [15], demonstrating its clear advantage. To summarize, we make the following contributions: • We present Video-ToC, which is a novel video reasoning framework that introduces a tree-guided visual cue local- ization mechanism and a reasoning-demand-based reward strategy. This approach endows the model with enhanced fine- grained perceptual capabilities through structured reasoning"},{"citing_arxiv_id":"2604.16893","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"EasyVideoR1: Easier RL for Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-18T07:56:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"EasyVideoR1 delivers an optimized RL pipeline for video understanding in large vision-language models, achieving 1.47x throughput gains and aligned results on 22 benchmarks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"feeds data continuously, asynchronous queuing removes batch-boundary stalls, and chunked prefill prevents any single long sequence from monopolizing compute. Taking LVBench as an example, our pipeline achieves approximately6∼7×speedup over vanilla inference frameworks. 7 Table 2Video understanding benchmarks supported by the evaluation framework. Benchmark Task Type Number Metric General Video Understanding Video-MME [11] Multiple Choice 2,700 Accuracy Video-MME-v2 [12] Multiple Choice 3,200 Accuracy MVBench [19] Multiple Choice 3,586 Accuracy TempCompass [23] Multiple Choice 7,540 Accuracy MotionBench [14] Multiple Choice 3,715 Accuracy Long Video Understanding LVBench [37] Multiple Choice 1,492 Accuracy LongVideoBench [39] Multiple Choice 1,337 Accuracy MLVU [58] Multiple Choice 502 Accuracy"},{"citing_arxiv_id":"2604.15804","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Qwen3.5-Omni Technical Report","primary_cat":"cs.CL","submitted_at":"2026-04-17T08:05:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Qwen3.5-Omni scales an omnimodal model to hundreds of billions of parameters with 256k context, introduces ARIA for stable speech synthesis, and reports SOTA performance on 215 audio-visual benchmarks while adding multilingual and audio-visual coding capabilities.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"IEEE, 2021. Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. Visual instruction tuning.arXiv:2304.08485, 2023. Yuliang Liu, Zhang Li, Mingxin Huang, Biao Yang, Wenwen Yu, Chunyuan Li, Xu-Cheng Yin, Cheng-Lin Liu, Lianwen Jin, and Xiang Bai. Ocrbench: on the hidden mystery of ocr in large multimodal models. Science China Information Sciences, 67(12), December 2024. ISSN 1869-1919. doi: 10.1007/s11432-024-423 5-6. URLhttp://dx.doi.org/10.1007/s11432-024-4235-6. 19 Pan Lu, Hritik Bansal, Tony Xia, Jiacheng Liu, Chunyuan Li, Hannaneh Hajishirzi, Hao Cheng, Kai-Wei Chang, Michel Galley, and Jianfeng Gao. Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts. InICLR, 2024."},{"citing_arxiv_id":"2604.14149","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"One Token per Highly Selective Frame: Towards Extreme Compression for Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-15T17:59:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"XComp reaches extreme video compression (one token per selective frame) via learnable progressive token compression and question-conditioned frame selection, lifting LVBench accuracy from 42.9 percent to 46.2 percent after tuning on 2.5 percent of standard data.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"into short segments, mitigating attention bias. Local attention highlights key frames relevant to the question and correct answer, enabling effective filtering of irrelevant content. 4.2 Main Results on Long Video Understanding Benchmarks.We assess our model on four widely used long video understanding benchmarks: LongVideoBench [64], MLVU [80], LVBench [59], and VideoMME [18] (w/o subtitles). Unlike short-video datasets, they feature extended video durations ranging from several minutes to over an hour, posing greater temporal reasoning and memory challenges. These benchmarks adopt a question-answering format, primarily using accuracy as the evaluation metric. Comparison.We choose the baselines with similar model size of VideoChat-Flash-2B [ 35] and"},{"citing_arxiv_id":"2604.16502","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Topology-Aware Layer Pruning for Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-14T14:36:53+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11177","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do Thought Streams Matter? Evaluating Reasoning in Gemini Vision-Language Models for Video Scene Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-13T08:40:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Quality gains from extra thinking in Gemini models for video understanding plateau after the first few hundred tokens, Flash Lite balances quality and cost best, and tight reasoning budgets lead to compression-step hallucination where final outputs include un-reasoned content.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10060","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Mosaic: Cross-Modal Clustering for Efficient Video Understanding","primary_cat":"cs.PF","submitted_at":"2026-04-11T06:54:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Mosaic uses cross-modal clusters as the unit for KVCache organization in VLMs to achieve up to 1.38x speedup in streaming long-video understanding.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"dently, incurring no additional communication overhead. This scalability highlights the potential of MOSAICfor deployment in larger-scale real-world streaming applications. IX. RELATEDWORK Long Video Understanding.Existing VLMs, such as Video- LLaV A [10] and LLaV A-OneVision [17], typically support uniform frame sampling and token compression. To support longer contexts, prior work [24], [25], [26], [27], [28] has explored adaptive keyframe sampling, dynamic token merg- ing, and memory compression. In addition, recent methods such as ViTL [29] introduce interleaved reasoning for span grounding. However, these approaches primarily follow an offline paradigm, assuming that the entire video is available before a query is issued. As a result, they rely on static, offline"},{"citing_arxiv_id":"2604.08457","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CrashSight: A Phase-Aware, Infrastructure-Centric Video Benchmark for Traffic Crash Scene Understanding and Reasoning","primary_cat":"cs.CV","submitted_at":"2026-04-09T16:52:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CrashSight is a new infrastructure-focused benchmark showing that state-of-the-art vision-language models can describe crash scenes but fail at temporal and causal reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08077","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AdaSpark: Adaptive Sparsity for Efficient Long-Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-04-09T10:48:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AdaSpark delivers up to 57% FLOP reduction in Video-LLMs for long videos through adaptive cube- and token-level sparsity without apparent loss in performance on hour-scale benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07634","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VSAS-Bench: Real-Time Evaluation of Visual Streaming Assistant Models","primary_cat":"cs.CV","submitted_at":"2026-04-08T22:31:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VSAS-Bench offers temporally dense annotations and synchronous/asynchronous protocols to evaluate streaming VLMs on timeliness, consistency, accuracy, and latency trade-offs, showing that adapted conventional VLMs can outperform specialized streaming models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06132","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Claw-Eval: Towards Trustworthy Evaluation of Autonomous Agents","primary_cat":"cs.AI","submitted_at":"2026-04-07T17:43:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Claw-Eval is a new trajectory-aware benchmark for LLM agents that records execution traces, audit logs, and environment snapshots to evaluate completion, safety, and robustness across 300 tasks, revealing that opaque grading misses 44% of safety issues.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"https://deepmind.google/models/gemini/flash/, 2025. [7] G. DeepMind. Gemini 3.1 pro.https://deepmind.google/models/gemini/pro/, 2026. [8] S. Ding, X. Dai, L. Xing, S. Ding, Z. Liu, J. Yang, P. Yang, Z. Zhang, X. Wei, Y . Ma, H. Duan, J. Shao, J. Wang, D. Lin, K. Chen, and Y . Zang. Wildclawbench. https://github.com/InternLM/WildClawBench, 2026. GitHub repository. [9] C. Fu, Y . Dai, Y . Luo, L. Li, S. Ren, R. Zhang, Z. Wang, C. Zhou, Y . Shen, M. Zhang, et al. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis.ArXiv preprint, abs/2405.21075, 2024. [10] C. E. Jimenez, J. Yang, A. Wettig, S. Yao, K. Pei, O. Press, and K. R. Narasimhan. SWE- bench: Can language models resolve real-world github issues?"},{"citing_arxiv_id":"2604.05079","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SVAgent: Storyline-Guided Long Video Understanding via Cross-Modal Multi-Agent Collaboration","primary_cat":"cs.CV","submitted_at":"2026-04-06T18:30:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SVAgent improves long video question answering by constructing storylines via multi-agent collaboration and aligning cross-modal predictions for more robust, human-like reasoning.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Baselines and Models.We evaluate SV Agent on available Video MLLMs, primarily Qwen2.5-VL [2] and Qwen3- VL [36], spanning model sizes from 3B to 8B. We fur- ther compare against strong long-video baselines, including Videomind[24],Video-RAG[26], andVideoAgent[40]. Benchmarks.We evaluate SV Agent across four represen- tative long-video benchmarks: •Video-MME[9] is a widely used evaluation suite that spans 11 seconds to 1 hour and assesses detailed real- world long-video comprehension. •MLVU[62] contains videos ranging from 3 minutes to 2 hours (avg 12 min) and focuses on multi-stage events and long-range temporal dependencies. •LongVideoBench[42] targets referred and multi-hop reasoning, where questions require analysing long frame"},{"citing_arxiv_id":"2603.27259","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seeing the Scene Matters: Revealing Forgetting in Video Understanding Models with a Scene-Aware Long-Video Benchmark","primary_cat":"cs.CV","submitted_at":"2026-03-28T12:44:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SceneBench shows VLMs lose accuracy on scene-level questions in long videos due to forgetting, and Scene-RAG retrieval improves performance by 2.5%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.22779","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TrajTok: Learning Trajectory Tokens enables better Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-02-26T09:15:34+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.02276","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Kimi K2.5: Visual Agentic Intelligence","primary_cat":"cs.CL","submitted_at":"2026-02-02T16:17:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kimi K2.5 combines joint text-vision training with an Agent Swarm parallel orchestration framework to reach claimed state-of-the-art results on coding, vision, reasoning, and agent tasks while cutting latency up to 4.5 times.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"•Agentic Capabilities: BrowseComp [68], WideSearch [69],DeepSearchQA [60], FinSearchComp (T2&T3) [26], Seal-0 [45], GDPVal [43]. •Image Understanding:(math & reasoning)MMMU-Pro [75], MMMU (val) [76], CharXiv (RQ) [67], Math- Vision [61] and MathVista (mini) [36];(vision knowledge)SimpleVQA [13] and WorldVQA 2;(perception) ZeroBench (w/ and w/o tools) [48], BabyVision [12], BLINK [18] and MMVP [57];(OCR & document)OCR- Bench [35], OmniDocBench 1.5 [42] and InfoVQA [38]. •Video Understanding: VideoMMMU [25], MMVU [79], MotionBench [24], Video-MME [17] (with subtitles), LongVideoBench [70], and LVBench [62]. •Computer Use: OSWorld-Verified [72, 73], and WebArena [80]. BaselinesWe benchmark against state-of-the-art proprietary and open-source models."},{"citing_arxiv_id":"2601.15724","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VideoThinker: Building Agentic VideoLLMs with LLM-Guided Tool Reasoning","primary_cat":"cs.CV","submitted_at":"2026-01-22T07:47:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VideoThinker uses LLM-generated synthetic tool trajectories in caption space grounded to video frames to train agentic VideoLLMs that outperform baselines on long-video benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.14724","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HERMES: KV Cache as Hierarchical Memory for Efficient Streaming Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-01-21T07:26:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HERMES organizes the KV cache into a hierarchical memory to enable real-time streaming video understanding in MLLMs, achieving 10x faster TTFT and up to 11.4% accuracy gains on streaming benchmarks with 68% fewer tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.21334","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Streaming Video Instruction Tuning","primary_cat":"cs.CV","submitted_at":"2025-12-24T18:59:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Streamo is a streaming video LLM trained end-to-end on the new Streamo-Instruct-465K dataset that unifies multiple real-time video tasks with claimed strong temporal reasoning and generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.09608","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StreamingVLM: Real-Time Understanding for Infinite Video Streams","primary_cat":"cs.CV","submitted_at":"2025-10-10T17:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StreamingVLM enables stable real-time understanding of infinite video streams at up to 8 FPS using a streaming KV cache and aligned SFT on overlapped chunks, with a 66.18% win rate over GPT-4O mini on a new two-hour video benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.24943","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Perceive, Verify and Understand Long Video: Multi-Granular Perception and Active Verification via Interactive Agents","primary_cat":"cs.CV","submitted_at":"2025-09-29T15:42:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CogniGPT uses an interactive loop between a Multi-Granular Perception Agent and an Active Verification Agent to identify reliable clues in long videos with high accuracy and low frame usage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.17765","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Qwen3-Omni Technical Report","primary_cat":"cs.CL","submitted_at":"2025-09-22T13:26:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Qwen3-Omni is a unified multimodal model that achieves open-source SOTA on 32 of 36 audio and audio-visual benchmarks and overall SOTA on 22 without degrading performance on text, image, or video relative to single-modal Qwen counterparts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.02949","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProMQA-Assembly: Multimodal Procedural QA Dataset on Assembly","primary_cat":"cs.CL","submitted_at":"2025-09-03T02:26:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ProMQA-Assembly is a new multimodal procedural QA dataset with 646 pairs on assembly activities, built via LLM-generated candidates verified by humans plus 81 task graphs, and used to benchmark multimodal models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.18265","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","primary_cat":"cs.CV","submitted_at":"2025-08-25T17:58:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InternVL3.5 advances open-source multimodal models with Cascade RL for +16% reasoning gains and ViR for 4x inference speedup, with the 241B model reaching SOTA among open-source MLLMs on multimodal, reasoning, and agentic tasks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"5 and leading MLLMs in general capabilities. Hatched bars represent closed-source commercial models. We report average scores on a set of multimodal general, reasoning, text, and agentic benchmarks: MMBench v1.1 (en) [ 71], MMStar [11], BLINK [36], HallusionBench [41], AI2D [55], OCRBench [72], MMVet [168], MME-RealWorld (en) [178], MVBench [63], VideoMME [35], MMMU [170], MathVista [76], MathVision [134], MathVerse [175], DynaMath [189], WeMath [100], Log- icVista [153], MATH500 [45], AIME24 [84], AIME25 [85], GPQA [106], MMLU-Pro [146], GAOKAO [177], IFEval [185], SGP-Bench [102], VSI-Bench [161], ERQA [121], SpaCE-10 [38], and OmniSpatial [50]. community. Furthermore, the growth of multimodal capabilities, e."},{"citing_arxiv_id":"2508.10016","ref_index":65,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training-Free Multimodal Large Language Model Orchestration","primary_cat":"cs.CL","submitted_at":"2025-08-06T16:17:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM Orchestration integrates modality experts via an LLM controller, cross-modal memory, and interaction layer to enable multimodal input-output without gradient-based training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.21420","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReGATE: Learning Faster and Better with Fewer Tokens in MLLMs","primary_cat":"cs.CV","submitted_at":"2025-07-29T01:07:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReGATE introduces a teacher-student adaptive token elision method that reduces training tokens to 38% while matching or exceeding baseline accuracy on multimodal benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.04736","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ChipSeek: Optimizing Verilog Generation via EDA-Integrated Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2025-07-07T08:08:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ChipSeek is a hierarchical-reward reinforcement learning framework with Curriculum-Guided Dynamic Policy Optimization that integrates EDA simulator feedback to improve LLM-generated RTL code on both functional correctness and PPA metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.04590","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLM2Vec-V2: Advancing Multimodal Embedding for Videos, Images, and Visual Documents","primary_cat":"cs.CV","submitted_at":"2025-07-07T00:51:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VLM2Vec-V2 is a multimodal embedding model trained on an extended MMEB-V2 benchmark that adds video and visual document tasks and reports gains on both new and prior image benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.01006","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2025-07-01T17:55:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GLM-4.5V reaches state-of-the-art results on 42 multimodal benchmarks among open-source models of similar size by applying reinforcement learning with curriculum sampling to a strong vision foundation model.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"MMLongBench-Doc [34]; •Visual Grounding: RefCOCO-avg (val) [25], TreeBench [54], Ref-L4 [6]; •GUI Agents: OSWorld [62], Android World [38], WebV oyager Some [18], Webquest-QA [56]; •Coding: Design2Code [43], Flame-React-Eval [1]; • Spatial Reco & Reasoning: OminiSpatial [24], CV-Bench [51], ERQA [49], All-Angles Bench [65]; • Video Understanding: VideoMME [ 10], MMVU [ 71], VideoMMMU [ 22], LVBench [ 58], MotionBench [19], MVBench [28]; Setting.We mostly use vLLM 1 as the backend for model inference. For faster and more stable inference, we use SGLang 2 for video inference. The maximum output length for each model response is set to 8,192 tokens. For visual input configuration, we set the maximum expected length for image"},{"citing_arxiv_id":"2506.05425","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SIV-Bench: A Video Benchmark for Social Interaction Understanding and Reasoning","primary_cat":"cs.CV","submitted_at":"2025-06-05T05:51:35+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SIV-Bench is a new video benchmark with 2,792 clips and 5,455 QA pairs that evaluates MLLMs on social scene understanding, state reasoning, and dynamics prediction using social relation theory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.23617","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One Trajectory, One Token: Grounded Video Tokenization via Panoptic Sub-object Trajectory","primary_cat":"cs.CV","submitted_at":"2025-05-29T16:25:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TrajViT tokenizes videos via panoptic sub-object trajectories, achieving 10x token reduction and outperforming ViT3D by 6% on retrieval and 5.2% on VideoQA tasks with faster training and inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.21374","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Video-Holmes: Can MLLM Think Like Holmes for Complex Video Reasoning?","primary_cat":"cs.CV","submitted_at":"2025-05-27T16:05:01+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Video-Holmes benchmark shows top MLLMs achieve at most 45% accuracy on tasks needing integration of multiple clues from suspense films, unlike existing perception-focused tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}