{"total":14,"items":[{"citing_arxiv_id":"2606.27660","ref_index":25,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MVPruner: Dynamic Token Pruning for Accelerating Multi-view Vision-Language Models in Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-06-26T02:33:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MVPruner is a two-stage adaptive token pruning technique for multi-view VLMs that achieves 87.3% FLOPs reduction and 4.97x prefilling speedup while retaining 98.5% accuracy on DriveLM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25136","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Memory Retrieval in Visuomotor Policies for Long-Horizon Robot Control","primary_cat":"cs.RO","submitted_at":"2026-06-23T20:07:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HALO distills VLM priors via question-answering objectives and applies sparse attention to enable reliable memory retrieval from up to eight minutes of history in imitation-learned visuomotor policies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24286","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AVOC: Enhancing Hour-Level Audio-Video Understanding in Omni-Modal LLMs via Retrieval-Inspired Token Compression","primary_cat":"cs.CL","submitted_at":"2026-06-23T08:06:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AVOC is a retrieval-inspired token compression framework that improves long-form audio-video understanding in multimodal LLMs by selecting informative tokens based on classical IR principles.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08641","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learnable Token Sparsification for Efficient Gigapixel Whole Slide Image Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-07T14:07:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Learnable sparsification framework compresses WSI visual tokens to 32 (0.78% of original) via SparseLearn, achieving 73.32% accuracy on SlideBench (TCGA) and outperforming baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31598","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Linear Scaling Video VLMs for Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StateKV is an inference-time technique that replaces quadratic self-attention prefill in video VLMs with a fixed-capacity importance-based recurrent state, keeping accuracy near full attention on long-video benchmarks without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30876","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"dMoE: dLLMs with Learnable Block Experts","primary_cat":"cs.CL","submitted_at":"2026-05-29T06:03:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"dMoE aggregates token expert distributions to block level in dLLMs, cutting unique experts from 69.5 to 14.6, memory by 76-80%, and latency by 1.14-1.66x while retaining 99.11% performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30010","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EarlyTom: Early Token Compression Completes Fast Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:36:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EarlyTom is a training-free early token compression method inside the vision encoder with decoupled spatial selection that reduces TTFT up to 2.65x and FLOPs 61% on LLaVA-OneVision-7B while keeping accuracy comparable to full tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26584","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"O-MARC: Omni Memory-Augmented Compression Distillation for Efficient Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-26T06:07:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"O-MARC is a compression distillation framework that lets compact omnimodal models maintain or exceed full-token performance on video QA while cutting latency and memory by about 35%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25343","ref_index":200,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Toward Native Multimodal Modeling: A Roadmap","primary_cat":"cs.CV","submitted_at":"2026-05-25T01:57:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A roadmap that defines architectural nativity for multimodal models and categorizes them into Multi-to-Text, Multi-to-Target, and Multi-to-Multi types while outlining an industrial pipeline toward unified transformer-based native multimodal modeling.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"5 [199] show that multimodal contexts are already moving toward the million-token regime. Recent work therefore attacks sequence explosion from two complementary directions: reducing the number of multimodal tokens that enter the backbone, and redesigning the backbone or serving system so that very long streams can be processed without exhausting device memory [200]. Visual Resampling and Token Compression.The first line of work compresses visual features before, during, or immediately after visual encoding. Fixed-budget resamplers and pooling modules map dense patch grids into a small number of latent tokens, thereby stabilizing prefill latency regardless of the original image resolution. This idea appears in production-oriented models such as MiniCPM-V 4."},{"citing_arxiv_id":"2605.17837","ref_index":167,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Temporal Aware Pruning for Efficient Diffusion-based Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T04:18:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TAPE applies temporal-aware token pruning with smoothing, reselection, and timestep scheduling to speed up video diffusion models while preserving visual fidelity and coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12056","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniRefine: Alignment-Aware Cooperative Compression for Efficient Omnimodal Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-12T12:42:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OmniRefine introduces alignment-aware chunk refinement via similarity and dynamic programming followed by modality-cooperative token compression, achieving near-baseline accuracy at 44% token retention on WorldSense.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Llava-prumerge: Adaptive token reduction for efficient large multimodal models. InProceedings of the IEEE/CVF International Conference on Computer Vision, pages 22857-22867, 2025. [37] Kele Shao, Keda Tao, Can Qin, Haoxuan You, Yang Sui, and Huan Wang. Holitom: Holistic token merging for fast video large language models.arXiv preprint arXiv:2505.21334, 2025. [38] Kele Shao, Keda Tao, Kejia Zhang, Sicheng Feng, Mu Cai, Yuzhang Shang, Haoxuan You, Can Qin, Yang Sui, and Huan Wang. When tokens talk too much: A survey of multimodal long- context token compression across images, videos, and audios.arXiv preprint arXiv:2507.20198, 2025. [39] Leqi Shen, Guoqiang Gong, Tao He, Yifeng Zhang, Pengzhang Liu, Sicheng Zhao, and"},{"citing_arxiv_id":"2605.11864","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Very Efficient Listwise Multimodal Reranking for Long Documents","primary_cat":"cs.IR","submitted_at":"2026-05-12T09:45:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ZipRerank delivers state-of-the-art multimodal listwise reranking accuracy for long documents at up to 10x lower latency via early interaction and single-pass scoring.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16366","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Fre-Res: Frequency-Residual Video Token Compression for Efficient Video MLLMs","primary_cat":"cs.CV","submitted_at":"2026-05-10T03:06:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Fre-Res compresses video tokens by preserving spatial anchors and representing temporal dynamics with low-frequency residual tokens derived from 1D-DCT on inter-frame residuals, plus a Spatial-Guided Absorber to reinject the information.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.14582","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniZip: Audio-Guided Dynamic Token Compression for Fast Omnimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2025-11-18T15:22:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniZip introduces an audio-guided dynamic token compression framework that achieves 3.42X inference speedup and 1.4X memory reduction for omnimodal LLMs without any training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}