{"work":{"id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","openalex_id":null,"doi":null,"arxiv_id":"2511.21631","raw_key":null,"title":"Qwen3-VL Technical Report","authors":null,"authors_text":"Shuai Bai, Yuxuan Cai, Ruizhe Chen, Keqin Chen, Xionghui Chen, Zesen Cheng, Lianghao Deng, Wei Ding, Chang Gao, Chunjiang Ge, et al","year":2025,"venue":"cs.CV","abstract":"We introduce Qwen3-VL, the most capable vision-language model in the Qwen series to date, achieving superior performance across a broad range of multimodal benchmarks. It natively supports interleaved contexts of up to 256K tokens, seamlessly integrating text, images, and video. The model family includes both dense (2B/4B/8B/32B) and mixture-of-experts (30B-A3B/235B-A22B) variants to accommodate diverse latency-quality trade-offs. Qwen3-VL delivers three core pillars: (i) markedly stronger pure-text understanding, surpassing comparable text-only backbones in several cases; (ii) robust long-context comprehension with a native 256K-token window for both text and interleaved multimodal inputs, enabling faithful retention, retrieval, and cross-referencing across long documents and videos; and (iii) advanced multimodal reasoning across single-image, multi-image, and video tasks, demonstrating leading performance on comprehensive evaluations such as MMMU and visual-math benchmarks (e.g., MathVista and MathVision). Architecturally, we introduce three key upgrades: (i) an enhanced interleaved-MRoPE for stronger spatial-temporal modeling across images and video; (ii) DeepStack integration, which effectively leverages multi-level ViT features to tighten vision-language alignment; and (iii) text-based time alignment for video, evolving from T-RoPE to explicit textual timestamp alignment for more precise temporal grounding. Under comparable token budgets and latency constraints, Qwen3-VL achieves superior performance in both dense and Mixture-of-Experts (MoE) architectures. We envision Qwen3-VL serving as a foundational engine for image-grounded reasoning, agentic decision-making, and multimodal code intelligence in real-world workflows.","external_url":"https://arxiv.org/abs/2511.21631","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-07-02T23:17:29.130530+00:00","pith_arxiv_id":"2511.21631","created_at":"2026-05-09T00:19:26.118281+00:00","updated_at":"2026-07-02T23:17:29.130530+00:00","title_quality_ok":false,"display_title":"Qwen3-VL Technical Report","render_title":"Qwen3-VL Technical Report"},"hub":{"state":{"work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","tier":"mega_hub","tier_reason":"1,000+ Pith inbound or 100,000+ external citations","pith_inbound_count":1024,"external_cited_by_count":null,"distinct_field_count":24,"first_pith_cited_at":"2025-03-12T08:33:46+00:00","last_pith_cited_at":"2026-07-01T17:24:26+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"needed","recognition_status":"needed","updated_at":"2026-07-02T23:22:59.122770+00:00","tier_text":"mega_hub"},"tier":"mega_hub","role_counts":[{"context_role":"background","n":121},{"context_role":"method","n":61},{"context_role":"baseline","n":50},{"context_role":"dataset","n":5},{"context_role":"other","n":4}],"polarity_counts":[{"context_polarity":"background","n":114},{"context_polarity":"use_method","n":61},{"context_polarity":"baseline","n":50},{"context_polarity":"unclear","n":10},{"context_polarity":"use_dataset","n":5},{"context_polarity":"support","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Qwen3-VL Technical Report","claims":[{"claim_text":"We introduce Qwen3-VL, the most capable vision-language model in the Qwen series to date, achieving superior performance across a broad range of multimodal benchmarks. It natively supports interleaved contexts of up to 256K tokens, seamlessly integrating text, images, and video. The model family includes both dense (2B/4B/8B/32B) and mixture-of-experts (30B-A3B/235B-A22B) variants to accommodate diverse latency-quality trade-offs. Qwen3-VL delivers three core pillars: (i) markedly stronger pure-text understanding, surpassing comparable text-only backbones in several cases; (ii) robust long-con","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Qwen3-VL Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:23:27.084014+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"2bc8cfa9-c2ce-48ef-87d4-e0de52ae9a92","orcid":null,"display_name":"Shuai Bai"},{"id":"a9b20716-8d96-433c-b3de-f324e0981bb1","orcid":null,"display_name":"Yuxuan Cai"},{"id":"2247732d-0c81-4a01-84ed-9bc6822610db","orcid":null,"display_name":"Ruizhe Chen"},{"id":"dd80061e-c973-48f3-af57-2764aa5433c7","orcid":null,"display_name":"Keqin Chen"},{"id":"a8165845-32cb-4e8c-b0b8-b1e52143e47d","orcid":null,"display_name":"Xionghui Chen"},{"id":"700e57de-112e-4ec6-b97c-f91ec5db4b65","orcid":null,"display_name":"Zesen Cheng"}]},"error":null,"updated_at":"2026-05-13T18:23:27.082140+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T18:13:33.628566+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":100},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":90},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":72},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":62},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":62},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":56},{"title":"InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models","work_id":"fe8637aa-12bc-4434-8d36-9f57b5eebcbe","shared_citers":54},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":51},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":48},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":47},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":45},{"title":"LLaVA-OneVision: Easy Visual Task Transfer","work_id":"f5f2452b-f2a9-49ac-b38d-c76e18cdfe49","shared_citers":44},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":44},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":40},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":31},{"title":"Qwen-Image Technical Report","work_id":"d06d7ecc-7579-4f89-a60b-4278a0f3c562","shared_citers":27},{"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","shared_citers":26},{"title":"SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features","work_id":"50eec732-2d41-432f-9dcf-ac7fff235ea5","shared_citers":25},{"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","shared_citers":24},{"title":"GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning","work_id":"366607ba-e4ea-4726-98c3-63356e32351c","shared_citers":24},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":23},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":22},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":22},{"title":"Kimi-VL Technical Report","work_id":"c876520f-8a20-44f3-b92a-bf7d35bd430f","shared_citers":22}],"time_series":[{"n":394,"year":2026}]},"error":null,"updated_at":"2026-05-13T17:25:55.904421+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T18:13:32.960754+00:00"},"reader_index":{"job_type":"reader_index","status":"succeeded","result":{"note":"annotated reader requires full-text/OA fetch; shell is wired for mega hubs","status":"reader queued"},"error":null,"updated_at":"2026-07-02T16:53:08.307556+00:00"},"recognition_alignment":{"job_type":"recognition_alignment","status":"succeeded","result":{"modules":["IndisputableMonolith.Information.Compression","IndisputableMonolith.Physics.DarkMatterCrossSectionBandScoreCard","IndisputableMonolith.Physics.StandardModelGroupStructure","IndisputableMonolith.Physics.StandardModelLagrangianStructure","IndisputableMonolith.Materials.RoomTSuperconductorCandidate","IndisputableMonolith.Foundation.AlexanderDualityProof","IndisputableMonolith.Linguistics.PhonemeInventoryBandFromRS","IndisputableMonolith.Unification.BlackHoleBandwidth"],"query_chars":1781},"error":null,"updated_at":"2026-07-02T16:53:08.111302+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Qwen3-VL Technical Report","claims":[{"claim_text":"We introduce Qwen3-VL, the most capable vision-language model in the Qwen series to date, achieving superior performance across a broad range of multimodal benchmarks. It natively supports interleaved contexts of up to 256K tokens, seamlessly integrating text, images, and video. The model family includes both dense (2B/4B/8B/32B) and mixture-of-experts (30B-A3B/235B-A22B) variants to accommodate diverse latency-quality trade-offs. Qwen3-VL delivers three core pillars: (i) markedly stronger pure-text understanding, surpassing comparable text-only backbones in several cases; (ii) robust long-con","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Qwen3-VL Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T18:23:26.664852+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Qwen3-VL Technical Report","claims":[{"claim_text":"We introduce Qwen3-VL, the most capable vision-language model in the Qwen series to date, achieving superior performance across a broad range of multimodal benchmarks. It natively supports interleaved contexts of up to 256K tokens, seamlessly integrating text, images, and video. The model family includes both dense (2B/4B/8B/32B) and mixture-of-experts (30B-A3B/235B-A22B) variants to accommodate diverse latency-quality trade-offs. Qwen3-VL delivers three core pillars: (i) markedly stronger pure-text understanding, surpassing comparable text-only backbones in several cases; (ii) robust long-con","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Qwen3-VL Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T17:25:52.720024+00:00"}},"summary":{"title":"Qwen3-VL Technical Report","claims":[{"claim_text":"We introduce Qwen3-VL, the most capable vision-language model in the Qwen series to date, achieving superior performance across a broad range of multimodal benchmarks. It natively supports interleaved contexts of up to 256K tokens, seamlessly integrating text, images, and video. The model family includes both dense (2B/4B/8B/32B) and mixture-of-experts (30B-A3B/235B-A22B) variants to accommodate diverse latency-quality trade-offs. Qwen3-VL delivers three core pillars: (i) markedly stronger pure-text understanding, surpassing comparable text-only backbones in several cases; (ii) robust long-con","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Qwen3-VL Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":100},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":90},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":72},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":62},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":62},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":56},{"title":"InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models","work_id":"fe8637aa-12bc-4434-8d36-9f57b5eebcbe","shared_citers":54},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":51},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":48},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":47},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":45},{"title":"LLaVA-OneVision: Easy Visual Task Transfer","work_id":"f5f2452b-f2a9-49ac-b38d-c76e18cdfe49","shared_citers":44},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":44},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":40},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":31},{"title":"Qwen-Image Technical Report","work_id":"d06d7ecc-7579-4f89-a60b-4278a0f3c562","shared_citers":27},{"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","shared_citers":26},{"title":"SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features","work_id":"50eec732-2d41-432f-9dcf-ac7fff235ea5","shared_citers":25},{"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","shared_citers":24},{"title":"GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning","work_id":"366607ba-e4ea-4726-98c3-63356e32351c","shared_citers":24},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":23},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":22},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":22},{"title":"Kimi-VL Technical Report","work_id":"c876520f-8a20-44f3-b92a-bf7d35bd430f","shared_citers":22}],"time_series":[{"n":394,"year":2026}]},"authors":[{"id":"dd80061e-c973-48f3-af57-2764aa5433c7","orcid":null,"display_name":"Keqin Chen","source":"manual","import_confidence":0.72},{"id":"2247732d-0c81-4a01-84ed-9bc6822610db","orcid":null,"display_name":"Ruizhe Chen","source":"manual","import_confidence":0.72},{"id":"2bc8cfa9-c2ce-48ef-87d4-e0de52ae9a92","orcid":null,"display_name":"Shuai Bai","source":"manual","import_confidence":0.72},{"id":"a8165845-32cb-4e8c-b0b8-b1e52143e47d","orcid":null,"display_name":"Xionghui Chen","source":"manual","import_confidence":0.72},{"id":"a9b20716-8d96-433c-b3de-f324e0981bb1","orcid":null,"display_name":"Yuxuan Cai","source":"manual","import_confidence":0.72},{"id":"700e57de-112e-4ec6-b97c-f91ec5db4b65","orcid":null,"display_name":"Zesen Cheng","source":"manual","import_confidence":0.72}]},"citers":{"total":1024,"items":[{"citing_arxiv_id":"2607.01191","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Perceive-to-Reason: Decoupling Perception and Reasoning for Fine-Grained Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-07-01T17:24:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"P2R decouples perception from reasoning in VLMs via a two-stage process and PRA-GRPO alternating RL training, reporting gains such as 93.2% on V-Star for the 4B model over its Qwen3-VL backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01117","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MoHallBench: A Benchmark for Motion Hallucination in Video Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-07-01T16:04:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MoHallBench is a new benchmark evaluating motion hallucination in VideoLLMs from co-occurrence priors, sequential inference, and similarity confusion, revealing decoupling from action recognition performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01086","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LongVQUBench: Benchmarking Long-Term Video Quality Understanding of Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-07-01T15:40:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LongVQUBench introduces a hierarchical benchmark with local, cross-event, and global quality understanding tasks plus needle distortion QA to measure LVLMs' long-term video quality reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.01050","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoSearcher: Anchor-Guided Progressive Reasoning for Remote Sensing Visual Grounding with Process Supervision","primary_cat":"cs.CV","submitted_at":"2026-07-01T15:12:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoSearcher introduces anchor-centric reasoning supervised fine-tuning and process-faithful group relative policy optimization to improve MLLM-based remote sensing visual grounding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00983","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QCA: Query- and Content-Aware Keyframe Selection for Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-07-01T14:19:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"QCA selects compact, query-relevant keyframes from long videos via segment-wise budget allocation and diversity-aware addition, achieving higher accuracy than GPT-4o on LongVideoBench with half the frames.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00920","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GMO-E$^2$DIT: Grounded Multi-Operation Editing for E-Commerce Images","primary_cat":"cs.CV","submitted_at":"2026-07-01T13:23:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GMO-E²DIT is an agentic framework that decouples VLM-based edit planning from mask-conditioned rendering using reflection loops for reliable multi-operation e-commerce image editing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00881","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniView-Space: Reinforcing Spatial Reasoning via Multi-Perspective Spatial Mapping","primary_cat":"cs.CV","submitted_at":"2026-07-01T12:45:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OmniView-Space framework with MPSM, tool-guided reasoning, and distillation achieves SOTA on spatial reasoning benchmarks for MLLMs while reducing external geometry dependencies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00578","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Caption Bottleneck Models","primary_cat":"cs.CV","submitted_at":"2026-07-01T08:00:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Caption Bottleneck Models use LMM-generated image captions as the sole input to a text classifier, creating leakage-free interpretable models that discover dataset-specific concepts without predefined lists or manual labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00491","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MindEdit-Bench: Benchmarking Object-Level Counterfactual Spatial Reasoning in VLMs from In-the-Wild Photos","primary_cat":"cs.CV","submitted_at":"2026-07-01T06:19:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MindEdit-Bench introduces six spatial reasoning tasks from 120 private indoor photo triplets, with two new counterfactual editing tasks where VLMs score 8-31% against 81-97% human accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00465","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StochasT: Learning with Stochastic Turn Depth for Visual Instruction Tuning","primary_cat":"cs.CV","submitted_at":"2026-07-01T05:34:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StochasT uses stochastic clustering of language tasks into varying turn depths for the same image to improve LVLMs on both single-turn and multi-turn scenarios without discarding data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00457","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-scale Mixture of World Models for Embodied Agents in Evolving Environments","primary_cat":"cs.AI","submitted_at":"2026-07-01T05:23:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MuSix introduces scale-aware world model mixtures with experiential-distance routing and adaptive forgetting to improve multi-scale reasoning and dynamic adaptation in embodied agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00446","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VideoSearch-R1: Iterative Video Retrieval and Reasoning via Soft Query Refinement","primary_cat":"cs.CV","submitted_at":"2026-07-01T04:59:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VideoSearch-R1 achieves SOTA on VCMR across three datasets via iterative retrieval, latent-space soft query refinement, and GRPO training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00357","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Personalized Object Identification and Localization via In-Context Inference with Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-07-01T02:56:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IPLoc-ID extends prior localization-only work to full identification and localization by using a self-posed query in VLMs to reject negative images while preserving comparable localization accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00293","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rosetta: Composable Native Multimodal Pretraining","primary_cat":"cs.CV","submitted_at":"2026-07-01T00:42:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Rosetta proposes a composable multimodal pretraining method with MAOP to prevent catastrophic forgetting when expanding modalities beyond standard MoE and MoT approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00218","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EgoSafetyBench: A Diagnostic Egocentric Video Benchmark for Evaluating Embodied VLMs as Runtime Safety Guards","primary_cat":"cs.CV","submitted_at":"2026-06-30T21:50:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EgoSafetyBench shows VLMs reliably spot hazard-containing videos but miss specific contextual hazards and are degraded by misleading in-scene text.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00125","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decompose, Compare, and Decide: Multimodal LLMs are Implicit Few-Shot Learners","primary_cat":"cs.CV","submitted_at":"2026-06-30T20:00:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeCoDe decomposes few-shot classification into binary pairwise image comparisons whose affirmative logits serve as similarity scores, enabling strong performance from unmodified MLLMs on twelve datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00115","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PixelEyes: Decoupling Perception and Reasoning for Pinpoint Visual Evidence Seeking","primary_cat":"cs.CV","submitted_at":"2026-06-30T19:51:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PixelEyes decouples reasoning and perception via mask-guided search and semantic BFS, introduces PixelEyes-6K dataset and Pinpoint-Bench benchmark, and open-sources code and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32018","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Automated Background Swapping for Robustness against Spurious Backgrounds","primary_cat":"cs.CV","submitted_at":"2026-06-30T17:50:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AutoBackSwap uses foreground-background disentanglement via a secondary network plus background infilling to augment training data and reduce spurious background correlations in image classifiers, outperforming priors even without any counterexamples in the data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.32012","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoMet: Context and Multiplicity Decomposition for Multimodal Uncertainty Estimation","primary_cat":"cs.LG","submitted_at":"2026-06-30T17:46:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoMet decomposes MLLM uncertainty into context-specific and multiplicity-specific terms estimated by a trained post-hoc module, improving performance on open-ended multimodal benchmarks and hallucination detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31986","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoLT: Teaching Multi-Modal Models to Think with Chain of Latent Thoughts","primary_cat":"cs.CV","submitted_at":"2026-06-30T17:24:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoLT replaces text-based chain-of-thought in MLLMs with 3-step latent thought chains supervised by a removable external decoder in forward and backward modes, yielding 10.1x faster inference on eight benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31903","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attend, Transform, or Silence: Operator-Level Visual Skipping for Efficient Multimodal LLM Inference","primary_cat":"cs.CV","submitted_at":"2026-06-30T16:08:29+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper proposes an operator-level visual-token skipping framework for MLLMs that reduces TFLOPs by 33.7% on Qwen3-VL while retaining 99.5% performance across VQA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31875","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SENSE-VAD: Sentient and Semantic Video Anomaly Detection for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-06-30T15:57:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SENSE-VAD introduces the first synthetic benchmark dataset with per-frame labels for socially complex anomalies in autonomous driving scenes and shows existing video anomaly detectors fail on them.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31876","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Harnessing Textual Refusal Directions for Multimodal Safety","primary_cat":"cs.AI","submitted_at":"2026-06-30T15:57:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Textual refusal directions generalize across modalities in MLLMs, enabling the training-free MARS method that corrects misalignment and improves safety while preserving utility.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31719","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seeing Is Not Sharing: Some Vision-Language Models Overestimate Common Ground in Asymmetric Dialogue","primary_cat":"cs.CL","submitted_at":"2026-06-30T14:22:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vision-language models overestimate common ground in asymmetric dialogues by treating map content as evidence of mutual understanding rather than tracking how grounding unfolds through interaction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31693","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ShopX: A Foundation Model for Intent-to-Item Fulfillment in Agentic Shopping","primary_cat":"cs.IR","submitted_at":"2026-06-30T14:05:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ShopX is a single foundation model combining intent understanding, planning, and SID-native item fulfillment for agentic shopping, with claimed improvements over tool-mediated systems on Taobao logs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31645","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Technical Report of RoboSpatial Challenge at CVPR 2026: Selective Reasoning Activation and Reference-Frame Disambiguation for Embodied Spatial Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-30T13:25:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"RoboSpatialBrain applies forced <think> prefix activation and reference-frame redirection to RoboBrain2.5-8B-NV, achieving first place with 80.9% success on RoboSpatial-Home.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31612","ref_index":118,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Memory Do GUI Agents Really Need? From Passive Records to Active Task-Driving States","primary_cat":"cs.CV","submitted_at":"2026-06-30T13:01:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes ATMem as active task-driving state memory and STR-GRPO RL to improve GUI agent reliability on long-horizon mobile tasks over passive record storage.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31599","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Token-Sparse Medical Multimodal Reasoning via Dual-Stream Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2026-06-30T12:47:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViToS uses dual-stream RL with cross-feedback optimization to prune medical image tokens to 77% length while reporting 108.27% and 104.16% relative performance on two 7B VLMs across seven benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31504","ref_index":76,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SimpleSearch-VL: A Simple Recipe for Multimodal Agentic Deep Search","primary_cat":"cs.CV","submitted_at":"2026-06-30T11:22:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SimpleSearch-VL improves Qwen3-VL multimodal agent baselines by 15.8-16 points on average using 7K total training examples and reaches parity with Gemini-3-Pro on the 30B variant.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31471","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Think While You Map: Asynchronous Vision-Language Agents for Incremental 3D Scene Graphs","primary_cat":"cs.CV","submitted_at":"2026-06-30T10:49:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An asynchronous architecture decouples incremental voxel-based mapping from VLM-based semantic enrichment to produce queryable open-vocabulary 3D scene graphs that match or exceed prior methods on segmentation and grounding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31467","ref_index":93,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AeroVerse-SatAgent: UAV-Satellite Collaborative Spatial Reasoning Inspired by the Dual Visual Pathway Theory of Cognitive Neuroscience","primary_cat":"cs.CV","submitted_at":"2026-06-30T10:46:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SatAgent is a UAV-satellite collaborative spatial reasoning model using geometric 3D encoding, multi-view alignment, and a new 130K dataset that reports 25.91% and 11.69% gains over general and specialized baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31410","ref_index":5,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Xiaomi-GUI-0 Technical Report","primary_cat":"cs.AI","submitted_at":"2026-06-30T09:36:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Xiaomi-GUI-0 reports 72.0% success on RealMobile and 78.9% on AndroidWorld via real-device closed-loop training with multi-source data and three-stage RL pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31407","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Visual Semantic Entropy: Do Vision Language Models Recognize Visual Ambiguity?","primary_cat":"cs.CV","submitted_at":"2026-06-30T09:35:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VSE perturbs images only to probe visual ambiguity in VLMs, clusters outputs into semantic prototypes, and computes mass-weighted dispersion, outperforming prior entropy methods on five VQA benchmarks across five models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31388","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One Video, One World: Turning Monocular Video into Physical 4D Scenes","primary_cat":"cs.CV","submitted_at":"2026-06-30T09:16:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"OVOW reconstructs instance-level, simulation-ready 4D mesh scenes from monocular video via a four-stage training-free pipeline and introduces a new benchmark for structured Video-to-4D evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31329","ref_index":2,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"3D HAMSTER: Bridging Planning and Control in Hierarchical Vision Language Action Models through 3D Trajectory Guidance","primary_cat":"cs.RO","submitted_at":"2026-06-30T08:31:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"3D HAMSTER adds depth encoding and reconstruction to VLMs to produce 3D waypoint sequences that feed directly into pointcloud policies, claiming better generalization than 2D baselines under shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31292","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AtomiMed: Hierarchical Atomic Fact-Checking for Universal Clinical-Aware Medical Report Evaluation","primary_cat":"cs.CE","submitted_at":"2026-06-30T08:07:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AtomiMed is a new modality-agnostic evaluation framework for medical report generation that decomposes reports into hierarchical atomic clinical facts and applies agentic cross-verification to achieve higher correlation with radiologist judgments than n-gram metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2607.00060","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Synergistic Perception-Reasoning Governance: Grounding Medical MLLMs with Verifiable Anatomical Evidence","primary_cat":"cs.CV","submitted_at":"2026-06-30T08:07:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A dual-side evidence-injection method using ROI-guided modulation and semantic token mapping improves medical MLLM close-ended accuracy by up to 6% and cuts open-ended hallucinations by 35% across 5 datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31257","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decodable Is Not Grounded: A Vision-Ablation Arbiter for VLM Spatial Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-30T07:33:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"A blank-image ablation test reveals that high probe accuracy on VLM spatial reasoning frequently reflects priors or inverted signs rather than image grounding, with horizontal grounded, vertical prior, and depth inverted.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31200","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic RAG-VLM: Affordance-Aware Retrieval-Augmented Generation with Self-Reflective Planning for Robotic Grasping","primary_cat":"cs.AI","submitted_at":"2026-06-30T06:30:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Agentic RAG-VLM achieves 78.3% success on a 12-task grasping benchmark with 360 trials per configuration, a 53.3 percentage-point gain over VLM-only baselines, via hierarchical affordance RAG, scene graph constraints, and a 14-type failure taxonomy with adaptive retry.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30638","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Open-Vocabulary and Referring Segmentation for 3D Gaussians Using 2D Detectors","primary_cat":"cs.CV","submitted_at":"2026-06-29T17:58:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GaussDet enables open-vocabulary and referring segmentation in 3D Gaussians by learning instance features and aggregating votes from 2D detectors, improving referential grounding by 16.7% mIoU in zero-shot setting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30632","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GROW$^2$: Grounding Which and Where for Robot Tool Use","primary_cat":"cs.RO","submitted_at":"2026-06-29T17:56:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GROW² hierarchically grounds open-world tool affordances by using VLMs for semantic selection of objects and parts followed by geometric localization with vision foundation models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30626","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DOPD: Dual On-policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-06-29T17:55:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DOPD is an advantage-aware dual distillation method that dynamically assigns token supervision from either privileged teacher or student to transfer capability while mitigating non-replicable information asymmetry in on-policy distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30599","ref_index":3,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Goku: A Million-Scale Universal Dataset and Benchmark for Instruction-Based Video Editing","primary_cat":"cs.CV","submitted_at":"2026-06-29T17:38:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Goku provides a 2M-pair dataset for multi-task structural video editing, Goku-Edit model with MLLM and dual-branch design, and Goku-Bench yielding up to 8% gains in instruction following.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30577","ref_index":7,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"APRIL-MedSeg: A Modular Medical Image Segmentation Toolbox Embracing Modern Paradigms","primary_cat":"cs.CV","submitted_at":"2026-06-29T17:20:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Presents APRIL-MedSeg, a modular YAML-configurable toolbox for 2D medical image segmentation integrating semi-supervised, domain adaptation, distillation, weakly supervised, text-guided, and foundation model paradigms with unified dataset and deployment interfaces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30552","ref_index":5,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training Vision-Language-Action Models with Dense Embodied Chain-of-Thought Supervision","primary_cat":"cs.RO","submitted_at":"2026-06-29T16:48:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZR-0 is a dual-stream VLA model trained with dense ECoT supervision on 60M frames from 400K trajectories to enable cross-embodiment transfer in simulation and real-world settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30498","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Faithfulness of Post-Hoc Concept Bottleneck Models","primary_cat":"cs.CV","submitted_at":"2026-06-29T16:02:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Post-hoc CBMs produce unfaithful concept projections due to covariate shifts and systematic label noise; new metrics are introduced to measure faithfulness separately from accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30378","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniCoT: A Benchmark for Global and Multi-Step Panoramic Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-29T14:38:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniCoT is a new panoramic reasoning benchmark with 6.7K eval, 1K real, and 14.3K training examples plus a two-stage SFT+GRPO training method to enforce global 360-degree consistency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30367","ref_index":79,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FutureNav: Unified World-Action Modeling for Vision-and-Language Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-29T14:33:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FutureNav proposes a 4B-scale VLM that jointly optimizes action prediction, inverse/forward dynamics, and future state generation for VLN and reports SOTA results on multiple benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30339","ref_index":102,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"REAR: Test-time Preference Realignment through Reward Decomposition","primary_cat":"cs.CL","submitted_at":"2026-06-29T14:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"REAR decomposes the reward into question and preference components, rescales their balance, and expresses the result as a linear combination of token log-probabilities for efficient integration with best-of-N and tree search.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30251","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TACO: Tool-Augmented Credit Optimization for Agentic Tool Use","primary_cat":"cs.MA","submitted_at":"2026-06-29T13:01:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TACO combines Differential Answer-Probe Reward (DAPR) and Outcome-Gated Advantage Routing (OGAR) to assign credit to tool calls in agentic visual reasoning, producing accuracy gains on multimodal benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30168","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latent Noise Mask for Reducing Visual Redundancy in Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-29T11:44:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lens purifies visual evidence in MLLMs via question-conditioned latent noise masking with a LET token, yielding 2.4-6.4 point gains on VQA and grounding tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30124","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SciIR: A Large-scale Training Dataset and Benchmark for Scientific Image Reasoning Generation","primary_cat":"cs.CV","submitted_at":"2026-06-29T10:59:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces SciIR-82k dataset and SciIR-Bench for scientific image reasoning generation organized by Peirce's semiotic triad, with fine-tuning raising model score from 35% to 43%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30113","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SA-VLA: State-aware tokenizer for improving Vision-Language-Action Models' performance","primary_cat":"cs.RO","submitted_at":"2026-06-29T10:45:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SA-VLA adds state conditioning to VQ-based action tokenization in VLA policies, expanding each discrete token's effective support to state-dependent actions and raising average success rates from 0.29 to 0.56 on 12 sim tasks and 0.15 to 0.33 on 3 real tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30084","ref_index":117,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One Forward Beats Two: InnerZoom for Accurate and Efficient GUI Grounding","primary_cat":"cs.CV","submitted_at":"2026-06-29T10:20:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InnerZoom bridges cross-layer evidence in one forward pass to achieve SOTA GUI grounding accuracy on six benchmarks while cutting latency up to 31.8% versus two-pass baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30058","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Emergence of a Shared Canonical Object Frame from In-the-Wild Videos","primary_cat":"cs.CV","submitted_at":"2026-06-29T09:48:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A coarse canonical mesh bottleneck plus multi-view consistency lets a shared object frame emerge from self-supervised training on in-the-wild videos without canonical labels or category conditioning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30054","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Illuminating Unified Multimodal Model for Free-form Interleaved Text-Image Generation","primary_cat":"cs.CV","submitted_at":"2026-06-29T09:45:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ILLUME-X is a unified multimodal model that generates free-form interleaved text-image sequences via an expanded data pipeline, progressive self-adaptive training, and ILScore evaluation, claiming outperformance over prior unified models on style transfer, image decomposition, and storytelling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30026","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MuseBench: Benchmarking Intent-Level Audiovisual Arts Understanding in MLLMs","primary_cat":"cs.CV","submitted_at":"2026-06-29T09:27:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MuseBench shows state-of-the-art MLLMs achieve only 48.29% accuracy on intent-level audiovisual arts understanding versus 87.18% for human experts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30019","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniDance: Multimodal Driven Dance Video Generation with Large-scale Internet Data","primary_cat":"cs.CV","submitted_at":"2026-06-29T09:23:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces CIPE-Dance as the largest dance video dataset and OmniDance framework for unified text-music multimodal dance video generation achieving SOTA on TI2V, MI2V, and MTI2V tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29984","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Be Faithful When Response: Returning Fluent and Grounded Answers for Vision-Language Models Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-06-29T08:58:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Faithful Warm-Start pre-training on causally consistent vision-language samples improves accuracy, stabilizes RL, and reduces unsupported reasoning in VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29936","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OpenSPM: An Environment-Transferable Robotic Key Spatial Pose Memory and Closed-Loop High-Frequency Flow-Matching Action Generation Model","primary_cat":"cs.RO","submitted_at":"2026-06-29T08:12:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OpenSPM extracts key spatial poses from demonstrations as transferable memory and generates high-frequency actions via flow-matching, achieving 85.6% success rate on ten LIBERO-GOAL tasks at 1033.3 Hz.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29928","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latent-CURE for Breast Cancer Diagnosis","primary_cat":"cs.CV","submitted_at":"2026-06-29T08:05:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Latent-CURE introduces latent-space chain-of-thought reasoning and dual-asymmetric optimization to produce transparent, robust breast cancer diagnoses in imbalanced cohorts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29905","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StrucTab: A Structured Optimization Framework for Table Parsing","primary_cat":"cs.CV","submitted_at":"2026-06-29T07:41:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StrucTab achieves SOTA table parsing performance by unifying structural subtasks through sequential reasoning and using decomposed RL rewards in Uni-TabRL, plus a new TableVerse-5K benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29814","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nemotron-Labs-Diffusion-Image: Advancing Masked Discrete Diffusion for High-Resolution Image Synthesis","primary_cat":"cs.CV","submitted_at":"2026-06-29T05:48:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A masked discrete diffusion model adds token editing at inference and grouped cross-entropy training to reach 0.90 GenEval, 86.9 DPG, and 10.76 HPSv3 scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29812","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consistency as Inductive Bias: Learning Cross-View Invariance for Robust Multimodal Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-29T05:45:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ConsistRoll enforces cross-view consistency during RLVR training for MLLMs by joint rewards on grouped original and augmented views, yielding robustness gains on math, general, and hallucination benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29805","ref_index":1,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Clearer Sight, Fewer Lies: Oriented Pickup Preference Optimization for Multimodal Hallucination Mitigation","primary_cat":"cs.CV","submitted_at":"2026-06-29T05:33:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPPO is an evidence-aware preference optimization objective that contrasts faithful responses under varying visual evidence strengths to reduce hallucinations in MLLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29760","ref_index":2,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MR-IQA: A Unified Margin View of Regression and Ranking for Blind Image Quality Assessment","primary_cat":"cs.CV","submitted_at":"2026-06-29T04:07:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MR-IQA unifies regression and ranking in BIQA via a quality-margin optimization framework in RL, showing competitive performance on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29712","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Struggle with Continuous Latents? Interpretable Discrete Latent Reasoning via Rendered Compression","primary_cat":"cs.CL","submitted_at":"2026-06-29T02:34:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DLR creates discrete latent tokens from rendered CoT images via clustering, enabling up to 20x compression and interpretable trajectories that outperform continuous latent baselines on reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29531","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MotionAtlas: Detailed Region Captioning for Motion-Centric Videos","primary_cat":"cs.CV","submitted_at":"2026-06-28T17:54:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MotionAtlas supplies a 2,073-question benchmark, a self-bootstrap pipeline yielding 159k captions, and fine-tuned Video-MLLMs that deliver 5.2-point gains over Qwen3-VL-4B on motion tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29472","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent-Computer Observation Interfaces Enable Dynamic Computer Use","primary_cat":"cs.AI","submitted_at":"2026-06-28T15:59:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AOI adds keyframe capture, volume-gated audio transcription, and visual narration to computer-use agents, producing +17 to +48 pp gains over screenshot baselines on DynaCU-Bench with no retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29451","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Platonic Defense: Backdoor Defense for Self-Supervised Encoders in the Era of Large Scale Pre-training","primary_cat":"cs.CV","submitted_at":"2026-06-28T15:16:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces an attack-agnostic black-box defense for SSL encoders that trains a conditional energy function via NCE and DSM to detect and purify representations, with an energy gap lower-bounded by mutual information.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29445","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridging VideoQA and Video-Guided Agentic Tasks via Generalized Keyframe Extraction","primary_cat":"cs.CV","submitted_at":"2026-06-28T15:11:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces VG-GUIBench benchmark and TASKER keyframe extraction algorithm that improves performance on VideoQA and video-guided agentic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29376","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SAD-GS: Learning Reliable 3D Semantic Gaussian Fields via Dynamic Geo-Semantic Anchoring","primary_cat":"cs.CV","submitted_at":"2026-06-28T12:52:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SAD-GS proposes dynamic geo-semantic anchoring via SAD and GSFL to learn reliable 3D semantic Gaussian fields, reporting best performance on LERF-OVS, 3D-OVS, and Mip-NeRF360 for open-vocabulary localization and segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29232","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Does Synthetic CT Transfer? A Label-Free Donor/Host Diagnostic for Medical Vision-Language Model Routing on Real Lung CT","primary_cat":"cs.CV","submitted_at":"2026-06-28T06:41:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Donor-driven nodule properties in synthetic CT transfer to real lung CT vision-language tasks while host-driven anatomy properties do not, enabling a label-free diagnostic for model routing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29028","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Keypose Exploration: Efficient Automatic Trajectory Labelling and Cross-Embodiment Policy Transfer","primary_cat":"cs.RO","submitted_at":"2026-06-27T17:48:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An automatic single-demo VLM trajectory labelling pipeline enables keypose-guided diffusion policies that match baseline performance and show preliminary benefits for cross-embodiment transfer on robomimic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28864","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On Test-Time Scaling for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-27T11:12:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Small well-performing LVLMs gain the most from test-time scaling with up to 30% improvements that can match or exceed larger models, while visual information is used mainly early in reasoning chains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28862","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HKVLM: Faithful Reasoning Grounding by Binding Language Queries to a Frozen Detector","primary_cat":"cs.CV","submitted_at":"2026-06-27T11:10:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HKVLM trains only an alignment hook to bind frozen LM query embeddings to frozen detector proposals via contrastive retrieval and bipartite assignment, yielding 50-90x grounding gains and reduced hallucinations on RefCOCO and POPE.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28696","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"COMPASS: Grounding Composition-Intent Guidance in Unified Multimodal Models","primary_cat":"cs.AI","submitted_at":"2026-06-27T02:43:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"COMPASS is a unified multimodal framework using a shared expert token τ_c to ground composition-intent for both perception and generation, backed by the new Comp-11 dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28643","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Obliviate: Erasing Concepts from Autoregressive Image Generation Models","primary_cat":"cs.CV","submitted_at":"2026-06-26T22:56:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Obliviate erases targeted concepts from autoregressive image generators via KL supervision on visual tokens over full trajectories, cutting nudity rates sharply on benchmarks while keeping general performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28551","ref_index":16,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DataComp-VLM: Improved Open Datasets for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-06-26T19:11:29+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"DataComp-VLM benchmark shows instruction-heavy data mixing outperforms filtering for VLM training, with DCVLM-Baseline achieving 63.6% on 33 tasks for 8B models (+5.4pp over FineVision).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28520","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Detecting Clinical Hallucinations in LVLMs via Counterfactual Visual Grounding Uncertainty","primary_cat":"cs.CV","submitted_at":"2026-06-26T18:15:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A counterfactual visual grounding uncertainty method detects hallucinations in LVLMs on medical images, improving over baselines with interpretable evidence and cross-model transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28215","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HAT-4D: Lifting Monocular Video for 4D Multi-Object Interactions via Human-Agent Collaboration","primary_cat":"cs.CV","submitted_at":"2026-06-26T16:05:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HAT-4D presents an agentic VLM-plus-human-in-the-loop pipeline for monocular 4D multi-object interaction reconstruction and releases the MVOIK-4D benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28164","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EchoSonar-R: A Multi-View Reasoning-Enabled Model for Disease Classification and Report Generation in Echocardiography","primary_cat":"cs.CV","submitted_at":"2026-06-26T14:58:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EchoSonar-R is a multi-view VLM for echocardiography that jointly does disease classification and report generation via SFT followed by GRPO reinforcement learning, reporting accuracy gains on private and public data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28149","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Robust In-Context Segmentation via Concept Guidance","primary_cat":"cs.CV","submitted_at":"2026-06-26T14:46:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CG-ICS improves ICS robustness by using MLLM-proposed textual concepts scored via SAM3 and tree search plus visual exemplars to activate a frozen SAM3, claiming SOTA accuracy and lower variance across references.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28077","ref_index":3,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TextDS: Parameter-Efficient Representation Alignment for Scene Text Detection under Distribution Shifts","primary_cat":"cs.CV","submitted_at":"2026-06-26T13:41:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TextDS uses a data-efficient dual-encoder with SWLoRA and CSF to achieve competitive scene text detection robustness under distribution shifts and adverse conditions using 4.9M trainable parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28060","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReScene: Structured Indoor Scene Reconstruction from Multi-View Captures","primary_cat":"cs.CV","submitted_at":"2026-06-26T13:08:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReScene introduces HierView for view prioritization and Relation-Aware Assembly for scene graph fusion, reporting 17% lower Chamfer Distance and 26% lower LPIPS than prior baselines on ScanNet while running faster.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28016","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TempAct: Advancing Temporal Plausibility in Autoregressive Video Generation via Planner-Executor RL","primary_cat":"cs.CV","submitted_at":"2026-06-26T12:19:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TempAct applies hierarchical planner-executor RL with group exploration and multi-level rewards to improve temporal consistency in autoregressive video models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27974","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProMSA:Progressive Multimodal Search Agents for Knowledge-Based Visual Question Answering","primary_cat":"cs.CV","submitted_at":"2026-06-26T11:23:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProMSA is a progressive multimodal search agent for KB-VQA that iteratively selects search tools under budgets, trained via rejection-sampling SFT then TN-GSPO RL, reporting gains on E-VQA and InfoSeek over RAG baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27947","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Understanding How MLLMs Describe Artworks Using Token Activation Maps","primary_cat":"cs.CV","submitted_at":"2026-06-26T10:42:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Token Activation Maps applied to MLLM art descriptions reveal that visual grounding strength varies by token category, with better artist identification than title prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27944","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"It Lied to a Doctor to Buy Poison Ingredients: Quantifying Real-World Misuse of Phone-use Agents","primary_cat":"cs.MM","submitted_at":"2026-06-26T10:37:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Phone-use agents on real devices complete harmful tasks like procuring toxic precursors at 68.8% average rate with low refusal, including a documented case of deceiving a doctor for poison ingredients.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27876","ref_index":31,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpatialUAV: Benchmarking Spatial Intelligence for Low-Altitude UAV Perception, Collaboration, and Motion","primary_cat":"cs.CV","submitted_at":"2026-06-26T09:16:55+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SpatialUAV releases a new multi-task benchmark for low-altitude UAV spatial intelligence and demonstrates that existing VLMs exhibit clear weaknesses in cross-view association and geometric reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27872","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"S$^2$-VLA: State-Space Guided Vision-Language-Action Models for Long-Horizon Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-26T09:13:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"S²-VLA uses a state-space model to maintain a belief state that produces dynamic gating weights for fusing visual, language, and action features, claiming better long-horizon manipulation than 7B models with only 2B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27871","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LocalNav: Distilling Frontier VLMs and Embodied RL for On-Device Object Goal Navigation","primary_cat":"cs.RO","submitted_at":"2026-06-26T09:11:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Distillation from frontier VLMs plus E-RLVR regularization produces a 4B local model that achieves 34.5% SR on OVON while cutting inference latency by 82.8%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27828","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Video-MME-Logical: A Controlled Diagnostic Benchmark for Video Temporal-Logical Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-26T08:12:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces Video-MME-Logical benchmark for controlled diagnostic evaluation of temporal-logical reasoning in MLLMs via five operations and 25 fine-grained tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27826","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NormAct: A Benchmark for Hidden Social Norm Compliance in Embodied Planning","primary_cat":"cs.AI","submitted_at":"2026-06-26T08:10:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NormAct shows MLLMs reach explicit goals in 67.3% of cases but comply with hidden norms in only 26.4%, with NormPerceptor raising task success from 24.2% to 46.7%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27663","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Direct Action-Head Injection of A Grounded 3D Point Unlocks Spatial and Task Generalization","primary_cat":"cs.RO","submitted_at":"2026-06-26T02:44:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Direct 3D point grounding injected into the action head via a two-layer MLP and adaptive layer norm boosts VLA success rates by 32-46 points on spatial and task perturbations in LIBERO-PRO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27514","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tessellating The Earth","primary_cat":"cs.CV","submitted_at":"2026-06-25T19:58:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TTE replaces fixed spherical bases with differentiable Voronoi partitions plus shared semantic tokens to create adaptive geolocation encoders that reach new SOTA on geospatial tasks and iNaturalist species classification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27500","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Aloe-Vision: Robust Vision-Language Models for Healthcare","primary_cat":"cs.CV","submitted_at":"2026-06-25T19:36:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Releases open medical LVLMs trained on a quality-filtered multimodal dataset, introduces CareQA-Vision benchmark from exams, reports performance gains over baselines, and flags adversarial vulnerabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27295","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LA4VLA: Learning to Act without Seeing via Language-Action Pretraining","primary_cat":"cs.RO","submitted_at":"2026-06-25T17:13:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LA4VLA creates a 33K language-action dataset from existing demos and shows that pretraining on language-action pairs before or alongside vision-language-action training boosts success rates in sim and real robot tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27187","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HarmVideoBench: Benchmarking Harmful Video Understanding in Large Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-06-25T15:50:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HarmVideoBench is a multi-layered benchmark for harmful video understanding in LVLMs with three hierarchical dimensions, and BCR is a method that raises average model performance from 61.7% to 84.4%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26872","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpatialFlow-GRPO: Where Spatial Credit Drives Image Editing","primary_cat":"cs.CV","submitted_at":"2026-06-25T10:58:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpatialFlow-GRPO adds region-level reward feedback and spatial alignment to Flow-GRPO-style RL for image editing, reporting gains on GEdit-Bench, ImgEdit-Bench, and a new MultiEditBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":100,"offset":0}}