{"total":735,"items":[{"citing_arxiv_id":"2606.27872","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"S$^2$-VLA: State-Space Guided Vision-Language-Action Models for Long-Horizon Manipulation","primary_cat":"cs.RO","submitted_at":"2026-06-26T09:13:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"S²-VLA uses a state-space model to maintain a belief state that produces dynamic gating weights for fusing visual, language, and action features, claiming better long-horizon manipulation than 7B models with only 2B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27187","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HarmVideoBench: Benchmarking Harmful Video Understanding in Large Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-06-25T15:50:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HarmVideoBench is a multi-layered benchmark for harmful video understanding in LVLMs with three hierarchical dimensions, and BCR is a method that raises average model performance from 61.7% to 84.4%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24187","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Fast and Effective Long Video Understanding of Multimodal Large Language Models via Adaptive Quasi-Gaussian Sampling","primary_cat":"cs.CV","submitted_at":"2026-06-23T06:13:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AdaQ is a training-free adaptive quasi-Gaussian sampling method for keyframe selection that improves long-video understanding in MLLMs and can outperform GPT-4o with 64 frames.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20728","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VTOS: Learning to Orchestrate Vision Tools by Co-Searching Solutions and Observers","primary_cat":"cs.CV","submitted_at":"2026-06-17T04:52:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VTOS jointly searches solution and observer programs to adaptively orchestrate vision tools, outperforming static pipelines on dense object counting and zero-shot plant disease segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12984","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillChain: Closing the Loop on Skill Evolution for Image-Based E-Commerce AI Assistants","primary_cat":"cs.CL","submitted_at":"2026-06-11T07:21:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SkillChain automates skill lifecycle for e-commerce image AI assistants via creator, optimizer, and refiner stages, leading to improved response quality and user engagement in production A/B tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11324","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Embodied-R1.5: Evolving Physical Intelligence via Embodied Foundation Models","primary_cat":"cs.RO","submitted_at":"2026-06-09T18:07:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Embodied-R1.5 is an 8B EFM achieving SOTA on 16 of 24 embodied VLM benchmarks, fine-tunable to outperform leading VLAs, with claimed zero-shot real-robot generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08992","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpaceVLN: A Zero-Shot Vision-and-Language Navigation Agent with Online Spatial Cognitive Memory and Reasoning","primary_cat":"cs.RO","submitted_at":"2026-06-08T03:42:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpaceVLN proposes a stagewise closed-loop framework using Spatial Cognitive Memory and Spatial-CoT for zero-shot vision-and-language navigation and object-goal navigation, reporting SOTA results on R2R-CE, RxR-CE, GN-Bench, and HM3D-OVON plus real-robot tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04166","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"End-to-End Text Line Detection and Ordering","primary_cat":"cs.CV","submitted_at":"2026-06-02T19:29:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Orli is an autoregressive image-to-sequence model that jointly detects text lines and determines their reading order on historical documents via chord-frame baselines, trained on 196k pages across ten scripts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02569","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AdaCodec: A Predictive Visual Code for Video MLLMs","primary_cat":"cs.CV","submitted_at":"2026-06-01T17:56:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AdaCodec introduces a predictive visual code that cuts visual token use in video MLLMs by sending full frames only on high predictive cost and otherwise encoding inter-frame changes as P-tokens, yielding better benchmark scores at lower budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02673","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Visual Graph Scaffolds for Structural Reasoning in Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-06-01T12:17:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Visual graph mind maps outperform text-flattened versions as internal reasoning scaffolds for LLMs on multi-hop QA, with the advantage holding after fine-tuning and distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01414","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent Skills Should Go Beyond Text: The Case for Visual Skills","primary_cat":"cs.CV","submitted_at":"2026-05-31T19:22:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper proposes that reusable agent skills should incorporate visual elements alongside text, introduces three forms of visual skills and an automatic conversion system, and reports better performance on GUI and visual-centric tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01223","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Connecting the Dots: Benchmarking Reflective Memory in Long-Horizon Dialogue","primary_cat":"cs.CL","submitted_at":"2026-05-31T13:16:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RefMem-Bench benchmarks reflective memory in dialogue with 26K instances across eight dimensions, and REMIND improves model accuracy via hierarchical evidence retrieval, grounding, and abstraction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01079","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Chameleon: Style-Content Disentangled Framework for Cross-Domain Object Compositing","primary_cat":"cs.CV","submitted_at":"2026-05-31T07:54:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chameleon proposes the first large-scale cross-domain compositing dataset and a disentangled encoder plus gated diffusion transformer that outperforms prior in-domain and cross-domain methods on plausibility and fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00959","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Understanding Modality Interaction in Multimodal Language Models via Partial Information Decomposition","primary_cat":"cs.AI","submitted_at":"2026-05-31T02:29:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PID applied to MLLMs identifies task-specific modality interaction profiles that generalize across models, extend to tri-modal cases, and yield initial performance gains via reweighting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00829","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Right Inference Strategy Is All You Need: Nearly Training-Free Domain-Wise Inference for EgoCross Challenge","primary_cat":"cs.CV","submitted_at":"2026-05-30T17:55:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Domain-specific prompting and minimal fine-tuning on Qwen3-VL-4B yields 66.98% accuracy on EgoCross egocentric video QA with only 20 training samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00828","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RoboStressBench: Benchmarking VLM Robustness to Physical Visual Stress in Embodied Scenes","primary_cat":"cs.CV","submitted_at":"2026-05-30T17:55:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RoboStressBench decomposes visual stress into four physically grounded dimensions to benchmark VLM robustness in embodied scenes and proposes a stress-aware solver.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00793","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MBench: A Comprehensive Benchmark on Memory Capability for Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-30T16:17:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MBench is a new benchmark that quantifies long-term memory in video world models via three hierarchical consistency dimensions evaluated on curated real videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00622","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MM-Snowball: Evaluating and Mitigating Hallucination Snowballing in Multimodal Multi-Turn Dialogue","primary_cat":"cs.CV","submitted_at":"2026-05-30T08:53:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MM-Snowball benchmark diagnoses hallucination snowballing in multi-turn MLLM dialogues; CAVR mitigates it via dual visual rectification at representation and logit levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00620","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FlowNar: Scalable Streaming Narration for Long-Form Videos","primary_cat":"cs.CV","submitted_at":"2026-05-30T08:51:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlowNar achieves bounded memory and 3x higher throughput for streaming narration on Ego4D, EgoExo4D, and EpicKitchens100 by combining dynamic historical context removal with a Cross Linear Attentive Memory module.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00616","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pause and Think: A Dataset and Benchmark for Video-Grounded Assistive Action Suggestion","primary_cat":"cs.CV","submitted_at":"2026-05-30T08:33:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces pause-and-think-T dataset and pause-and-think-B benchmark for video-grounded assistive action suggestion, enabling a 4B VLM to match larger models on reasoning tasks and generalize to EgoThink and TempCompass.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00562","ref_index":97,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepLatent: Think with Images via Parallel Latent Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-30T06:33:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepLatent introduces a parallel latent visual reasoning framework with learnable 2D tokens and continuous RL, trained via distillation then RL, plus a new 180K dataset, claiming SOTA benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31598","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Linear Scaling Video VLMs for Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StateKV is an inference-time technique that replaces quadratic self-attention prefill in video VLMs with a fixed-capacity importance-based recurrent state, keeping accuracy near full attention on long-video benchmarks without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31595","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Global Motion with Compact Gaussians for Feed-Forward 4D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:57:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"C4G introduces compact timestamp-conditioned Gaussian query tokens that aggregate full temporal context to decode 3D Gaussians with timestamp-modulated positions for feed-forward 4D reconstruction from monocular video, plus a diffusion-based rendering module and extension to 4D feature fields.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31513","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Personalize Your Large Vision-language Models With In-context Prompt Tuning","primary_cat":"cs.CV","submitted_at":"2026-05-29T16:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ICPT adds an adaptive-length projection module and two geometric regularizations to enable efficient, high-accuracy personalization of LVLMs across complex multi-concept tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31460","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Device Robotic Planning: Eliminating Inference Redundancy for Efficient Decision-Making","primary_cat":"cs.RO","submitted_at":"2026-05-29T15:51:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"REIS reduces inference redundancy in embodied robotic planning via lightweight gating and routing while preserving task performance on ALFRED and real robots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31457","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VisionPulse: Dynamic Visual Sparsity for Efficient Multimodal Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-29T15:51:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VisionPulse is a step-wise visual token pruning method for LMMs that retains 5% of tokens per step, shortens reasoning traces by 11.2%, and maintains accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31429","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"YARD: Y-Architecture Register Decoding for Efficient Hallucination Mitigation in Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-29T15:23:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"YARD is a training-free method using Y-shaped decoder architecture and register tokens to improve contrastive decoding for hallucination reduction in LVLMs with lower latency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31251","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ERGeoBench:A Comprehensive Benchmark for Embodied Reasoning and Geo-localization in Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-29T12:49:17+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ERGeoBench is a new diagnostic benchmark evaluating MLLMs on four capabilities in three progressive embodied geo-localization settings, finding that models handle high-level semantics but struggle with fine-grained perception and metric localization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31174","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Detect in Any Scene: An Agentic Framework for Object Detection with Experience-Aware Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-29T11:41:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DetAS-X uses an MLLM agent to adaptively compose detection workflows from restoration modules and expert detectors, enhanced by self-evolving experience harvesting, achieving substantial F1 score gains on challenging benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31093","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cross-Modal Clinical Knowledge Integration for Mammography Report Generation","primary_cat":"cs.CV","submitted_at":"2026-05-29T10:04:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MammoRG integrates cross-modal prior clinical knowledge and BI-RADS terminology via two-stage training to generate mammography reports with higher clinical consistency than prior direct image-to-text methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31075","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Task-Focused Memorization for Multimodal Agents","primary_cat":"cs.CV","submitted_at":"2026-05-29T09:43:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TaskMem uses RL in two phases to learn a task-focused memorization policy for multimodal agents, yielding 5.3-7.0% VQA accuracy gains on reformulated streaming benchmarks from VideoMME, EgoLife, and EgoTempo.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30834","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hide-and-Seek in Trajectories: Discovering Failure Signals for VLA Runtime Monitoring","primary_cat":"cs.RO","submitted_at":"2026-05-29T04:40:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hide-and-Seek uses contrastive objectives on trajectories to localize failure signals in VLA models from trajectory-level supervision alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11232","ref_index":79,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Every Act Has Its Price: Compressed Moral Composition in Frontier LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:36:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moral Trolley Arena shows frontier LLMs produce composite moral preferences that are compressed rather than additive functions of calibrated component act strengths across Moral Foundations Theory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30698","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seeing Before Agreeing: Aligning Multi-Agent Consensus with Visual Evidence","primary_cat":"cs.CV","submitted_at":"2026-05-29T00:45:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EAGLE is a new evidence-aligned framework that improves multi-agent VQA by enforcing consistency in visual grounding across agents, achieving best average performance on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30639","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PInVerify: An Offline Embodied Benchmark for Active Instance Verification","primary_cat":"cs.CV","submitted_at":"2026-05-28T22:42:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PInVerify is a new offline embodied benchmark for active instance verification that supplies multi-view captures and 6-sector navigation topology, with MLLM baselines reaching 85.6% after fine-tuning but showing no reliable benefit from tested next-best-view strategies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30561","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLM3: Vision Language Models Are Native 3D Learners","primary_cat":"cs.CV","submitted_at":"2026-05-28T20:48:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Standard VLMs achieve expert-level 3D performance on depth estimation, pose estimation, and object understanding via three simple techniques without architecture changes or regression losses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30341","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GPIC: A Giant Permissive Image Corpus for Visual Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:59:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GPIC is a new 28-trillion-pixel permissively licensed image corpus with 100M training examples for visual generative modeling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30311","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Archon: A Unified Multimodal Model for Holistic Digital Human Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:53:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Archon unifies seven modalities via modality-specific tokenizers and an autoregressive backbone pretrained on 72 tasks, plus a 4x-efficient video reparameterization and stepwise 'Thinking in Modality' procedure, and reports superior or comparable results on digital-human tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30307","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grounded 3D-Aware Spatial Vision-Language Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:51:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GR3D is a VLM that combines explicit 2D, implicit 2D, and monocular 3D grounding mechanisms to improve performance on spatial understanding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30265","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LoMo: Local Modality Substitution for Deeper Vision-Language Fusion","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:27:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LoMo is a lightweight data curation technique that locally substitutes text with images in prompts to enforce cross-modal invariance, yielding 2.67-2.82 point gains over standard SFT on two VLMs across 13 benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30170","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unveiling the Visual Counting Bottleneck in Vision-Language Models","primary_cat":"cs.MM","submitted_at":"2026-05-28T16:20:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLMs fail at visual counting extrapolation because they cannot project visual magnitudes onto symbolic tokens, despite intact perceptual representations, supporting a fractured magnitude hypothesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30161","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Far Looks Up: Probing Spatial Representation in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-28T16:18:01+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VLMs exhibit consistent vertical-distance entanglement in embeddings from perspective bias in natural images, producing accuracy gaps that a new synthetic benchmark SpatialTunnel exposes as model-intrinsic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30045","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GenEraser: Generalizable Video Object Removal via Balanced Text-Mask Guidance and Decoupled Locator-Preserver","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:58:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GenEraser proposes MC-MoE with bipartite text guidance, LD-CFG fusion, and a decoupled locator-preserver architecture for generalizable video object and effect removal, claiming 2.16 dB and 1.44 dB gains on ROSE and VOR-Eval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30029","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RAISE: RAG Design as an Architecture Search Problem","primary_cat":"cs.AI","submitted_at":"2026-05-28T14:52:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RAISE is a standardized benchmark for RAG hyperparameter optimization that evaluates 13 search algorithms across seven datasets and finds performance is highly task-dependent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29894","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Train the Agent, Not the Expert: Learning to Harness Heterogeneous Experts for Multi-Turn Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-28T13:18:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VisHarness learns a reinforcement-learned policy to harness specialized visual experts via multi-turn interactions and dynamic visual memory archiving, outperforming general models on four visual reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29858","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Masked Diffusion Vision-Language Models for Temporal Action Localization","primary_cat":"cs.CV","submitted_at":"2026-05-28T12:39:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Adapts MDVLMs to TAL via planned training objective and step-level IoU reward, reporting gains over autoregressive baselines on ActivityNet and THUMOS datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29850","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MIRAGE: Adaptive Multimodal Gating for Whole-Brain fMRI Encoding","primary_cat":"cs.LG","submitted_at":"2026-05-28T12:30:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIRAGE uses adaptive multimodal gating on native multimodal backbones plus a transformer encoder to achieve state-of-the-art whole-brain fMRI prediction for naturalistic audiovisual stimuli, outperforming post-hoc unimodal aggregation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29833","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniMatBench: A Human-Calibrated Multimodal Reasoning Benchmark Across 19 Materials Science Subfields","primary_cat":"cs.AI","submitted_at":"2026-05-28T12:12:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniMatBench is a new human-calibrated benchmark for multimodal materials-science reasoning that reveals the best evaluated MLLM scores only 0.372 overall.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00123","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CardioLens: Revealing the Clinical Reality Gap of MLLMs via Multi-Sequence Cardiac MRI Evaluations","primary_cat":"cs.CV","submitted_at":"2026-05-28T11:03:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CardioLens is a leakage-resistant CMR testbed of 473k slices and 13k QA pairs showing current MLLMs exhibit a large clinical reality gap with category-collapse failures on real workflows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29657","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OccamToken: Efficient VLM Inference with Training-Free and Budget-Adaptive Token Pruning","primary_cat":"cs.CV","submitted_at":"2026-05-28T09:20:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OccamToken replaces absolute token ranking with register-anchored relative evidence testing to enable adaptive, high-ratio visual token pruning in VLMs while preserving most accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}