{"total":30,"items":[{"citing_arxiv_id":"2606.30288","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VisReflect: Latent Visual Reflection for Fine-Grained Perception in Long Visual Context","primary_cat":"cs.CV","submitted_at":"2026-06-29T13:30:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VisReflect generates continuous latent visual reflections to emphasize relevant visual features and guide attention in LVLMs, yielding 4.1% gains on image benchmarks and 1.8% on video benchmarks with 44% less inference time than zooming methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05769","ref_index":64,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Imagine Before You Predict: Interleaved Latent Visual Reasoning for Video Event Prediction","primary_cat":"cs.CV","submitted_at":"2026-06-04T06:53:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Future-L1 interleaves latent visual spans with text in MLLM decoding, trained on a custom Future-L1-50K dataset via LA-DAPO RL, and reports SOTA gains on FutureBench (61.0 to 85.4) and TwiFF-Bench (2.44 to 3.04).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03005","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MUSE: A Unified Agentic Harness for MLLMs","primary_cat":"cs.CV","submitted_at":"2026-06-02T01:24:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MUSE is a unified agentic harness that improves off-the-shelf MLLMs on visual spatial planning, perception, multimodal reasoning, and fine-grained discrimination benchmarks through structured execution modules and verifier-guided repair without model retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00562","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepLatent: Think with Images via Parallel Latent Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-30T06:33:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepLatent introduces a parallel latent visual reasoning framework with learnable 2D tokens and continuous RL, trained via distillation then RL, plus a new 180K dataset, claiming SOTA benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26014","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"STORM: Internalized Modeling for Spatial-Temporal Reasoning in Video-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-25T16:33:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"STORM teaches LVLMs to internalize spatial-temporal reasoning via bounded latent trajectories trained with generated thought videos in two stages, improving accuracy on VideoMME, MVBench and similar benchmarks while lowering inference overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22012","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LatentOmni: Rethinking Omni-Modal Understanding via Unified Audio-Visual Latent Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-21T05:18:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LatentOmni proposes a latent-space cross-modal reasoning framework that uses feature-level supervision and Omni-Sync Position Embedding to align and synchronize audio-visual latents, supported by a new 35K interleaved reasoning dataset and showing gains over text CoT baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19342","ref_index":7,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Semantic-Enriched Latent Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-19T04:29:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SLVR is a two-stage method that enriches region-centric latent representations with fine-grained attribute semantics and aligns them via M-GRPO across multiple queries on the same region, supported by new SLV-Set dataset and SV-QA benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18445","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What's Holding Back Latent Visual Reasoning?","primary_cat":"cs.CV","submitted_at":"2026-05-18T14:14:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Latent visual reasoning fails in current models because standard datasets make oracle latents uninformative and inference-time latents collapse away from useful representations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16961","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latent Action Control for Reasoning-Guided Unified Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-16T12:23:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Latent Action Control learns unobserved action trajectories via variational alignment and GRPO to inject reasoning into flow-based image generation, yielding gains on compositional benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15198","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ATLAS: Agentic or Latent Visual Reasoning? One Word is Enough for Both","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ATLAS uses a single functional token to unify agentic and latent visual reasoning without image generation or external execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12374","ref_index":12,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-12T16:41:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GAP aligns visual latent reasoning in MLLMs via PCA-mapped decoder outputs, auxiliary visual supervision, and selective capacity-guided training, yielding top supervised performance on a 7B model with evidence that latents carry task-relevant signal.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"RMSNorm itself rescales a hidden vector by its root-mean-square magnitude: RMSNorm(x)=g⊙ x√︃ 1 𝑑 Í𝑑 𝑗=1 𝑥2 𝑗 +𝜖 ,(10) wheregis a learned scale vector. This placement explains why hidden-state norms can accumulate with depth. For a single residual update, write xℓ+1 =x ℓ +u ℓ,u ℓ =𝐹 ℓ (RMSNorm(x ℓ)) ,(11) where𝐹 ℓ denotes the attention or MLP branch. Then ∥xℓ+1 ∥2 2 = ∥xℓ ∥2 2 + ∥uℓ ∥2 2 +2 ⟨xℓ,u ℓ⟩ .(12) Pre-norm controls the input scale seen by𝐹ℓ, but it does not normalize the residual stream afteruℓ is added. Therefore, if residual updates have nonzero energy and are not consistently anti-aligned with the current residual stream, i.e., 𝔼[⟨xℓ,u ℓ⟩] ≈0,𝔼 \u0002 ∥uℓ ∥2 2 \u0003 >0,(13) then the expected squared norm accumulates: 𝔼 \u0002 ∥x𝐿∥2 2 \u0003 ≈𝔼 \u0002 ∥x0 ∥2 2 \u0003"},{"citing_arxiv_id":"2605.12163","ref_index":23,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Self-Consistent Latent Reasoning: Long Latent Sequence Reasoning for Vision-Language Model","primary_cat":"cs.CV","submitted_at":"2026-05-12T14:13:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCOLAR fixes information gain collapse in latent visual reasoning by generating independent auxiliary visual tokens via a detransformer, extending acceptable CoT length over 30x and delivering +14.12% gains on reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11856","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UniVLR: Unifying Text and Vision in Visual Latent Reasoning for Multimodal LLMs","primary_cat":"cs.CV","submitted_at":"2026-05-12T09:40:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UniVLR unifies textual and visual reasoning in multimodal LLMs by compressing reasoning traces and auxiliary images into visual latent tokens for direct inference without interleaved text CoT.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"variant of Qwen2.5-VL-7B, which rely primarily on explicit textual reasoning or direct answer generation. Second,tool-based visual reasoningmethods, including PixelReasoner [ 9] and DeepEyes [8], improve perception by invoking external visual operations such as cropping, zooming, or multi- turn image inspection. Third,visual latent reasoningmethods include LVR [ 11], Monet [12], SkiLa [13],CoVT[15] which introduce latent visual reasoning tokens while still preserving explicit textual reasoning channels. Unlike these methods, UniVLR removes the explicit reasoning text channel at inference time and performs reasoning through a compact sequence of unified visual latent tokens. Implementation Details.We instantiate UniVLR on top of Qwen2."},{"citing_arxiv_id":"2605.10426","ref_index":27,"ref_count":4,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CoWorld-VLA: Thinking in a Multi-Expert World Model for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-11T12:01:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoWorld-VLA extracts semantic, geometric, dynamic, and trajectory expert tokens from multi-source supervision and feeds them into a diffusion-based hierarchical planner, achieving competitive collision avoidance and trajectory accuracy on the NAVSIM v1 benchmark.","context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"[25] Shibo Hao, Sainbayar Sukhbaatar, DiJia Su, Xian Li, Zhiting Hu, Jason Weston, and Yuandong Tian. Training large language models to reason in a continuous latent space.arXiv preprint arXiv:2412.06769, 2024. [26] Arijit Ray, Ahmed Abdelkader, Chengzhi Mao, Bryan A Plummer, Kate Saenko, Ranjay Krishna, Leonidas Guibas, and Wen-Sheng Chu. Mull-tokens: Modality-agnostic latent thinking.arXiv preprint arXiv:2512.10941, 2025. [27] Bangzheng Li, Ximeng Sun, Jiang Liu, Ze Wang, Jialian Wu, Xiaodong Yu, Hao Chen, Emad Barsoum, Muhao Chen, and Zicheng Liu. Latent visual reasoning.arXiv preprint arXiv:2509.24251, 2025. [28] Xinqing Li, Xin He, Le Zhang, Min Wu, Xiaoli Li, and Yun Liu. A comprehensive survey on world models for embodied ai.arXiv preprint arXiv:2510.16732, 2025. [29] Tim Brooks, Bill Peebles, Connor Holmes, Will DePue, Yufei Guo, Leo Jing, David Schnurr,"},{"citing_arxiv_id":"2605.07106","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Retrieve, Integrate, and Synthesize: Spatial-Semantic Grounded Latent Visual Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-08T01:33:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RIS improves MLLM latent visual reasoning by retrieving spatial-semantic evidence, integrating it via attention bottlenecks, and synthesizing it with language transition tokens, yielding gains on V*, HRBench, MMVP, and BLINK benchmarks.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Since the model's reasoning behavior and decoding interface are largely shaped by language pretraining, effective latent visual reasoning must not only exploit the expressive capacity of a latent visual manifold Mvis, but also remain compatible with the vocabulary-aligned manifold Mvocab where pretrained reasoning circuits and language-grounded decoding are organized. Existing methods such as LVR[12] and Monet[13] take important steps by reconstructing visual tokens from latent states or generating continuous embeddings as intermediate visual thoughts, but they do not fully resolve this compatibility problem. In this work, we first analyzewhy existing latent visual reasoning methods remain ineffective despite forming distinct latent visual representations."},{"citing_arxiv_id":"2605.05997","ref_index":12,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"4DThinker: Thinking with 4D Imagery for Dynamic Spatial Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-07T10:48:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"4DThinker enables VLMs to perform dynamic spatial reasoning by thinking with 4D latent mental imagery using new fine-tuning and reinforcement learning methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"For a given question, the policyπθ samples a group of G candidate responses {yi}G i=1. Each response is evaluated using a composite reward function: R(yi) =λ acc Racc(yi) +λ fmt Rfmt(yi),(11) where Racc, Rfmt ∈ {0,1} reward answer correctness and the \"think with 4D\" format, respectively. The group-normalized advantages are then computed as follows: ˆAi = R(yi)−µ G σG , µ G = 1 G P j R(yj), σ G = q 1 G P j(R(yj)−µ G)2.(12) The policy is optimized via a clipped surrogate objective, regularized by the KL divergence against the frozen DIFT reference policy πref. A key modification over standard GRPO is that we restrict the policy gradient to the index set T (i) txt ={1, . . . ,|y i|} \\ T (i) lat , whichexplicitly excludes all latent token positions. This is toavoid destabilizing gradient noise caused by the mismatch between continuous"},{"citing_arxiv_id":"2604.21027","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HypEHR: Hyperbolic Modeling of Electronic Health Records for Efficient Question Answering","primary_cat":"cs.AI","submitted_at":"2026-04-22T19:18:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HypEHR is a hyperbolic embedding model for EHR data that uses Lorentzian geometry and hierarchy-aware pretraining to answer clinical questions nearly as well as large language models but with much smaller size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20328","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HyLaR: Hybrid Latent Reasoning with Decoupled Policy Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-22T08:22:23+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"verbalize, leaving the model to rely on linguistic priors rather than grounded visual facts. To mitigate this, recent studies have explored two primary alternatives. The first is the \"Think-with-Images\" paradigm (Fig. 1(B)), which relies on external tools to re-perceive the image, yet introduces rigid bottlenecks and inference latency [12,44,48]. The second alternative, including recent pioneering works, such as LVR [17], SkiLa [28] and Monet [32], shifts reasoning into a continuous latent space to preserve visual fidelity. While promising, optimizing the resulting hybrid discrete-continuous action space remains profoundly challenging. Current approaches predominantly relying on supervised fine-tuning (SFT) or vanilla re- inforcement learning (RL) often yield sub-optimal optimization, as conventional"},{"citing_arxiv_id":"2604.11025","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Test-time Scaling over Perception: Resolving the Grounding Paradox in Thinking with Images","primary_cat":"cs.CV","submitted_at":"2026-04-13T05:49:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TTSP resolves the Grounding Paradox by treating perception as a scalable test-time process that generates, filters, and iteratively refines multiple visual exploration traces, outperforming baselines on high-resolution and multimodal reasoning tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"However, applying test-time scaling to visual reasoning is not straightforward. Existing methods are largely designed for text- based reasoning, where diversity across trajectories typically re- flects alternative chains of logic over a fixed and fully observed input. In contrast, visual reasoning involves uncertainty not only in how to reason, but also in what has been perceived [18, 27]. Differ- ent trajectories may inspect different regions, collect different pieces of visual evidence, and even condition subsequent reasoning on incompatible perceptual observations [19, 31]. As a result, naively transplanting self-consistency or majority voting into multimodal settings is inadequate: without explicitly modeling the reliability,"},{"citing_arxiv_id":"2604.10500","ref_index":32,"ref_count":3,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Visual Enhanced Depth Scaling for Multimodal Latent Reasoning","primary_cat":"cs.CV","submitted_at":"2026-04-12T07:14:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Visual replay module and adaptive depth scaling improve multimodal latent reasoning, reaching SOTA benchmarks with faster inference than explicit chain-of-thought methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Llava-onevision: Easy visual task transfer. Transactions on Machine Learning Research, 2024. 6, 7 [30] Bangzheng Li, Ximeng Sun, Jiang Liu, Ze Wang, Jialian Wu, Xiaodong Yu, Hao Chen, Emad Barsoum, Muhao Chen, and Zicheng Liu. Latent visual reasoning.arXiv preprint arXiv:2509.24251, 2025. 3 [31] Bangzheng Li et al. Latent visual reasoning.arXiv preprint arXiv:2509.24251, 2025. 7 [32] Chengzu Li, Wenshan Wu, Huanyu Zhang, Yan Xia, Shaoguang Mao, Li Dong, Ivan Vuli'c, and Furu Wei. Imag- ine while reasoning in space: Multimodal visualization-of- thought.arXiv preprint arXiv:2501.07542, 2025. 3 [33] Yi Li, Hualiang Wang, Xinpeng Ding, Haonan Wang, and Xiaomeng Li. Token activation map to visually explain multi- modal llms. InProceedings of the IEEE/CVF International"},{"citing_arxiv_id":"2604.09757","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MedLVR: Latent Visual Reasoning for Reliable Medical Visual Question Answering","primary_cat":"cs.CV","submitted_at":"2026-04-10T16:03:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MedLVR interleaves latent visual reasoning segments in autoregressive decoding and uses two-stage training to raise average medical VQA accuracy from 48.3% to 53.4% over a Qwen2.5-VL-7B backbone on OmniMedVQA and five other benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Recent studies suggest that intermediate reasoning can also be carried out in a continuous latent space [28]-[30]. In this view, the model alternates between language decoding and latent updates, using latent states as an internal substrate to preserve and refresh grounding constraints throughout gener- ation, rather than relying on textual tokens alone [18], [31]. Motivated by this mismatch, we introduce a latent visual reasoning pathway for medical visual question answering (VQA), in which latent updates serve as an explicit mecha- nism for maintaining grounding constraints during decoding (Fig. 1). To stabilize latent reasoning and keep it aligned with question-relevant cues, we optimize the model with a"},{"citing_arxiv_id":"2604.08879","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GRASP: Grounded CoT Reasoning with Dual-Stage Optimization for Multimodal Sarcasm Target Identification","primary_cat":"cs.CL","submitted_at":"2026-04-10T02:38:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRASP improves multimodal sarcasm target identification by anchoring visual regions in grounded chain-of-thought reasoning and using dual-stage optimization on a new balanced dataset.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"instructions, where within a dedicated Think module, the model first extracts salient visual anchors via bounding boxes and core text spans for fine-grained perception, and then explicitly aligns them to mitigate cross-modal incongruities. By directly referencing visual regions during reasoning, rather than compressing them into textual representations, GRASP effectively achieves \"Thinking with Images\" [17, 27]. It preserves fine-grained spatial details and fosters cooperation between visual grounding and abstract cogni- tive reasoning. Following this explicit reasoning process, the model yields the classification label, textual targets, and precise bounding box coordinates within a final Answer module. To endow GRASP with robust reasoning capabilities, we devise the noveldual-stage"},{"citing_arxiv_id":"2604.08545","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Act Wisely: Cultivating Meta-Cognitive Tool Use in Agentic Multimodal Models","primary_cat":"cs.CV","submitted_at":"2026-04-09T17:59:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HDPO reframes tool efficiency as a conditional objective within accurate trajectories, enabling Metis to reduce tool invocations by orders of magnitude while raising reasoning accuracy.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Inspired by the success of chain-of-thought in LLMs, recent MLLMs introduce explicit intermediate reasoning to handle more complex multimodal problems [11, 40]. These models generate step-by-step textual rationales before producing final answers, leading to improvements on complex multimodal reasoning tasks [30, 44, 47, 49]. More recently, several works explore latent visual reasoning [13, 29, 31] by inserting continuous visual representations into the reasoning process, which further improves spatial reasoning ability [54]. However, despite these advances, most existing MLLMs [18, 25] remain passive in that they mainly interpret inputs and generate responses, without actively invoking external tools for retrieval or computation, which limits"},{"citing_arxiv_id":"2604.07518","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Decompose, Look, and Reason: Reinforced Latent Reasoning for VLMs","primary_cat":"cs.CL","submitted_at":"2026-04-08T18:52:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DLR is a new reinforced latent reasoning method for VLMs that decomposes queries, uses continuous visual latents, and outperforms text-only and multimodal CoT baselines on vision-centric benchmarks with better interpretability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06912","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Q-Zoom: Query-Aware Adaptive Perception for Efficient Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-08T10:12:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Q-Zoom achieves up to 4.39x inference speedup in high-resolution MLLM scenarios via query-aware gating and region localization, matching or exceeding baseline accuracy on document and high-res benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"reliable reward signals. More critically, during inference, these methods inadvertently shift the computational burden from the vision encoder to the language model. They rely on lengthy Chain-of-Thought (CoT) decoding stages to \"think\" prior to answering, which dramatically inflates inference latency. Although recent latent thinking paradigms [61], [62] attempt to compress these reasoning trajectories in the hidden space, they inevitably impose a strict ceiling on the model's ultimate perceptual performance. III. METHOD A. Preliminaries In widely adopted LLaV A-style architectures, a Multimodal Large Language Model (MLLM) typically comprises three core components: a vision encoderE v, a vision-language"},{"citing_arxiv_id":"2604.06777","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Walk the Talk: Bridging the Reasoning-Action Gap for Thinking with Images via Multimodal Agentic Policy Optimization","primary_cat":"cs.CV","submitted_at":"2026-04-08T07:48:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAPO improves multimodal chain-of-thought reasoning by requiring explicit textual descriptions of visual tool results and using a novel advantage estimator that combines semantic alignment with task rewards.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Latent sketchpad: Sketching visual thoughts to elicit multimodal reasoning in mllms. arXiv preprint arXiv:2510.24514, 2025. [57] Qixun Wang, Yang Shi, Yifei Wang, Yuanxing Zhang, Pengfei Wan, Kun Gai, Xianghua Ying, and Yisen Wang. Monet: Reasoning in latent visual space beyond images and language.arXiv preprint arXiv:2511.21395, 2025. 13 APREPRINT [58] Bangzheng Li, Ximeng Sun, Jiang Liu, Ze Wang, Jialian Wu, Xiaodong Yu, Hao Chen, Emad Barsoum, Muhao Chen, and Zicheng Liu. Latent visual reasoning.arXiv preprint arXiv:2509.24251, 2025. [59] Shuai Dong, Siyuan Wang, Xingyu Liu, Chenglin Li, Haowen Hou, and Zhongyu Wei. Interleaved latent visual reasoning with selective perceptual modeling.arXiv preprint arXiv:2512."},{"citing_arxiv_id":"2604.03307","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"V-Reflection: Transforming MLLMs from Passive Observers to Active Interrogators","primary_cat":"cs.CV","submitted_at":"2026-03-31T03:57:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"V-Reflection introduces a think-then-look mechanism where MLLM latent states actively interrogate visual features via two-stage distillation from a box-guided teacher to a dynamic autoregressive student, narrowing the fine-grained perception gap on benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"execute a purely end-to-end visual search driven by its latent reasoning process. for greater flexibility and more condensed reasoning chains [5]. Building on this direction, several works have extended latent reasoning to MLLMs. Current methods primarily align these states with static encoder features or auxiliary signals, such as helper images [ 34, 24], annotated boxes [ 12] or fine-grained perceptual priors from models [17]. However, these introduced supervision signals necessitate costly auxiliary data and critically overlook the dynamic, top-down guidance inherent in the LLM's own evolving latent states. In contrast,V-Reflectionreplaces such passive alignment with an active interrogation paradigm, utilizing hidden states as dynamic probes to autonomously"},{"citing_arxiv_id":"2601.18664","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"S$^2$GR: Stepwise Semantic-Guided Reasoning in Latent Space for Generative Recommendation","primary_cat":"cs.IR","submitted_at":"2026-01-26T16:40:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"S²GR adds stepwise thinking tokens with contrastive supervision on codebook clusters to balance computational focus and ground reasoning paths in generative recommendation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.06803","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Forest Before Trees: Latent Superposition for Efficient Visual Reasoning","primary_cat":"cs.CL","submitted_at":"2026-01-11T08:30:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Laser reformulates visual reasoning via Dynamic Windowed Alignment Learning to maintain latent superposition of global features, delivering 5.03% average gains over Monet and over 97% fewer inference tokens on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.10226","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latent Chain-of-Thought World Modeling for End-to-End Driving","primary_cat":"cs.CV","submitted_at":"2025-12-11T02:22:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LCDrive unifies chain-of-thought reasoning and action selection for end-to-end driving by interleaving action-proposal tokens and latent world-model tokens that predict action outcomes, yielding faster inference and better trajectories than text-based or non-reasoning baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}