{"total":11,"items":[{"citing_arxiv_id":"2606.31257","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Decodable Is Not Grounded: A Vision-Ablation Arbiter for VLM Spatial Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-30T07:33:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"A blank-image ablation test reveals that high probe accuracy on VLM spatial reasoning frequently reflects priors or inverted signs rather than image grounding, with horizontal grounded, vertical prior, and depth inverted.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23771","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PhotoFlow: Agentic 3D Virtual Photography Missions","primary_cat":"cs.CV","submitted_at":"2026-05-22T15:40:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PhotoFlow is a closed-loop agent framework that searches for camera parameters in 3D scenes according to language intent and outperforms one-shot, reflection, and random baselines on the new VPhotoBench of 47 scenes and 141 missions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23141","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VisAnalog: A Diagnostic Suite for Visual Concept Transfer on Natural Images","primary_cat":"cs.CV","submitted_at":"2026-05-22T01:43:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VisAnalog is a new controlled benchmark showing VLMs substantially underperform humans on visual concept transfer under one- to four-step deterministic transformations, with relation inference as the main failure mode.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20165","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CaMo: Camera Motion Grounded Evaluation and Training for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-19T17:50:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Proposes Spatial Narrative Score (SNS) evaluation for VLMs' camera motion understanding and introduces CaMo model achieving consistent performance on SNS and direct QA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16713","ref_index":20,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GeoWorld-VLM: Geometry from World Models for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-15T23:52:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoWorld-VLM aligns VLM image features with intermediate representations from camera-conditioned world models via fine-tuning only the encoder and projector, yielding ~4% gains on What'sUp and VSR spatial benchmarks across two VLM backbones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.27437","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SpatialStack: Layered Geometry-Language Fusion for 3D VLM Spatial Reasoning","primary_cat":"cs.CV","submitted_at":"2026-03-28T22:49:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpatialStack improves 3D spatial reasoning in vision-language models by stacking and synchronizing multi-level geometric features with the language backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.07632","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Test-Time Matching: Unlocking Compositional Reasoning in Multimodal Models","primary_cat":"cs.AI","submitted_at":"2025-10-09T00:00:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces group matching score for better evaluation of compositional reasoning and Test-Time Matching (TTM) algorithm for unsupervised self-improvement in multimodal models, achieving SOTA gains including surpassing GPT-4.1 and estimated human performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.19207","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Long Story Short: Disentangling Compositionality and Long-Caption Understanding in Contrastive VLMs","primary_cat":"cs.CV","submitted_at":"2025-09-23T16:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical study shows bidirectional but sensitive relationship between compositionality and long-caption understanding in VLMs, promoted by high-quality grounded data and affected by architectural choices like frozen positional embeddings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.13998","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Embodied-R1: Reinforced Embodied Reasoning for General Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2025-08-19T16:50:01+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Embodied-R1 uses a pointing-centric representation and reinforced fine-tuning on a 200K dataset to achieve state-of-the-art results on embodied benchmarks plus 56.2% success in SIMPLEREnv and 87.5% on real XArm tasks without task-specific training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.09965","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Reinforcing Spatial Reasoning in Vision-Language Models with Interwoven Thinking and Visual Drawing","primary_cat":"cs.CV","submitted_at":"2025-06-11T17:41:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VILASR integrates visual drawing operations with reasoning in LVLMs via cold-start synthetic training, reflective rejection sampling, and reinforcement learning, yielding an 18.4% average gain on spatial reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.07557","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AutoSpatial: Visual-Language Reasoning for Social Robot Navigation through Efficient Spatial Reasoning Learning","primary_cat":"cs.RO","submitted_at":"2025-03-10T17:27:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AutoSpatial improves VLM spatial reasoning for social navigation by combining minimal manual supervision with auto-labeled VQA pairs and hierarchical training, showing gains up to 20.5% in action prediction over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}