{"total":19,"items":[{"citing_arxiv_id":"2605.30542","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Physically Viable World Models: A Case for Query-Conditioned Embodied AI","primary_cat":"cs.AI","submitted_at":"2026-05-28T20:18:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Embodied AI requires query-conditioned world models that select the simplest physical abstraction sufficient to answer intervention queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30339","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Benchmarking Single-Factor Physical Video-to-Audio Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:59:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FlatSounds benchmark shows state-of-the-art V2A models rely more on text captions than visual input for physical and semantic accuracy, with captions improving correctness but degrading temporal alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29585","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"World Models in Words: Auditing Physical State-Transition Commitments in Vision-Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-28T08:29:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WMW audits VLMs by requiring typed physical state-transition traces and using a verifier to detect inconsistencies missed by answer-only evaluation, with TraceBank as a released resource of synthetic scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20576","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"$\\Delta$ynamics: Language-Based Representation for Inferring Rigid-Body Dynamics From Videos","primary_cat":"cs.CV","submitted_at":"2026-05-20T00:23:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A vision-language framework generates text-based rigid-body scene configurations from videos using motion reasoning and optical flow, reporting 0.30 IoU on CLEVRER (7x over baselines) and transfer to 235 real videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18746","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ESI-Bench: Towards Embodied Spatial Intelligence that Closes the Perception-Action Loop","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:59:02+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16713","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GeoWorld-VLM: Geometry from World Models for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-15T23:52:11+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15298","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PhysBrain 1.0 Technical Report","primary_cat":"cs.RO","submitted_at":"2026-05-14T18:11:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PhysBrain 1.0 extracts scene elements, spatial dynamics, actions and depth relations from human egocentric video to create QA supervision for VLMs, then transfers the resulting physical priors to VLA policies via capability-preserving adaptation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15185","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Quantitative Video World Model Evaluation for Geometric-Consistency","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PDI-Bench computes 3D projective residuals from segmented and tracked points to quantify geometric inconsistency in AI-generated videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07568","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Tracing the Arrow of Time: Diagnosing Temporal Information Flow in Video-LLMs","primary_cat":"cs.CV","submitted_at":"2026-05-08T10:40:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Temporal information in Video-LLMs is encoded well by video-centric encoders but disrupted by standard projectors; time-preserved MLPs plus AoT supervision yield 98.1% accuracy on arrow-of-time and gains on other temporal tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"InProceedings of the IEEE/CVF conference on computer vision and pattern recognition, pages 24185-24198, 2024. [12] Zesen Cheng, Sicong Leng, Hang Zhang, Yifei Xin, Xin Li, Guanzheng Chen, Yongxin Zhu, Wenqi Zhang, Ziyang Luo, Deli Zhao, et al. Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms.arXiv preprint arXiv:2406.07476, 2024. [13] Wei Chow, Jiageng Mao, Boyi Li, Daniel Seita, Vitor Guizilini, and Yue Wang. Physbench: Benchmarking and enhancing vision-language models for physical world understanding.arXiv preprint arXiv:2501.16411, 2025. [14] Daniel Cores, Michael Dorkenwald, Manuel Mucientes, Cees GM Snoek, and Yuki M Asano. Lost in time: A new temporal benchmark for videollms."},{"citing_arxiv_id":"2605.04515","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"From Priors to Perception: Grounding Video-LLMs in Physical Reality","primary_cat":"cs.CV","submitted_at":"2026-05-06T05:48:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Video-LLMs fail physical reasoning due to semantic prior dominance rather than perception deficits; a new programmatic adversarial curriculum and visual-anchored reasoning chain enable substantial gains via standard LoRA fine-tuning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"standard LoRA [16] fine-tuning empowers models to break prior constraints and achieve a substantial leap in genuine physical reasoning capabilities. 2 Related Work 2.1 Physical Reasoning in Video-LLMs Although Video-LLMs demonstrate proficiency in general understanding [21, 23, 8, 37, 46], their fine-grained physical reasoning remains limited. Interaction-heavy benchmarks [11, 15] accentuate these deficits, revealing failures in grounding physical principles like rigid-body dynamics and spatial contact. Fundamentally, these failures arise because models prioritize internal expectations over visual evidence. Existing literature typically interprets this visual neglect through multimodal hallucinations [38, 45, 30]; for instance, NOAH [17] identifies an inductive bias favoring storyline"},{"citing_arxiv_id":"2604.21873","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Grounding Video Reasoning in Physical Signals","primary_cat":"cs.CV","submitted_at":"2026-04-23T17:17:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new benchmark converts video clips into shared grounded event records and tests models across physics, semantic, and control prompts under original, shuffled, ablated, and masked conditions, finding selective robustness and weak spatial performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21510","ref_index":96,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OptiVerse: A Comprehensive Benchmark towards Optimization Problem Solving","primary_cat":"cs.CL","submitted_at":"2026-04-23T10:12:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OptiVerse is a new benchmark spanning neglected optimization domains that shows LLMs suffer sharp accuracy drops on hard problems due to modeling and logic errors, with a Dual-View Auditor Agent proposed to improve performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20183","ref_index":80,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Dual-Cluster Memory Agent: Resolving Multi-Paradigm Ambiguity in Optimization Problem Solving","primary_cat":"cs.CL","submitted_at":"2026-04-22T04:55:31+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"from linear constraint expressions or the objective function. 2. Verifying bounds and types to prevent infeasibility …… Modeling Cluster Example Coding Cluster Example Pitfall: 1. Do not proceed without validating solver initialization …… Figure 3: Two examples in our Dual-Cluster Memory. N= 5), we trigger a knowledge update step: K(t+1) =LLM synth  K(t) ∪ N[ j=1 Φnj   (2) Here, LLMsynth is used to abstract generalized pat- terns from the new batch of instance knowledge Φnj and merge them into the generalized knowl- edge K(t). This ensures that K evolves to capture robust, non-redundant insights while retaining spe- cific pitfall warnings, and is not overly influenced by extreme samples, as shown in Figure 3. Bipartite Graph Construction.At the same time, we introduce a bipartite graph G to model the associations between the these decoupled clus- ters. Since each experience node n naturally maps to a pair of clusters (C M i , CC j ), these linkages ag- gregate into a global structure. We formalize this as a bipartite graph G= (V M , VC, E), where the edge weight wij quantifies the co-occurrence fre- quency of modeling logic CM i and coding strategy CC j . The strong edges represent proven pathways, providing critical priors for subsequent usage. 3.3 Memory-Augmented Inference 3.3.1 Dual-Retrieval For a new problemxnew, DCM-Agent leverages the memory to efficiently navigate the solution space by retrieving relevant historical experiences. We first encode the problem into the modeling logic embedding enew and employ two complementary retrieval mechanisms to balance the problem rele- vance with general algorithmic applicability: Instance-Level Retrievalcaptures the granular problem similarity by retrieving specific nodes H closest to enew, thereby identifying relevant experi- ence nodes that share detailed semantic features: H= arg max K {sim(enew,e i)|x i ∈ D}.(3) Cluster-Level Retrievaltargets abstract patterns by comparing enew directly with cluster ce"},{"citing_arxiv_id":"2605.16292","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Evidence of a Cognitive Shift in AI Education: How Students Are Rethinking Human Intelligence?","primary_cat":"cs.CY","submitted_at":"2026-04-14T14:10:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Longitudinal poll data from 471 students in AI courses shows a shift toward preferring human intelligence, reaching 65% in technical courses and 90% in design courses by 2026.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.00799","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Multimodal Language Models Cannot Spot Spatial Inconsistencies","primary_cat":"cs.CV","submitted_at":"2026-04-01T12:06:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multimodal LLMs significantly underperform humans at spotting objects that break 3D consistency in multi-view image pairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.03944","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SCP: Spatial Causal Prediction in Video","primary_cat":"cs.CV","submitted_at":"2026-03-04T11:09:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SCP defines a new benchmark task for predicting spatial causal outcomes beyond direct observation and shows that 23 leading models lag far behind humans on it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.23292","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agentic Physical AI toward a Domain-Specific Foundation Model for Nuclear Reactor Control","primary_cat":"cs.AI","submitted_at":"2025-12-29T08:26:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A compact language model trained on scaled synthetic nuclear reactor control data exhibits variance collapse and emergent concentration on a single actuation strategy driven by physical execution success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.18373","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MASS: Motion-Aware Spatial-Temporal Grounding for Physics Reasoning and Comprehension in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2025-11-23T09:43:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MASS adds spatiotemporal motion signals and 3D grounding to VLMs and releases MASS-Bench, yielding physics-reasoning performance within 2% of Gemini-2.5-Flash after reinforcement fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.10946","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Abstract 3D Perception for Spatial Intelligence in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2025-11-14T04:16:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SandboxVLM enhances VLMs' spatial intelligence by encoding 3D geometry with abstract bounding boxes in a four-stage zero-shot pipeline, yielding an 8.3% improvement on SAT Real benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}