{"total":587,"items":[{"citing_arxiv_id":"2606.25437","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LinStereo: Linear-Complexity Global Attention for Multi-Scale Iterative Stereo Matching","primary_cat":"cs.CV","submitted_at":"2026-06-24T05:58:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LinStereo uses Position-Aware Linear Attention, Hierarchical Semantic Cost Volumes, and Depth Prior Initialization to enable global aggregation in iterative stereo matching at linear complexity, showing improved performance on standard and underwater benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19088","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReSiReg: Towards Spatially Consistent Semantics in Language-Conditioned Robotic Tasks","primary_cat":"cs.RO","submitted_at":"2026-06-17T13:58:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReSiReg clusters VLM intermediates into prototypes, derives language descriptors, and reconstructs patches as mixtures to improve spatial consistency in dense language-grounded retrieval for robotics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18333","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Polarisation and Faraday rotation measure imaging at metre wavelengths with sub-arcsecond resolution: a foundational calibration strategy","primary_cat":"astro-ph.IM","submitted_at":"2026-06-16T18:00:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A calibration strategy using full-Jones corrections with an in-field unpolarised calibrator and visibility-based multi-epoch alignment enables sub-arcsecond polarimetric imaging with LOFAR at metre wavelengths.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12949","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ViPER: Vision-based Packing-Aware Encoder for Robust Malware Detection","primary_cat":"cs.CR","submitted_at":"2026-06-11T06:21:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ViPER uses a LoRA-adapted ViT-B/14 with dual heads for malware classification and packing detection plus a gating mechanism and weighted losses to reach 0.8521 balanced accuracy on 200k Windows PE images while detecting packing at 0.9949 AUC.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08002","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Aqua Boundary-Saliency Attention Module for Lightweight Underwater Salient Instance Segmentation Detection Transformer","primary_cat":"cs.CV","submitted_at":"2026-06-06T06:43:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LUSIS-DETR with AquaBSAM reports leading performance on four underwater instance segmentation datasets and real-time FP16 inference on an NVIDIA T4 GPU.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01079","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Chameleon: Style-Content Disentangled Framework for Cross-Domain Object Compositing","primary_cat":"cs.CV","submitted_at":"2026-05-31T07:54:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chameleon proposes the first large-scale cross-domain compositing dataset and a disentangled encoder plus gated diffusion transformer that outperforms prior in-domain and cross-domain methods on plausibility and fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00784","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DINO-GFSA: Geo-Localization via Semantic Gated Fusion and Mamba-based Sequential Aggregation","primary_cat":"cs.CV","submitted_at":"2026-05-30T16:03:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"DINO-GFSA combines a LoRA-adapted DINOv3 backbone with semantic gated residual fusion and Mamba-based sequential aggregation to report state-of-the-art results on University-1652 and DenseUAV benchmarks, including a 3.48% Recall@1 gain on DenseUAV.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00635","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Neural Losses Shape VAE Latents","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:20:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Neural reconstruction losses in VAEs reduce latent information content and produce more isotropic latent geometries with even uncertainty distribution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00548","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CAFOSat: A Strongly Annotated Dataset for Infrastructure-Aware CAFO Mapping Using High-Resolution Imagery","primary_cat":"cs.CV","submitted_at":"2026-05-30T05:47:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAFOSat is a new strongly annotated remote-sensing dataset for CAFO mapping that uses human-in-the-loop refinement and curated negatives, with benchmarks on CNNs, transformers, and vision-language models plus a synthetic augmentation pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00522","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Trajectory-Driven Spatio-Temporal Refinement Solution for CVPR 2026 8th UG2+ Challenge Track 3: DOST","primary_cat":"cs.CV","submitted_at":"2026-05-30T04:29:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Authors adapt the SegAnyMo baseline with DAVIS data plus simulated turbulence and a spatio-temporal cleanup module to rank 2nd on the DOST challenge track.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00386","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"{\\alpha}Depth: Learning Single-Pass Soft Boundary Decomposition for Stereo Conversion","primary_cat":"cs.CV","submitted_at":"2026-05-29T22:00:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"αDepth proposes a single-pass layered model with CAR for soft boundary decomposition to improve stereo conversion by estimating layered color and depth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03644","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Spatial Transcriptomics-Guided Alignment Enhances Molecular Profiling in Pathology Foundation Model","primary_cat":"cs.LG","submitted_at":"2026-05-29T16:41:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"STAMP uses a curated 1.8M-pair spatial transcriptomics atlas and pathway-informed alignment to augment pathology foundation models for molecular phenotype inference from H&E WSIs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31513","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Personalize Your Large Vision-language Models With In-context Prompt Tuning","primary_cat":"cs.CV","submitted_at":"2026-05-29T16:31:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ICPT adds an adaptive-length projection module and two geometric regularizations to enable efficient, high-accuracy personalization of LVLMs across complex multi-concept tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31429","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"YARD: Y-Architecture Register Decoding for Efficient Hallucination Mitigation in Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-29T15:23:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"YARD is a training-free method using Y-shaped decoder architecture and register tokens to improve contrastive decoding for hallucination reduction in LVLMs with lower latency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30714","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Vision-Based Localization in Dense Urban Environments: A Case Study of an Urban Village in China","primary_cat":"cs.CV","submitted_at":"2026-05-29T01:10:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Case study develops dual-camera image dataset from an urban village in China and benchmarks existing models for vision-based localization in GPS-poor dense settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30561","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLM3: Vision Language Models Are Native 3D Learners","primary_cat":"cs.CV","submitted_at":"2026-05-28T20:48:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Standard VLMs achieve expert-level 3D performance on depth estimation, pose estimation, and object understanding via three simple techniques without architecture changes or regression losses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30311","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Archon: A Unified Multimodal Model for Holistic Digital Human Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:53:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Archon unifies seven modalities via modality-specific tokenizers and an autoregressive backbone pretrained on 72 tasks, plus a 4x-efficient video reparameterization and stepwise 'Thinking in Modality' procedure, and reports superior or comparable results on digital-human tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30115","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Large Depth Completion Model from Sparse Observations","primary_cat":"cs.CV","submitted_at":"2026-05-28T15:50:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LDCM achieves state-of-the-art metric depth completion from sparse observations by combining foundation-model initialization with a point-map regression head that removes the need for camera intrinsics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30093","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Geometry Matters: 3D Foundation Priors for Learning Semantic Correspondence","primary_cat":"cs.CV","submitted_at":"2026-05-28T15:37:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A 3D-aware framework uses SAM3D geometry and pose estimation plus geodesic filtering to supervise a lightweight adapter on DINO and Stable Diffusion features, improving semantic correspondence with less manual supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30060","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Consistent Video Geometry Estimation","primary_cat":"cs.CV","submitted_at":"2026-05-28T15:11:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ViGeo is a feed-forward transformer for video geometry that introduces dynamic chunking attention and a completion-based data refinement framework to achieve SOTA on depth, normals, and point map estimation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29997","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FRUC: Feedforward Dynamic Scene Reconstruction from Uncalibrated Collaborative Driving Views","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:27:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FRUC enables one-shot calibration-free dynamic scene reconstruction from collaborative driving views via a geometric Transformer, ego-centric occlusion priors, and zero-initialized residual denoising, claiming SOTA quality and speed on V2XReal and UrbanIng-V2X.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29980","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Genetically Aligned Patient Representations Improve Hematological Diagnosis","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:17:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A two-stage training method using self-supervised pretraining on cell images followed by contrastive alignment with genetic data creates improved patient encoders for hematological diagnosis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07590","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SlideCheck: Guiding Self-Supervised Pretraining of Pathology Foundation Models via Dataset Distributions","primary_cat":"cs.CV","submitted_at":"2026-05-28T13:05:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SlideCheck uses a dual-head MLP on frozen features plus MIL attention to score patches and filter pretraining subsets that approach full-data performance in self-supervised pathology ViT models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29827","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fairness Beyond Demographics: Optimizing Performance Across Appearance-Based Hidden Cohorts in Medical Imaging","primary_cat":"cs.CV","submitted_at":"2026-05-28T12:07:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LHCF trains medical image models for fairness by optimizing across latent appearance-based cohorts discovered via clustering, achieving SOTA results on single and multiple demographic attributes without using any demographic labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29691","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unsupervised Semantic Segmentation Facilitates Model Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-28T09:52:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A visualization protocol based on unsupervised semantic segmentation reveals positional biases, scaling behaviors, and boundary artifacts across self-supervised vision transformer models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29505","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ESAM++: Efficient Online 3D Perception on the Edge","primary_cat":"cs.CV","submitted_at":"2026-05-28T07:29:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ESAM++ introduces a 3D Sparse Feature Pyramid Network for efficient online 3D scene perception on edge devices, claiming competitive accuracy with up to 3x faster inference and 2x smaller model size than ESAM on four benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29335","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking FID Through the Geometry of the Reference Dataset","primary_cat":"cs.CV","submitted_at":"2026-05-28T04:10:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FID improves with better samples only on concentrated reference datasets but can worsen on dispersed ones, as shown by density and effective rank in a controlled study across six datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28572","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unsupervised Morphological Characterization of Gravitational-Wave Glitches in LIGO O4a Using Frozen DINOv2 Features","primary_cat":"astro-ph.IM","submitted_at":"2026-05-27T14:57:09+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Unsupervised DINOv2 embedding and DPMM clustering of 188,000+ O4a spectrograms finds all clusters map to known Gravity Spy classes with cosine similarity >0.98, yielding a null result on novel morphologies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23889","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HorizonStream: Long-Horizon Attention for Streaming 3D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:50:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HorizonStream is a long-horizon Transformer that factorizes geometric evidence influence into channel-wise linear attention for long-range temporal propagation and local spatiotemporal attention for short-range matching, claiming stable generalization from 48-frame training to over 10,000-frame test","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23699","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CRONOS: Benchmarking Counterfactual Physical Consistency in Video Models","primary_cat":"cs.CV","submitted_at":"2026-05-22T14:51:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CRONOS benchmark shows recent open-source video generators fail to preserve physical consistency under controlled changes to viewpoint, scene, object category, and appearance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23271","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvalVerse: Pipeline-Aware and Expert-Calibrated Benchmarking for Professional Cinematic Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T06:22:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EvalVerse is a pipeline-aware benchmark that distills expert cinematic judgments into VLMs to assess 'goodness' metrics like aesthetics and multi-shot coherence alongside basic prompt adherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23198","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Label-Efficient Dataset Pruning via Semi-Supervised Pseudo-Labeling","primary_cat":"cs.LG","submitted_at":"2026-05-22T03:29:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SemiPrune uses a small labeled subset and semi-supervised pseudo-labeling to enable supervised dataset pruning methods, achieving state-of-the-art results on domain-specific, image-corrupted, and long-tailed datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23098","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UfM*: Uncertainty from Motion* for DNN Depth Estimation Using Gaussians","primary_cat":"cs.RO","submitted_at":"2026-05-21T23:08:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UfM* uses Gaussian mixtures to compute multiview disagreement for uncertainty in depth estimation with single inference per image, reducing energy and memory use.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23033","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Uncovering the Latent Potential of Deep Intermediate Representations","primary_cat":"cs.LG","submitted_at":"2026-05-21T20:58:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces LOES, a constructive spectral method to select task-discriminative subspaces from intermediate layer embeddings, and GeoReg for enforcing simplicial class geometry during fine-tuning, with reported gains increasing with model depth across modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22819","ref_index":68,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cambrian-P: Pose-Grounded Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-21T17:59:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Cambrian-P adds per-frame camera pose tokens and a regression head to video MLLMs, delivering 4.5-6.5% gains on spatial benchmarks, generalization to other video QA tasks, and SOTA streaming pose estimation on ScanNet.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Streaming methods that process frames incrementally include StreamVGGT [ 130], CUT3R [98], Point3R [103], Spann3R [95], and G2VLM [37]. 5.2. Results As shown in Tab. 5, Cambrian-Pachieves the minimal ATE on ScanNet [26] among streaming camera pose estimation models and delivers competitive performance on TUM [84], and Sintel [14], without relying on specialized designs like DINOv2 encoder [68] or bidirectional transformer [96, 98]. This highlights that standard MLLMs can predict accurate camera pose with only an additional pose head and two learnable pose queries. In addition, benefiting from the compact representation of the SigLIP encoder [ 93], the lower FLOPs of the causal transformer, and the optimized inference infrastructure of the LLM ecosystem,"},{"citing_arxiv_id":"2605.22607","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Gaze Reasoning in Vision Foundation Models for Gaze Following","primary_cat":"cs.CV","submitted_at":"2026-05-21T15:21:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A method combining head-conditioned local LoRA adaptation and out-of-cone penalty improves gaze reasoning in vision foundation models, yielding state-of-the-art results on GazeFollow and VAT datasets especially for non-salient targets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22467","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SADGE: Structure and Appearance Domain Gap Estimation of Synthetic and Real Data","primary_cat":"cs.CV","submitted_at":"2026-05-21T13:27:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SADGE is a new fused similarity metric combining DINOv3 appearance and MASt3R geometry via constrained bilinear interaction that correlates with downstream synthetic-to-real performance at Pearson r=0.88 across multiple benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22190","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"No Pose, No Problem in 4D: Feed-Forward Dynamic Gaussians from Unposed Multi-View Videos","primary_cat":"cs.CV","submitted_at":"2026-05-21T08:57:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NoPo4D is the first feed-forward system for dynamic 4D Gaussian splatting from unposed multi-view videos, using velocity decomposition supervised by optical flow and a bidirectional motion encoder.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22139","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EventGait: Towards Robust Gait Recognition with Event Streams","primary_cat":"cs.CV","submitted_at":"2026-05-21T08:12:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EventGait is a dual-stream spiking and cross-modal framework for event-based gait recognition that matches or exceeds RGB methods in normal conditions and significantly outperforms them in low light, supported by new synthetic event gait benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21906","ref_index":10,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Universal CT Representations from Anatomy to Disease Phenotype through Agglomerative Pretraining","primary_cat":"cs.CV","submitted_at":"2026-05-21T02:28:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FlexiCT provides CT foundation models via agglomerative pretraining on 266227 volumes from 56 datasets that match or exceed task-specific models on five task families while organizing embeddings along tumor-stage gradients.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21869","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Two-Stage Multimodal Framework for Emotion Mimicry Intensity Prediction","primary_cat":"cs.CV","submitted_at":"2026-05-21T01:29:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A staged multimodal fusion model for predicting six continuous emotion intensities from in-the-wild video achieves 0.4722 validation and 0.57 test Pearson correlation in the EMI challenge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21800","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"stable-worldmodel: A Platform for Reproducible World Modeling Research and Evaluation","primary_cat":"cs.LG","submitted_at":"2026-05-20T22:58:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper presents stable-worldmodel (swm), a platform with high-performance data layer, modern world model baselines, planning solvers, and extended environments for reproducible research and generalization evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21417","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ordering Matters: Rank-Aware Selective Fusion for Blended Emotion Recognition","primary_cat":"cs.CV","submitted_at":"2026-05-20T17:12:55+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21324","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stimulus symmetries can confound representational similarity analyses","primary_cat":"q-bio.NC","submitted_at":"2026-05-20T15:51:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Stimulus symmetries render many neural representations functionally equivalent yet produce qualitatively different RSMs, including drifting ones from SGD or regularization in image-encoding networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21258","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Structural Latent Points for Efficient Visual Representations in Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-20T14:48:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A hybrid structural latent points representation is learned by inserting a point-wise latent VAE into a point-cloud autoencoder and regularizing toward a Gaussian prior, paired with a lightweight 3DGS rendering pipeline, yielding gains on RLBench and ManiSkill2 benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21241","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Divide and Contrast: Learning Robust Temporal Features without Augmentation","primary_cat":"cs.LG","submitted_at":"2026-05-20T14:31:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Di-COT is an unsupervised contrastive method that stochastically partitions time-series windows into overlapping sub-blocks to learn representations without augmentation, reporting SOTA results on classification and transfer tasks across multiple benchmarks while cutting training time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21131","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniT: Unified Geometry Learning with Group Autoregressive Transformer","primary_cat":"cs.CV","submitted_at":"2026-05-20T13:04:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniT unifies online and offline 3D geometry perception via a Group Autoregressive Transformer that processes observation groups with anchor-free point map prediction and a scale-adaptive loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20940","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"3D Reconstruction and Knowledge Distillation to Improve Multi-View Image Models to Explore Spike Volume Estimation in Wheat","primary_cat":"cs.CV","submitted_at":"2026-05-20T09:26:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Knowledge distillation from a rigid-invariant 3D point cloud network into a regulated multi-view Transformer yields lower-error, faster wheat spike volume estimates from 2D images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20822","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TERDNet: Transformer Encoder-Recurrent Decoder Network for Scene Change Detection","primary_cat":"cs.CV","submitted_at":"2026-05-20T07:18:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TERDNet introduces a transformer-encoder recurrent-decoder architecture for scene change detection that outperforms prior models on public benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20808","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Spatial Gram Alignment for Ultra-High-Resolution Image Synthesis","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:59:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Spatial Gram Alignment aligns internal self-similarities of LDM features with foundation priors to reconcile global structure and fine details in ultra-high-resolution text-to-image synthesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}