{"total":21,"items":[{"citing_arxiv_id":"2606.11689","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RankVR: Low-Rank Structure Perception and Value Recalibration for Robust Composed Image Retrieval","primary_cat":"cs.CV","submitted_at":"2026-06-10T06:06:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"RankVR introduces GSCP and ASVC modules to improve CIR robustness by decoupling clean samples via low-rank structure and dynamically scoring triplet value in noisy datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11257","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Energy-Efficient On-Device RAG on a Mobile NPU: System Design and Benchmark on Snapdragon X Elite","primary_cat":"cs.CL","submitted_at":"2026-06-09T01:09:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"First end-to-end RAG on mobile NPU delivers 18.1x faster prefilling, 4x lower latency and energy than CPU on Snapdragon X Elite with equivalent quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08144","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"IMAGINE: Adaptive Schema-Imagery Enhanced Composition for Composed Video Retrieval","primary_cat":"cs.CV","submitted_at":"2026-06-06T12:46:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"IMAGINE uses adaptive schema-imagery via dynamic multimodal prototypes to incorporate implicit semantics into composed video retrieval, claiming SOTA results on CVR and CIR benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09901","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"On the Controllability-Fidelity Frontier in Diffusion Editing","primary_cat":"cs.GR","submitted_at":"2026-06-05T13:24:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A study deriving mathematical formulations and bounds for diffusion editing objectives while empirically comparing methods on fidelity and control metrics and discussing ethical issues.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04604","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"COMBINER: Composed Image Retrieval Guided by Attribute-based Neighbor Relations","primary_cat":"cs.CV","submitted_at":"2026-06-03T08:43:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"COMBINER proposes a new architecture for composed image retrieval using adaptive semantic disentanglement, unified prototype-based composition, and dual attribute-based relation modeling to address visually similar but attribute-unrelated samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01113","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"R^3: Composed Video Retrieval via Reasoning-Guided Recalling and Re-ranking","primary_cat":"cs.CV","submitted_at":"2026-05-31T09:20:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"R^3 is a zero-shot pipeline that generates reasoning traces to augment composed video queries, fuses scores via agreement-gated residual, and re-ranks candidates for the CoVR-R challenge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00305","ref_index":88,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bridging Reasoning Trajectories in On-Policy Distillation via Near-Future Guidance","primary_cat":"cs.CL","submitted_at":"2026-05-29T19:32:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TOPD improves on-policy distillation for LLM reasoning by using near-future guidance to identify divergent states, raising average accuracy from 47.8% to 52.2% on math benchmarks including AIME24 and AIME25.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24500","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EgoAdapt: A Multi-Scene Egocentric Adaptation Method for CVPR 2026 HD-EPIC VQA Challenge","primary_cat":"cs.CV","submitted_at":"2026-05-23T10:16:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"EgoAdapt improves VQA on the HD-EPIC egocentric benchmark via category-conditioned routing, calibrated option scoring, and test-time consistency adaptation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24496","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EgoAction: Egocentric Action Composition with Reliability-Aware Temporal Fusion for the EPIC-KITCHENS Action Detection Challenge at CVPR 2026","primary_cat":"cs.CV","submitted_at":"2026-05-23T10:05:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"EgoAction uses decoupled verb-noun temporal detectors on VideoMAE features and Dynamic Weighted Fusion of boundaries based on classification confidences for the EPIC-KITCHENS action detection challenge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24481","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniEgo-R$^2$: A Routed Reasoning Framework for the 1st Cross-Domain EgoCross Challenge at CVPR 2026","primary_cat":"cs.CV","submitted_at":"2026-05-23T09:09:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"OmniEgo-R² is a competition system that combines domain-specific VL models with temporal normalization, capability routing, and answer calibration to reach 66.35-66.77% accuracy on the EgoCross challenge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.24470","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TempRet: Temporal Enhancement and Two-Stage Reranking for CVPR 2026 EPIC-KITCHENS-100 Multi-Instance Retrieval Challenge","primary_cat":"cs.CV","submitted_at":"2026-05-23T08:37:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"TempRet enhances a CLIP dual-encoder with temporal modeling and two-stage reranking to report 67.97% mAP and 82.92% nDCG on the EK-100 MIR benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21190","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Semantic Granularity Navigation in Image Editing","primary_cat":"cs.CV","submitted_at":"2026-05-20T13:53:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"NaviEdit is a training-free inference-time controller that decouples edit progress from model scale traversal in diffusion-based image editing via self-consistency, reporting average gains across editors and backbones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16464","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MHMamba: Multi-Head Mamba for 3D Brain Tumor Segmentation","primary_cat":"cs.CV","submitted_at":"2026-05-15T10:05:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MHMamba combines a U-Net with multi-head Mamba, channel calibration, and adaptive skip fusion to improve 3D brain tumor segmentation accuracy and small-lesion sensitivity on BraTS datasets while retaining linear complexity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09253","ref_index":53,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Cornerstones or Stumbling Blocks? Deciphering the Rock Tokens in On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-10T01:41:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rock Tokens in on-policy distillation persist at high loss, account for up to 18% of outputs, absorb large gradient norms, but add negligible value to reasoning performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20358","ref_index":122,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ConeSep: Cone-based Robust Noise-Unlearning Compositional Network for Composed Image Retrieval","primary_cat":"cs.CV","submitted_at":"2026-04-22T08:59:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ConeSep tackles noisy triplet correspondences in composed image retrieval by introducing geometric fidelity quantization to locate noise, negative boundary learning for semantic opposites, and targeted unlearning via optimal transport, outperforming prior methods on FashionIQ and CIRR.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":", Diagonal Negative Composition, as follows, Fneg =P c(Q-F(Pneg,Φ I(xr),ΦT(xm))). (5) Subsequently, we optimize the construction of theDiag- onal Negative Compositionby focusing on moving it away from both the target feature and the composed feature. Target-oriented Learning.This aims to shapeF neg as the \"opposite\" ofF t. Inspired by reverse matching strategy of Sigmoid Loss [122], we define a binary target matrixT, whereT ij = 1(wheni=j) andT ij =−1(wheni̸=j). The objective is thats(F i neg,F j t)approximates the inverse target−T ij. This strategy ensures thatDiagonal Negative CompositionF neg movesaway fromits corresponding tar- get (i=j) andclose toall non-matching target images (i̸=j). Thus, it becomes the opposite of corresponding"},{"citing_arxiv_id":"2604.19386","ref_index":76,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Air-Know: Arbiter-Calibrated Knowledge-Internalizing Robust Network for Composed Image Retrieval","primary_cat":"cs.CV","submitted_at":"2026-04-21T12:10:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Air-Know decouples MLLM-based external arbitration from proxy learning via knowledge internalization and dual-stream training to overcome noisy triplet correspondence in composed image retrieval.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"and Stephen Gould. Image retrieval on real-life images with pre-trained vision-and-language models. InICCV, pages 2125-2134, 2021. 6, 2 [75] Zixu Li, Zhiheng Fu, Yupeng Hu, Zhiwei Chen, Haokun Wen, and Liqiang Nie. Finecir: Explicit parsing of fine- grained modification semantics for composed image re- trieval.arXiv preprint arXiv:2503.21309, 2025. 3 [76] Hua Wang and Fan Zhang. Computing nodes for plane data points by constructing cubic polynomial with constraints. CAGD, 111:102308, 2024. 3 [77] Zheng Liu, Honglin Lin, Chonghan Qin, Xiaoyang Wang, Xin Gao, Yu Li, Mengzhang Cai, Yun Zhu, Zhanping Zhong, Qizhi Pei, et al. Chartverse: Scaling chart reason- ing via reliable programmatic synthesis from scratch."},{"citing_arxiv_id":"2604.17898","ref_index":114,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReTrack: Evidence-Driven Dual-Stream Directional Anchor Calibration Network for Composed Video Retrieval","primary_cat":"cs.CV","submitted_at":"2026-04-20T07:17:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReTrack calibrates directional bias in composed video features using semantic disentanglement and bidirectional evidence alignment to improve retrieval performance on CVR and CIR tasks.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"34 88.70 95.23 SPRC [105] 49.18 72.43 55.64 73.89 59.35 78.58 51.96 82.12 89.74 97.69 80.6592.31 96.60 LIMN [112] 50.72 74.52 56.08 77.09 60.94 81.85 43.64 75.37 85.42 97.04 69.01 86.22 94.19 LIMN+ [112] 52.11 75.21 57.51 77.92 62.67 82.66 43.33 75.41 85.81 97.21 69.28 86.43 94.26 IUDC [113] 35.22 61.90 41.86 63.52 42.19 69.23 - - - - - - - ENCODER [114] 51.51 76.95 54.86 74.93 62.01 80.88 46.10 77.98 87.16 97.64 76.92 90.41 95.95 CVR Models CoVR [12] 44.55 69.03 48.43 67.42 52.60 74.31 49.69 78.60 86.77 94.31 75.01 88.12 93.16 CoVR Enrich [14] 46.12 69.52 49.61 68.88 53.79 74.74 51.03 - 88.93 97.53 76.51 - 95.76 CoVR-2 [13] 46.53 69.60 51.23 70.64 52.14 73.27 50.43 81.08 88.89 98.05 76.75 90.34 95."},{"citing_arxiv_id":"2604.10297","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FashionMV: Product-Level Composed Image Retrieval with Multi-View Fashion Data","primary_cat":"cs.CV","submitted_at":"2026-04-11T17:26:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"FashionMV introduces product-level multi-view CIR, a 127K-product dataset built via automated LMM pipeline, and a 0.8B ProCIR model that beats larger baselines on three fashion benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.27253","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mitigating Hallucination on Hallucination in RAG via Ensemble Voting","primary_cat":"cs.CL","submitted_at":"2026-03-28T12:07:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VOTE-RAG applies retrieval voting across diverse queries and response voting across independent generations to mitigate hallucination-on-hallucination in RAG, matching or exceeding complex baselines on six benchmarks with a parallelizable design.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.13671","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AgentIAD: Agentic Industrial Anomaly Detection via Adaptive Memory Augmentation","primary_cat":"cs.CV","submitted_at":"2025-12-15T18:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AgentIAD introduces an agentic VLM with Perceptive Zoomer, Web Searcher, and Comparative Retriever tools plus two-stage SFT-then-RL training, achieving 5.92% higher classification accuracy than prior SOTA on the MMAD benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.23668","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Hermes: A Multi-Scale Spatial-Temporal Hypergraph Network for Stock Time Series Forecasting","primary_cat":"cs.LG","submitted_at":"2025-09-28T06:13:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Hermes is a multi-scale spatial-temporal hypergraph network that improves stock forecasting accuracy by capturing inter-industry lead-lag dependencies and fusing information across scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}