{"total":50,"items":[{"citing_arxiv_id":"2605.23237","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"StereoGenBench: A Synthetic Multi-Camera Benchmark for Stereo Generation under Controlled Baseline Regimes","primary_cat":"cs.CV","submitted_at":"2026-05-22T05:10:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StereoGenBench is a new synthetic benchmark dataset featuring calibrated multi-baseline stereo pairs with dense metric depth, intrinsics, and poses from Unreal Engine renders for controlled evaluation of stereo generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23226","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MASQ: Accelerating Masked Diffusion via Stage-Wise Multi-Precision Quantization","primary_cat":"cs.AR","submitted_at":"2026-05-22T04:37:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MASQ claims up to 16.06x speedup and 4.18x energy gain over A100 for masked diffusion via stage-wise multi-precision quantization and specialized hardware units while preserving quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19969","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Your Neighbors Know: Leveraging Local Neighborhoods for Backdoor Detection in Decentralized Learning","primary_cat":"cs.LG","submitted_at":"2026-05-19T15:17:01+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19060","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LiFT: Lifted Inter-slice Feature Trajectories for 3D Image Generation from 2D Generators","primary_cat":"cs.CV","submitted_at":"2026-05-18T19:30:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiFT factorizes 3D medical volume synthesis into per-slice 2D generation and inter-slice trajectory learning, using a tri-planar drifting loss for unconditional coherence and a z-context mixer for paired translation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18366","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"7DT Insight: Variability in Young Stellar Objects","primary_cat":"astro-ph.SR","submitted_at":"2026-05-18T13:17:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Two-epoch medium-band photometry of 769 YSO candidates in Orion A identifies 110 variables (~14%), with best-fit templates dominated by cold and hot spot models over extinction or gray changes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17588","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MSIQ: Moment-based Scale-Invariant Quality Measure for Single Image Super-Resolution","primary_cat":"cs.CV","submitted_at":"2026-05-17T18:32:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MSIQ is a scale-invariant, model-free quality metric for single image super-resolution using normalized central geometric moments for direct comparison of different-resolution images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17198","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MIRAGE: Robust multi-modal architectures translate fMRI-to-image models from vision to mental imagery","primary_cat":"q-bio.NC","submitted_at":"2026-05-16T23:53:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIRAGE achieves state-of-the-art mental image reconstruction from fMRI on the NSD-Imagery benchmark by using a linear backbone with multi-modal text and image features fed to a diffusion model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15895","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Layer Selection in Feature-Based Losses Affects Image Quality and Microstructural Consistency in Deep Learning Super-Resolution of Brain Diffusion MRI","primary_cat":"eess.IV","submitted_at":"2026-05-15T12:23:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Deeper VGG16 layers in feature losses for diffusion MRI super-resolution introduce persistent grid artifacts in images and anisotropy maps, whereas the shallowest layer preserves consistency with ground truth at high upsampling factors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14998","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning Developmental Scaffoldings to Guide Self-Organisation","primary_cat":"cs.AI","submitted_at":"2026-05-14T16:01:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Joint training of NCA rules and SIREN pre-patterns improves robustness, encoding capacity, and symmetry breaking compared to purely self-organizing models by offloading information to initial conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14135","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PanoPlane: Plane-Aware Panoramic Completion for Sparse-View Indoor 3D Gaussian Splatting","primary_cat":"cs.CV","submitted_at":"2026-05-13T21:39:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PanoPlane achieves up to 17.8% PSNR gains in sparse-view indoor novel view synthesis by using training-free plane-aware panoramic completion to supervise 3D Gaussian Splatting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12919","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GuardMarkGS: Unified Ownership Tracing and Edit Deterrence for 3D Gaussian Splatting","primary_cat":"cs.CV","submitted_at":"2026-05-13T02:48:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GuardMarkGS unifies watermarking and adversarial edit deterrence into a single optimization framework for protecting 3D Gaussian Splatting assets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11508","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LiBrA-Net: Lie-Algebraic Bilateral Affine Fields for Real-Time 4K Video Dehazing","primary_cat":"cs.CV","submitted_at":"2026-05-12T04:27:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiBrA-Net achieves real-time native 4K video dehazing via Lie-algebraic bilateral affine fields and releases the first 4K paired dehazing video benchmark with per-frame annotations.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"conditions. 5 Experiments 5.1 Experimental Setup Datasets and evaluation metrics.We evaluate on three paired video dehazing benchmarks. REV- IDE [50] is a real-world indoor dataset with 42 training and 6 testing videos in its official split. From the synthetic HazeWorld corpus [43] we retain three outdoor subsets-DA VIS [ 28], DDAD [12], and UA-DETRAC [39]-totaling 1,016 training and 584 testing videos. UHV-4K, introduced in §4, follows its standard split. Restoration quality is measured by PSNR [8], SSIM [38], and LPIPS [49]. Efficiency is reported as parameter count, GFLOPs, and FPS. Temporal consistency is evaluated with tOF [17] and within-videoσ-PSNR; all tOF scores use the RAFT [33] estimator."},{"citing_arxiv_id":"2605.11203","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FeatMap: Understanding image manipulation in the feature space and its implications for feature space geometry","primary_cat":"cs.LG","submitted_at":"2026-05-11T20:12:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Linear mappings in feature space can reconstruct a wide range of image manipulations including semantic edits, suggesting that feature representations are approximately linearly organized.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"These two architectures are chosen to represent two prominent and complementary design paradigms, convolutional and transformer- based, allowing us to assess whether findings generalize across architectural designs. Both models are strong, general-purpose backbones that have demonstrated state-of-the-art performance on a wide range of vision tasks [26]. Assessing the semantic retainmentRecent discussions in the field of self-supervised learning [ 27] emphasized that features suitable for reconstruction do not necessarily coincide with features suitable for semantic tasks. We therefore also assess the semantic quality of the mapped features as compared to the original features by means of a finetuned downstream classifier."},{"citing_arxiv_id":"2605.08288","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UMEDA: Unified Multi-modal Efficient Data Fusion for Privacy-Preserving Graph Federated Learning via Spectral-Gated Attention and Diffusion-Based Operator Alignment","primary_cat":"cs.LG","submitted_at":"2026-05-08T08:15:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UMEDA is a new graph federated learning method that uses low-rank spectral filtering and diffusion over a shared integral operator to fuse multi-modal data privately, outperforming baselines on MM-Fi and RELI11D under high heterogeneity and tight privacy budgets.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"but lackspectral filteringinductive biases-they treat all spectral components equally and cannot separate the shared low-rank semantic subspace (human pose) from modality-specific high-frequency artifacts (sensor noise), motivating an explicit spectral gate. Recent graph generative work further underscores the need to handle variable-size/discrete structures (continuous-time discrete-state graph diffusion [53], diffusion-free next-scale generation [7]). Federated learning under non-IID concept drift.Optimization-based FL such as FedProx [ 29] and SCAFFOLD [24] constrains local updates inweight spaceand struggles with the severe concept drift induced by environmental variation in sensing.Generative FLtrains or leverages diffusion models for server-side alignment [50, 16], but standard diffusion operates in fixed-dimensional Euclidean"},{"citing_arxiv_id":"2605.07192","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AsyncEvGS: Asynchronous Event-Assisted Gaussian Splatting for Handheld Motion-Blurred Scenes","primary_cat":"cs.CV","submitted_at":"2026-05-08T03:39:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AsyncEvGS reconstructs high-fidelity 3D scenes from motion-blurred images by first deblurring via event data then using VGGT-based pose estimation and structure-driven losses inside Gaussian Splatting.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"dence mapWfrom the cross-scale consistency ofS, where structurally consistent regions receive high confidence and textureless areas are down-weighted. Second, 8 J. Dai et al. we must account for small pose inaccuracies from our estimator, which can cause minor view shifts. Therefore, we compute our loss using the Structural Similarity (SSIM) index [42], which is inherently robust to small translations and focuses on structural correctness rather than unstable pixel-wise alignment. Our final event structure loss is defined as a weighted SSIM, computed as the expectation over all pixelsp: Lstruct = 1−E p h W(p)·SSIM S(Iren),S(Ievs) \u0001 (p) i , (2) whereI ren denotestheGaussian-renderedimage,I evs istheE2VID-reconstructed"},{"citing_arxiv_id":"2605.01543","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Physics-Guided Deep Learning For High Resolution X-ray Imaging","primary_cat":"eess.SP","submitted_at":"2026-05-02T17:15:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Physics-guided U-Net removes non-stationary artifacts from X-ray images, raising mean SSIM from 0.345 to 0.906 and 0.0679 to 0.945 in synthetic tests while preserving filament profiles better than Fourier filtering or DFFN.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01296","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SIFT-VTON: Geometric Correspondence Supervision on Cross-Attention for Virtual Try-On","primary_cat":"cs.CV","submitted_at":"2026-05-02T07:13:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIFT-VTON adds explicit geometric supervision from SIFT keypoints to diffusion-based virtual try-on to improve spatial alignment and detail preservation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00510","ref_index":94,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scale-Aware Adversarial Analysis: A Diagnostic for Generative AI in Multiscale Complex Systems","primary_cat":"cs.LG","submitted_at":"2026-05-01T08:36:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new scale-aware diagnostic framework shows that unconstrained diffusion generative models exhibit structural freezing and instability instead of smooth physical responses under multiscale perturbations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00367","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Flow matching for Sentinel-2 super-resolution: implementation, application, and implications","primary_cat":"cs.CV","submitted_at":"2026-05-01T03:07:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Flow matching achieves single-step pixel accuracy and 20-step perceptual quality for Sentinel-2 super-resolution, outperforming diffusion and Real-ESRGAN while enabling large-scale 2.5 m land-cover products.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21801","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SyMTRS: Benchmark Multi-Task Synthetic Dataset for Depth, Domain Adaptation and Super-Resolution in Aerial Imagery","primary_cat":"cs.CV","submitted_at":"2026-04-23T15:59:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new large-scale synthetic multi-task benchmark dataset supplying pixel-perfect depth, domain-shifted night imagery, and multi-scale low-resolution pairs for aerial remote sensing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19512","ref_index":20,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Defining Robust Ultrasound Quality Metrics via an Ultrasound Foundation Model","primary_cat":"eess.IV","submitted_at":"2026-04-21T14:28:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes TinyUSFM-uLPIPS and TinyUSFM-NRQ metrics that show better alignment with segmentation task performance and expert preference than PSNR or VGG-LPIPS in ultrasound imaging.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19159","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MSDS: Deep Structural Similarity with Multiscale Representation","primary_cat":"cs.CV","submitted_at":"2026-04-21T07:13:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MSDS computes DeepSSIM at multiple pyramid scales and fuses the scores with learned weights, producing consistent improvements over single-scale DeepSSIM on IQA benchmarks with negligible extra cost.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"set of learnable global weights is then used to fuse the scale-wise similarity scores, yielding the final quality prediction. The loss function employs a standard MSE loss combined with a ranking loss. II. RELATEDWORK Early full-reference IQA methods rely on pixel-wise error estimation. Although computationally simple, these metrics correlate poorly with human subjective perception [3]. The Structural Similarity Index (SSIM) [4] proposed by Wang et al. assesses perceptual quality by modeling local image statistics, and its multi-scale extension, MS-SSIM [2], demonstrates that aggregating information across spatial scales improves alignment with human perception. Subsequently, methods [5]- [10] such as FSIM and GMSD further incorporate hand- crafted features and pre-defined fusion strategies."},{"citing_arxiv_id":"2604.18781","ref_index":288,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CAHAL: Clinically Applicable resolution enHAncement for Low-resolution MRI scans","primary_cat":"cs.CV","submitted_at":"2026-04-20T19:45:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAHAL introduces a physics-informed mixture-of-experts super-resolution network for clinical MRI that conditions on resolution and anisotropy and uses edge-penalised, Fourier, and segmentation-guided losses to reduce hallucinations compared with prior generative methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17390","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MESA: A Training-Free Multi-Exemplar Deep Framework for Restoring Ancient Inscription Textures","primary_cat":"cs.CV","submitted_at":"2026-04-19T11:38:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MESA restores ancient inscription textures via multi-exemplar style transfer from VGG19 features with per-layer exemplar selection and OCR-derived weights, without any model training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17208","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CDSA-Net:Collaborative Decoupling of Vascular Structure and Background for High-Fidelity Coronary Digital Subtraction Angiography","primary_cat":"cs.CV","submitted_at":"2026-04-19T02:35:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CDSA-Net decouples vascular structure extraction and background restoration in coronary DSA via hierarchical geometric priors and adaptive noise modeling to eliminate artifacts while preserving tissue fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16955","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Training-inference input alignment outweighs framework choice in longitudinal retinal image prediction","primary_cat":"cs.CV","submitted_at":"2026-04-18T10:28:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Training-inference input alignment outweighs framework choice for longitudinal retinal image prediction, with deterministic regression matching complex models when acquisition variability dominates disease progression.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"map SSIM (ΔSSIM), lesion-level structure via Dice and 95th percentile Hausdorff distance (HD95), and pixel-level fidelity via mean absolute error (MAE), peak signal-to-noise ratio (PSNR), and structural similarity index (SSIM). Each view captures a different aspect of prediction quality; we report metrics on cohorts where each is well-defined. Progression Dynamics We calculate the change-map SSIM [30] (𝛥SSIM) as the primary progression metric. A structural metric that quantifies whether a model captures the spatial pattern of temporal change. For each eye, we compute two difference images: 𝛿gt = 𝐼∗ − 𝐼𝑁 (ground-truth change) and 𝛿pred = 𝐼̂ − 𝐼𝑁 (predicted change), where 𝐼∗ is the ground-truth target, 𝐼̂ is the prediction, and 𝐼𝑁 is the most recent history frame, all in a normalized [0,1] intensity space."},{"citing_arxiv_id":"2604.14561","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CoCoDiff: Optimizing Collective Communications for Distributed Diffusion Transformer Inference Under Ulysses Sequence Parallelism","primary_cat":"cs.DC","submitted_at":"2026-04-16T02:43:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoCoDiff achieves 3.6x average and 8.4x peak speedup for distributed DiT inference on up to 96 GPU tiles via tile-aware all-to-all, V-first scheduling, and selective V communication.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"AsyncDiff [39] parallelizes denoising by running consecutive steps on different GPUs asynchronously. xDiT [31] provides a hybrid engine combining Ulysses, Ring, PipeFusion, and CFG parallelism. More recently, ScaleFu- sion [40] targets video DiTs by scheduling intra-layer and inter-layer communication to hide cross-machine overhead, achieving 3.6×strong scaling on 32 A100s. StreamFusion [41] designs a topology-aware serving engine using NVSHMEM to unify Torus, Ulysses, and Ring communication on NVIDIA GPUs. The above systems optimize parallelism strategies but treat the all-to-all as a black box. CoCoDiff is complementary: it restructures and reduces the all-to-all itself by exploiting QKV asymmetry and temporal redundancy, and is the first"},{"citing_arxiv_id":"2604.13863","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PostureObjectstitch: Anomaly Image Generation Considering Assembly Relationships in Industrial Scenarios","primary_cat":"cs.CV","submitted_at":"2026-04-15T13:29:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PostureObjectStitch generates assembly-aware anomaly images by decoupling multi-view features into high-frequency, texture and RGB components, modulating them temporally in a diffusion model, and applying conditional loss plus geometric priors to preserve correct component relationships.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"assessment: from error visibility to structural similarity.IEEE Transactions on Image Processing13, 4 (2004), 600-612. doi:10.1109/TIP.2003.819861 [41] Zhonghao Wang, Wei Wei, Yang Zhao, Zhisheng Xiao, Mark Hasegawa-Johnson, Humphrey Shi, and Tingbo Hou. 2023. HiFi Tuner: High-Fidelity Subject-Driven Fine-Tuning for Diffusion Models. arXiv:2312.00079 [cs.CV] https://arxiv.org/ abs/2312.00079 [42] Felix Wimbauer, Bichen Wu, Edgar Schoenfeld, Xiaoliang Dai, Ji Hou, Zijian He, Artsiom Sanakoyeu, Peizhao Zhang, Sam Tsai, Jonas Kohler, Christian Rupprecht, Daniel Cremers, Peter Vajda, and Jialiang Wang. 2024. Cache Me if You Can: Accelerating Diffusion Models through Block Caching. arXiv:2312.03209 [cs.CV] https://arxiv.org/abs/2312.03209 [43] Chendong Xiang, Fan Bao, Chongxuan Li, Hang Su, and Jun Zhu."},{"citing_arxiv_id":"2604.10789","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReplicateAnyScene: Zero-Shot Video-to-3D Composition via Textual-Visual-Spatial Alignment","primary_cat":"cs.CV","submitted_at":"2026-04-12T19:42:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReplicateAnyScene performs fully automated zero-shot video-to-compositional-3D reconstruction by cascading alignments of generic priors from vision foundation models across textual, visual, and spatial dimensions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"4: Qualitative comparison on the C3DR benchmark.We present represen- tative qualitative visualizations of the final reconstructed scenes alongside results from MetaScenes [70] and SimRecon [62]. T able 1: Quantitative comparisons on the C3DR benchmark.We evaluate against MetaScenes [70] and SimRecon [62] across textual completeness (Rec, F1), visual quality (PSNR, SSIM [57], LPIPS [73], MUSIQ [24]), and geometric accuracy (CD, F, NC). Our method outperforms all baselines across every evaluation dimension. Method Textual Visual Geometric Rec↑F1↑ PSNR↑SSIM↑LPIPS↓MUSIQ↑ CD↓F↑NC↑ MetaScenes 64.29 75.10 11.63 0.746 0.243 68.18 0.643 3.95 41.04 SimRecon - 80.88 15.77 0.828 0.206 69.49 0.778 6.03 46.32 Ours 91.43 90.86 19.45 0."},{"citing_arxiv_id":"2604.09999","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GIF: A Conditional Multimodal Generative Framework for IR Drop Imaging in Chip Layouts","primary_cat":"cs.CV","submitted_at":"2026-04-11T03:00:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GIF fuses geometrical image features and logical graph topology in a conditional diffusion model to generate high-quality IR drop images for chip layouts, outperforming prior ML methods on CircuitNet-N28 with SSIM 0.78, Pearson 0.95, PSNR 21.77, and NMAE 0.026.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09304","ref_index":3,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GeRM: A Generative Rendering Model From Physically Realistic to Photorealistic","primary_cat":"cs.CV","submitted_at":"2026-04-10T13:13:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GeRM learns a distribution transfer vector field via a multi-condition ControlNet to convert physically-based renders into photorealistic images using text prompts and a 50K expert-curated dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09233","ref_index":70,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A GPU-enhanced workflow for non-Fourier SENSE reconstruction","primary_cat":"eess.IV","submitted_at":"2026-04-10T11:41:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A public GPU workflow for non-Fourier SENSE MRI reconstruction with sensitivity and off-resonance mapping enables fast, accurate imaging from challenging spiral trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08781","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PSIRNet: Deep Learning-based Free-breathing Rapid Acquisition Late Enhancement Imaging","primary_cat":"eess.IV","submitted_at":"2026-04-09T21:31:48+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PSIRNet produces diagnostic-quality free-breathing PSIR LGE cardiac MRI from a single interleaved IR/PD acquisition over two heartbeats using a physics-guided deep learning network trained on over 800,000 slices.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08405","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SyncBreaker:Stage-Aware Multimodal Adversarial Attacks on Audio-Driven Talking Head Generation","primary_cat":"cs.CV","submitted_at":"2026-04-09T16:03:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SyncBreaker jointly attacks image and audio streams with Multi-Interval Sampling and Cross-Attention Fooling to degrade speech-driven talking head generation more than single-modality baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"liance on explicit 3D modeling. Hallo [50] improves generation qual- ity and stability through hierarchical audio-driven visual synthesis, and Hallo2 [6] further extends this line to long-duration and high- resolution scenarios. VASA-1 [51] emphasizes high naturalness and real-time performance. Loopy [20] focuses on modeling long-term motion dependencies. LetsTalk [53] employs a latent diffusion trans- former to model audio-conditioned video generation, while Fanta- syTalking [45] improves motion realism through a two-stage audio- visual alignment strategy and coherent motion synthesis. Sonic [19] emphasizes global audio perception and motion control, while Con- sistTalk [29] focuses on temporal consistency in diffusion-based"},{"citing_arxiv_id":"2604.07959","ref_index":35,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Seeing enough: non-reference perceptual resolution selection for power-efficient client-side rendering","primary_cat":"cs.GR","submitted_at":"2026-04-09T08:22:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A neural network trained on full-reference perceptual quality labels predicts minimal sufficient resolution for rendered video to enable power-efficient client-side rendering.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"signed to capture a wide spectrum of spatial and temporal distortions, such as aliasing, ghosting, and upscaling artifacts, all labeled with high-fidelity JOD scores. 2 Related work 2.1 Perceptual metrics for video and graphics Full-reference(FR)metrics.Accuratequantificationofperceivedvisualqual- ity remains a central goal in both imaging and graphics. Classical full-reference (FR) metrics such as PSNR and SSIM [35] measure signal fidelity but correlate poorly with human judgment. Perceptual models instead incorporate spatio- temporal characteristics of the human visual system (HVS), including contrast sensitivity, masking, and temporal integration. Examples include VDP [23], HDR-VDP-3 [22], FLIP [2], and the recent ColorVideoVDP [21], which extend these principles to dynamic, color, and HDR video."},{"citing_arxiv_id":"2604.05727","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Single-Stage Signal Attenuation Diffusion Model for Low-Light Image Enhancement and Denoising","primary_cat":"cs.CV","submitted_at":"2026-04-07T11:33:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SADM adds a signal attenuation coefficient to the diffusion forward process so that reverse denoising simultaneously recovers brightness and suppresses noise without extra stages or correction modules.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02787","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LumaFlux: Lifting 8-Bit Worlds to HDR Reality with Physically-Guided Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-04-03T06:54:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LumaFlux is a physically and perceptually guided diffusion transformer for SDR-to-HDR conversion that introduces PGA, PCM, and HDR Residual Coupler modules plus a new training corpus and benchmark, outperforming prior ITM methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.15608","ref_index":69,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Benchmarking quantum simulation with neutron-scattering experiments","primary_cat":"quant-ph","submitted_at":"2026-03-16T17:56:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A 50-qubit quantum processor produces dynamical structure factors for KCuF3 that quantitatively match neutron-scattering measurements of its spinon spectrum.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.17205","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Deeper detection limits in astronomical imaging using self-supervised spatiotemporal denoising","primary_cat":"astro-ph.IM","submitted_at":"2026-02-19T09:51:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ASTERIS, a self-supervised spatiotemporal denoising algorithm, improves astronomical detection limits by 1 magnitude at 90% completeness while identifying three times more redshift >9 galaxy candidates in JWST images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.13294","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VisPhyWorld: Probing Physical Reasoning via Code-Driven Video Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-02-09T05:46:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VisPhyWorld evaluates MLLMs' physical reasoning via executable code generation for video reconstruction, with VisPhyBench showing strong semantics but weak parameter inference and dynamics simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.15905","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SNIC: Synthesized Noisy Images using Calibration","primary_cat":"eess.IV","submitted_at":"2025-12-17T19:19:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A sensor-specific calibration pipeline using dark frames produces synthesized noisy RAW images that close 54-64% of the PSNR gap to real noise versus manufacturer profiles, accompanied by the open SNIC dataset of over 6600 paired images.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.05342","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Delta Rectified Flow Sampling for Text-to-Image Editing","primary_cat":"cs.CV","submitted_at":"2025-09-01T21:51:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DRFS is a new inversion-free editing technique for rectified flow models that models source-target velocity discrepancies and applies a time-dependent shift to improve fidelity and unify prior methods like DDS and FlowEdit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.05463","ref_index":59,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Task complexity shapes internal representations and robustness in neural networks","primary_cat":"cs.LG","submitted_at":"2025-08-07T15:02:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Harder classification tasks produce neural representations whose accuracy collapses under binarization and shuffling while easier tasks remain robust, defining task complexity via the performance gap between full-precision and perturbed networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.03478","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PhotIQA: A photoacoustic image data set with image quality ratings","primary_cat":"eess.IV","submitted_at":"2025-07-04T11:06:54+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PhotIQA is a new public dataset of 1134 expert-rated photoacoustic images for benchmarking image quality assessment in medical imaging.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.17726","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Slot-MLLM: Object-Centric Visual Tokenization for Multimodal LLM","primary_cat":"cs.CV","submitted_at":"2025-05-23T10:43:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Slot-MLLM introduces a slot-attention-based object-centric visual tokenizer with Q-Former encoder, diffusion decoder, and residual vector quantization for improved local visual comprehension and generation in multimodal LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.13713","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SLAM&Render: A Benchmark for the Intersection Between Neural Rendering, Gaussian Splatting and SLAM","primary_cat":"cs.RO","submitted_at":"2025-04-18T14:28:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Presents SLAM&Render, a robot-recorded benchmark dataset with 40 multi-modal sequences for testing SLAM, novel view synthesis, and Gaussian Splatting under controlled variations in lighting, arrangements, and occlusions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.04983","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DINO-WM: World Models on Pre-trained Visual Features enable Zero-shot Planning","primary_cat":"cs.RO","submitted_at":"2024-11-07T18:54:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DINO-WM builds world models on pre-trained DINOv2 features to enable zero-shot planning from offline data without rewards or demonstrations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.12632","ref_index":82,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Cyclic 2.5D Perceptual Loss for Cross-Modal 3D Medical Image Synthesis: T1w MRI to Tau PET","primary_cat":"eess.IV","submitted_at":"2024-06-18T13:59:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Proposes a cyclic 2.5D perceptual loss with manufacturer SUVR standardization for T1w MRI to tau PET synthesis, reporting improved regional agreement on ADNI and SCAN cohorts across U-Net, UNETR, SwinUNETR, CycleGAN, and Pix2Pix.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2312.17090","ref_index":280,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Q-Align: Teaching LMMs for Visual Scoring via Discrete Text-Defined Levels","primary_cat":"cs.CV","submitted_at":"2023-12-28T16:10:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Q-Align trains LMMs on discrete text-defined levels for visual scoring, achieving SOTA on IQA, IAA, and VQA while unifying the tasks in OneAlign.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1907.06291","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Measuring the Transferability of Adversarial Examples","primary_cat":"cs.LG","submitted_at":"2019-07-14T22:20:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Empirical measurement of adversarial example transferability between VGG and Inception model classes with methodological refinements to attack strength selection, perturbation clipping, and evaluation via SSIM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}