{"total":17,"items":[{"citing_arxiv_id":"2606.29198","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DTI: Dynamic Trajectory Initialization for Generative Face Video Super-Resolution","primary_cat":"cs.CV","submitted_at":"2026-06-28T04:55:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DTI reformulates generative face video super-resolution as directional restoration using enhancement-and-injection conditioning and an SNR-aligned discriminative guide for dynamic sampling initialization, claiming SOTA performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28677","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SATB-VR: Training Few-Step Video Restoration Diffusion Model using SNR-Aware Trajectory Blending","primary_cat":"cs.CV","submitted_at":"2026-06-27T01:32:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SATB-VR trains few-step video restoration diffusion models via SNR-aware trajectory blending of predictor outputs with ground-truth and a denoiser-driven consistency loss to achieve favorable performance on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27891","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SmartDirector: Keyframe-Conditioned Cinematic Video Generation with Narrative Pacing Control","primary_cat":"cs.CV","submitted_at":"2026-05-27T03:16:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SmartDirector generates cinematic videos via Director-Gen for low-res keyframe-conditioned output followed by Director-SR refinement using high-res keyframes, trained on curated movie sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25801","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PixelWizard: Towards Efficient High-Fidelity Video Generation at Ultra-Large Spatial Resolution","primary_cat":"cs.CV","submitted_at":"2026-05-25T12:50:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PixelWizard decouples global structure from fine details via a spatiotemporal anchor and introduces Noise-Span Aligned Shortcut Training with biased sampling to achieve over 10x faster sampling for high-fidelity 2K/4K video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16649","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AtlasVid: Efficient Ultra-High-Resolution Long Video Generation via Decoupled Global-Local Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-15T21:39:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AtlasVid proposes a decoupled global-local diffusion framework that trains at low resolution with LoRA and generalizes to ultra-high-resolution long video synthesis via semantic proxy guidance and locality-preserving attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15824","ref_index":30,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization","primary_cat":"cs.CV","submitted_at":"2026-05-15T10:25:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FashionChameleon achieves interactive multi-garment video customization at 23.8 FPS via in-context teacher models, streaming distillation, and training-free KV cache rescheduling while using only single-garment data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13182","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DiffST: Spatiotemporal-Aware Diffusion for Real-World Space-Time Video Super-Resolution","primary_cat":"cs.CV","submitted_at":"2026-05-13T08:41:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiffST delivers state-of-the-art real-world space-time video super-resolution with 17x faster inference than prior diffusion methods by using one-step sampling, cross-frame context aggregation, and video representation guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23508","ref_index":81,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BurstGP: Enhancing Raw Burst Image Super Resolution with Generative Priors","primary_cat":"cs.CV","submitted_at":"2026-04-26T03:06:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BurstGP enhances raw burst image super-resolution by integrating pretrained video diffusion priors through a multiframe-aware model, degradation-aware conditioning, and color-space conversion, outperforming prior methods on perceptual metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18047","ref_index":175,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GS-STVSR: Ultra-Efficient Continuous Spatio-Temporal Video Super-Resolution via 2D Gaussian Splatting","primary_cat":"cs.CV","submitted_at":"2026-04-20T10:11:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GS-STVSR achieves state-of-the-art continuous spatio-temporal video super-resolution quality with nearly constant inference time at standard scales and over 3x speedup at extreme scales using 2D Gaussian Splatting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15911","ref_index":204,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Efficient Video Diffusion Models: Advancements and Challenges","primary_cat":"cs.CV","submitted_at":"2026-04-17T10:11:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A survey that groups efficient video diffusion methods into four paradigms—step distillation, efficient attention, model compression, and cache/trajectory optimization—and outlines open challenges for practical use.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14560","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DVFace: Spatio-Temporal Dual-Prior Diffusion for Video Face Restoration","primary_cat":"cs.CV","submitted_at":"2026-04-16T02:42:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DVFace uses a spatio-temporal dual-codebook and asymmetric fusion in a one-step diffusion model to deliver better video face restoration quality, temporal consistency, and identity preservation than recent methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10578","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rein3D: Reinforced 3D Indoor Scene Generation with Panoramic Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-12T10:55:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rein3D generates photorealistic, globally consistent 3D indoor scenes by using a restore-and-refine process where radial panoramic videos are restored via diffusion models and then used to update a 3D Gaussian field.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"W(ϕ) =λ+ (1−λ) cos(ϕ), (9) whereλis a hyper-parameter. This formulation lowers the over-sampled polar regions to match their actual geometric importance, while the constantλensures that the model still maintains valid gradients to learn the structure at the poles. 4.4 3D Scene Refinement To enhance visual quality of videos, we first employ FlashVSR [67] to upsample the video sequences, effectively recovering high-frequency details. These high- resolution panoramic frames are then projected into perspective views to serve as pseudo-ground truths. Finally, we fine-tune the 3D Gaussian Splatting against these views. During this optimization, we incorporate robust densification strate- gies and anti-aliasing techniques to effectively suppress geometric artifacts and"},{"citing_arxiv_id":"2604.10551","ref_index":82,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NTIRE 2026 Challenge on Short-form UGC Video Restoration in the Wild with Generative Models: Datasets, Methods and Results","primary_cat":"cs.CV","submitted_at":"2026-04-12T09:43:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The NTIRE 2026 challenge releases the KwaiVIR benchmark for short-form UGC video restoration and reports strong results from 12 teams using generative models on both subjective and objective tracks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06161","ref_index":114,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DiffHDR: Re-Exposing LDR Videos with Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-04-07T17:56:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiffHDR converts LDR videos to HDR by formulating the task as generative radiance inpainting in a video diffusion model's latent space, using Log-Gamma encoding and synthesized training data to achieve better fidelity and stability than prior methods.","context_count":1,"top_context_role":"other","top_context_polarity":"use_dataset","context_text":"Table 3:Quantitative comparison on in-the-wild and Veo2 video datasets. In-the-wild Video Dataset Veo2 Video Dataset Method DOVER↑MUSIQ↑CLIPIQA↑DOVER↑MUSIQ↑CLIPIQA↑ SingleHDR 0.71 53.21 0.46 0.59 43.78 0.30 LEDiff 0.61 53.68 0.42 0.53 41.41 0.29 Ours 0.74 55.79 0.48 0.61 46.06 0.34 DOVER[91],CLIPIQA[87],andMUSIQ[51]asnon-referenceperceptualquality metrics, following FlashVSR [114]. These metrics effectively evaluate the spatial and temporal quality of the reconstructed HDR. We compare DiffHDR against state-of-the-art LDR-to-HDR methods. In ad- dition, we conduct comprehensive ablation studies to validate the effectiveness of each proposed component and demonstrate further applications of our frame- work. Additional results are provided in the supplementary material."},{"citing_arxiv_id":"2604.02787","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LumaFlux: Lifting 8-Bit Worlds to HDR Reality with Physically-Guided Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-04-03T06:54:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LumaFlux is a physically and perceptually guided diffusion transformer for SDR-to-HDR conversion that introduces PGA, PCM, and HDR Residual Coupler modules plus a new training corpus and benchmark, outperforming prior ITM methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.20308","ref_index":72,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Taming Real-World Space-Time Video Super-Resolution with One-Step Diffusion","primary_cat":"cs.CV","submitted_at":"2026-01-28T06:59:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OSDEnhancer delivers state-of-the-art real-world space-time video super-resolution via one-step diffusion with temporal coherence and texture enrichment LoRAs plus a deformable recurrent VAE decoder.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.23709","ref_index":108,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Stream-DiffVSR: Low-Latency Streamable Video Super-Resolution via Auto-Regressive Diffusion","primary_cat":"cs.CV","submitted_at":"2025-12-29T18:59:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Stream-DiffVSR enables practical low-latency video super-resolution by combining a four-step distilled denoiser, auto-regressive temporal guidance, and a temporal processor in a strictly causal pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}