{"total":14,"items":[{"citing_arxiv_id":"2606.09150","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Ultra Flash: Scaling Real-Time Streaming Video Generation to High Resolutions","primary_cat":"cs.CV","submitted_at":"2026-06-08T07:45:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Ultra Flash introduces a cascaded streaming super-resolution framework with specialized training, upsampling, and optimization to enable real-time high-resolution video generation from low-res diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03746","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Qwen-Image-Flash: Beyond Objective Design","primary_cat":"cs.CV","submitted_at":"2026-06-02T15:00:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Empirical analysis of data, guidance, and task mixture in few-step distillation of Qwen-Image-2.0 produces the Qwen-Image-Flash model with improved performance in unified generation and editing tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25659","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"StreamChar: Long-Horizon Streaming Character Audio-Video Generation with Decoupled Orchestration","primary_cat":"cs.CV","submitted_at":"2026-05-25T10:04:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StreamChar decouples LLM-based orchestration from DiT denoising to achieve real-time long-horizon streaming character audio-video generation with reduced drift and misalignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25378","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CollectionLoRA: Collecting 50 Effects in 1 LoRA via Multi-Teacher On-Policy Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-25T03:07:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-teacher distillation framework that packs 50 effect LoRAs and fast sampling into a single adapter while aiming to avoid concept interference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25347","ref_index":13,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ERNIE-Image Technical Report","primary_cat":"cs.CV","submitted_at":"2026-05-25T02:04:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper presents ERNIE-Image, an open-source 8B DiT text-to-image model claiming leading open-source performance and near-commercial results via specialized data construction and DPO alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21573","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lens: Rethinking Training Efficiency for Foundational Text-to-Image Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T17:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Lens is a 3.8B-parameter text-to-image model that reaches competitive or superior performance to >6B-parameter systems using 19.3% of the training compute of Z-Image through a densely captioned 800M dataset, multi-resolution batching, semantic VAE, strong language encoder, RL fine-tuning, and 4-step","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11596","ref_index":13,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HorizonDrive: Self-Corrective Autoregressive World Model for Long-horizon Driving Simulation","primary_cat":"cs.CV","submitted_at":"2026-05-12T06:22:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HorizonDrive is a new anti-drifting autoregressive training and distillation method that enables minute-scale stable driving video rollouts by making the teacher model rollout-capable via scheduled rollout recovery and teacher rollout DMD.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10730","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Qwen-Image-2.0 Technical Report","primary_cat":"cs.CV","submitted_at":"2026-05-11T15:34:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Qwen-Image-2.0 unifies high-fidelity image generation and precise editing by coupling Qwen3-VL with a Multimodal Diffusion Transformer, improving text rendering, photorealism, and complex prompt following over prior versions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06376","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Continuous-Time Distribution Matching for Few-Step Diffusion Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-07T14:56:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"CDM migrates distribution matching distillation to continuous time via dynamic random-length schedules and active off-trajectory latent alignment, yielding competitive few-step image fidelity on SD3 and Longcat-Image.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05204","ref_index":49,"ref_count":3,"confidence":0.9,"is_internal_anchor":false,"paper_title":"D-OPSD: On-Policy Self-Distillation for Continuously Tuning Step-Distilled Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-06T17:59:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"D-OPSD formulates supervised fine-tuning of step-distilled diffusion models as on-policy self-distillation by having the model act as both teacher (with multimodal context) and student (with text-only context) on its own roll-outs.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"sampling process typically requires numerous iterative denoising steps [88, 32, 48], lead- ing to substantial latency and computational cost in practice. To address this, researchers have developed various step-distillation techniques [56, 112, 58, 111, 9] that substantially reduce the number of function evaluations (NFEs). Furthermore, recent advances in distil- lation methodology [49, 38, 111, 12, 59] have enabled state-of-the-art open-source few-step diffusion models to surpass their multi-step predecessors not only in sampling efficiency but also in generated image quality. As a result, such few-step models are increasingly adopted in practical production settings. Despite these advances, how to continually finetune these models remains unclear."},{"citing_arxiv_id":"2604.19009","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Guiding Distribution Matching Distillation with Gradient-Based Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-21T02:57:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GDMD replaces raw-sample rewards with distillation-gradient rewards in RL-guided diffusion distillation, yielding 4-step models that surpass their multi-step teachers on GenEval and human preference metrics.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"an update direction that better aligns with the prior distribution of teachers [57]. 4 Linwei Dong et al. Fig.2: Samples from 4-NFE student model distilled through our methods. GDMD showcases outstanding image generation, delivering ultra-realistic visuals and profound concept understanding. The prompts used are provided in the Appendix. Since then, numerous follow-up works [2,17,26,45,57] have emerged, for exam- ple, DMD2 [57] integrates the strengths of both GANs and distribution matching approaches, leading to state-of-the-art performance even surpassing that of the teacher. DMDR [17] unifies DMD and reinforcement learning within a single framework, demonstrating that DMD-RL style joint training can also transcend the constraints of teacher models."},{"citing_arxiv_id":"2604.04018","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"1.x-Distill: Breaking the Diversity, Quality, and Efficiency Barrier in Distribution Matching Distillation","primary_cat":"cs.CV","submitted_at":"2026-04-05T08:30:35+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"1.x-Distill achieves better quality and diversity than prior few-step distillation methods at 1.67 and 1.74 effective NFEs on SD3 models with up to 33x speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.13669","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EchoTorrent: Towards Swift, Sustained, and Streaming Multi-Modal Video Generation","primary_cat":"cs.CV","submitted_at":"2026-02-14T08:32:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"EchoTorrent combines multi-teacher distillation, adaptive CFG calibration, hybrid long-tail forcing, and VAE decoder refinement to enable few-pass autoregressive streaming video generation with improved temporal consistency and audio-lip sync.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.03139","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Diversity-Preserved Distribution Matching Distillation for Fast Visual Synthesis","primary_cat":"cs.CV","submitted_at":"2026-02-03T05:45:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DP-DMD preserves sample diversity in few-step image synthesis by applying a teacher-derived target-prediction objective to the first distillation step and standard DMD loss to the rest.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}