{"total":55,"items":[{"citing_arxiv_id":"2605.23699","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CRONOS: Benchmarking Counterfactual Physical Consistency in Video Models","primary_cat":"cs.CV","submitted_at":"2026-05-22T14:51:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CRONOS benchmark shows recent open-source video generators fail to preserve physical consistency under controlled changes to viewpoint, scene, object category, and appearance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23458","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One-Forcing: Towards Stable One-Step Autoregressive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T10:16:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"One-Forcing augments DMD with a GAN loss to enable stable one-step causal autoregressive video generation, reporting a VBench score of 83.76 as SOTA among one-step methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22718","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WorldKV: Efficient World Memory with World Retrieval and Compression","primary_cat":"cs.CV","submitted_at":"2026-05-21T16:55:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WorldKV enables persistent world memory in autoregressive video diffusion models by selectively retrieving and compressing KV-cache chunks, matching full-cache fidelity at roughly twice the throughput without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21072","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Q-ARVD: Quantizing Autoregressive Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T11:58:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Q-ARVD introduces final-quality-aware frame weighting and outlier-aware adaptive dual-scale quantization to enable accurate low-bit inference for autoregressive video diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21028","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DySink: Dynamic Frame Sinks for Autoregressive Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-20T11:01:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DySink uses adaptive retrieval of relevant historical frames plus a sink anomaly gate to improve dynamic degree and temporal quality in minute-long autoregressive video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19957","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"World-Ego Modeling for Long-Horizon Evolution in Hybrid Embodied Tasks","primary_cat":"cs.CV","submitted_at":"2026-05-19T15:10:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes World-Ego Modeling with WEM using CP-MoE diffusion and a new HTEWorld benchmark, claiming SOTA on hybrid navigation-manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19398","ref_index":20,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rebalancing Reference Frame Dominance to Improve Motion in Image-to-Video Models","primary_cat":"cs.CV","submitted_at":"2026-05-19T05:50:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DyMoS rebalances self-attention from generated frames to the reference frame in initial denoising steps of image-to-video models to reduce reference dominance and improve motion without training or fidelity loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18739","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LongLive-2.0: An NVFP4 Parallel Infrastructure for Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:57:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LongLive-2.0 delivers an NVFP4 parallel infrastructure that enables direct training of long multi-shot autoregressive diffusion video models and achieves up to 2.15x training and 1.84x inference speedups on Blackwell and other GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18733","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Advancing Narrative Long Video Generation via Training-Free Identity-Aware Memory","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:54:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IAMFlow is a training-free identity-aware memory system that tracks entities via LLM global ID assignment and VLM frame verification to reduce identity drift in narrative long video generation from shifting prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18346","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Focused Forcing: Content-Aware Per-Frame KV Selection for Efficient Autoregressive Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-18T12:58:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Focused Forcing is a training-free per-frame KV selection method that combines attention scores with diversity metrics and head-importance estimation to accelerate autoregressive video diffusion up to 1.48x while improving quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18233","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Train-Free Infinite-Frame Generation for Consistent Long Videos","primary_cat":"cs.CV","submitted_at":"2026-05-18T11:28:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIGA introduces two-stage alignment to close train-inference gaps and dual consistency enhancement via self-reflection and long-range guidance to achieve SOTA temporal consistency in infinite-frame video generation on VBench and NarrLV.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17248","ref_index":185,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Image-to-Video Diffusion: From Foundations to Open Frontiers","primary_cat":"cs.CV","submitted_at":"2026-05-17T04:10:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey that organizes diffusion image-to-video methods into a taxonomy, distills core designs in condition encoding, temporal modeling, noise prior, and upsampling, and discusses applications plus challenges.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Scalable conditional generation (STIV[46]), Open large-scale generation (Wan[54]), Masked autoregressive (MarDini[48]) Fast autoregressive generation (CausVid[172]), Human animation (DreamActor-M1[168], OmniHuman[85], Hunyuan Video-Avatar[82]) Talking head generation (MagicInfinite[184]), Character animation (RCM[130]), Unified Inpainting (SkyReels-V4[149]) Long video generation (MAGI-1[185], EasyAnimate[102], SANA-Video[41]), Audio-driven talking portrait (FantasyTalking[104]) Open-source foundation generation (Open-Sora Plan[65]), Ultra-efficient generation (Reducio-DiT[53]), Controllable generation (FG[186]) Physically-coherent motion generation (Waver[170]), Boundary-pushing generation (Seedance[40]), Virtual try-on (DreamVVT[147])"},{"citing_arxiv_id":"2605.16649","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AtlasVid: Efficient Ultra-High-Resolution Long Video Generation via Decoupled Global-Local Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-15T21:39:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AtlasVid proposes a decoupled global-local diffusion framework that trains at low resolution with LoRA and generalizes to ultra-high-resolution long video synthesis via semantic proxy guidance and locality-preserving attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16003","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Echo-Forcing: A Scene Memory Framework for Interactive Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-15T14:33:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Echo-Forcing decouples stable anchors, compressed history, and recent dynamics in video diffusion KV caches using hierarchical memory, scene recall frames, and difference-aware decay to support interactive long video generation under bounded cache.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15178","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SANA-WM is a 2.6B-parameter efficient world model that synthesizes minute-scale 720p videos with 6-DoF camera control, trained on 213K public clips in 15 days on 64 H100s and runnable on single GPUs at 36x higher throughput than prior open baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14487","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Head Forcing: Long Autoregressive Video Generation via Head Heterogeneity","primary_cat":"cs.CV","submitted_at":"2026-05-14T07:27:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Head Forcing assigns tailored KV cache strategies to local, anchor, and memory attention heads plus head-wise RoPE re-encoding to extend autoregressive video generation from seconds to minutes without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14382","ref_index":28,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Delta Forcing: Trust Region Steering for Interactive Autoregressive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-14T05:06:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Delta Forcing improves temporal coherence in interactive autoregressive video generation by estimating transition consistency from teacher-generator latent deltas and balancing it against a monotonic continuity objective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13724","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-13T16:06:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"AnyFlow enables any-step video diffusion by distilling flow-map transitions over arbitrary time intervals with on-policy backward simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13111","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pyramid Forcing: Head-Aware Pyramid KV Cache Policy for High-Quality Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-13T07:23:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pyramid Forcing classifies attention heads into Anchor, Wave, and Veil types and applies type-specific KV cache policies to improve long-horizon autoregressive video generation quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09681","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Forcing-KV: Hybrid KV Cache Compression for Efficient Autoregressive Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-10T17:59:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Forcing-KV applies head-specific static and dynamic pruning to KV caches in AR video diffusion models, achieving over 29 fps, 30% memory reduction, and up to 2.82x speedup at maintained quality.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"Chen, Y . Luet al., \"Longlive: Real-time Interactive Long Video Generation,\"ICLR, 2026. [3] T. Yin, Q. Zhang, R. Zhang, W. T. Freeman, F. Durand, E. Shechtman, and X. Huang, \"From Slow Bidirectional to Fast Autoregressive Video Diffusion Models,\" in2025 IEEE/CVF Confer- ence on Computer Vision and Pattern Recognition (CVPR). IEEE, 2025, pp. 22 963-22 974. [4] H. Teng, H. Jia, L. Sun, L. Li, M. Li, M. Tang, S. Han, T. Zhang, W. Zhang, W. Luoet al., \"Magi-1: Autoregressive video generation at scale,\"arXiv preprint arXiv:2505.13211, 2025. [5] G. Chen, D. Lin, J. Yang, C. Lin, J. Zhu, M. Fan, H. Zhang, S. Chen, Z. Chen, C. Maet al., \"Skyreels-v2: Infinite-length film generative model,\"arXiv preprint arXiv:2504."},{"citing_arxiv_id":"2605.07915","ref_index":79,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"What Matters for Diffusion-Friendly Latent Manifold? Prior-Aligned Autoencoders for Latent Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-08T15:52:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prior-Aligned AutoEncoders shape latent manifolds with spatial coherence, local continuity, and global semantics to improve latent diffusion, achieving SOTA gFID 1.03 on ImageNet 256x256 with up to 13x faster convergence.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"PAE's improved fidelity-learnability balance? (Tab. 2(a), Fig. 6(b)(c)) • Q3: Ablation studies.Are the proposed design choices effective, and does PAE remain robust across different encoders and moderate design changes? (Tab. 2(b), Tab. 3, Fig. 8) Implementation Details.We consider multiple frozen representation encoders, including DINOv2- L [56], SigLIP2-SO400M [ 79], DINOv3-L [ 72], and MAE-L [ 29]. Unless otherwise specified, all ablations use DINOv2-L. By default, the latent size is 16×16×32, the Detail-aware Modulator (DAM) uses K=6 blocks, and the tokenizer is trained on ImageNet for 50 epochs with the joint objective in Eq. 2 and Eq. 6. For downstream class-conditional generation, we train LightningDiT-XL"},{"citing_arxiv_id":"2605.06509","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"FreeSpec: Training-Free Long Video Generation via Singular-Spectrum Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-07T16:21:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FreeSpec uses SVD-based spectral reconstruction to fuse global low-rank and local high-rank features, reducing content drift and preserving temporal dynamics in long video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06356","ref_index":1,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SwiftI2V: Efficient High-Resolution Image-to-Video Generation via Conditional Segment-wise Generation","primary_cat":"cs.CV","submitted_at":"2026-05-07T14:34:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SwiftI2V achieves comparable 2K I2V quality to end-to-end models on VBench-I2V while cutting GPU time by 202x through low-resolution motion planning followed by strongly image-conditioned segment-wise high-resolution synthesis.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"it avoids the prohibitive costs of end-to-end models. SwiftI2V achieves competitive quality while reducing GPU-time by over 200× compared to a full-sequence baseline, and lowers the hardware barrier to consumer-grade GPUs. This efficient segment-based paradigm offers a promising direction toward scalable, long-duration, and interactive generative video. 10 References [1] Sand. ai, Hansi Teng, Hongyu Jia, Lei Sun, Lingzhi Li, Maolin Li, Mingqiu Tang, Shuai Han, Tianning Zhang, W. Q. Zhang, Weifeng Luo, Xiaoyang Kang, Yuchen Sun, Yue Cao, Yunpeng Huang, Yutong Lin, Yuxin Fang, Zewei Tao, Zheng Zhang, Zhongshu Wang, Zixun Liu, Dai Shi, Guoli Su, Hanwen Sun, Hong Pan, Jie Wang, Jiexin Sheng, Min Cui, Min Hu, Ming Yan,"},{"citing_arxiv_id":"2605.06051","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RealCam: Real-Time Novel-View Video Generation with Interactive Camera Control","primary_cat":"cs.CV","submitted_at":"2026-05-07T11:36:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RealCam is a causal autoregressive model for real-time camera-controlled video-to-video generation, using cross-frame in-context teacher distillation and loop-closed data augmentation to achieve high fidelity and consistency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04461","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Stream-T1: Test-Time Scaling for Streaming Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-06T03:40:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Stream-T1 is a test-time scaling framework for streaming video generation using scaled noise propagation from history, reward pruning across short and long windows, and feedback-guided memory sinking to improve temporal consistency and visual quality.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"A general framework for inference-time scaling and steering of diffusion models.arXiv preprint arXiv:2501.06848, 2025. [35] Charlie Victor Snell, Jaehoon Lee, Kelvin Xu, and Aviral Kumar. Scaling llm test-time compute optimally can be more effective than scaling parameters for reasoning. InThe Thirteenth International Conference on Learning Representations, 2025. 13 [36] Hansi Teng, Hongyu Jia, Lei Sun, Lingzhi Li, Maolin Li, Mingqiu Tang, Shuai Han, Tianning Zhang, WQ Zhang, Weifeng Luo, et al. Magi-1: Autoregressive video generation at scale.arXiv preprint arXiv:2505.13211, 2025. [37] Ang Wang, Baole Ai, Bin Wen, Chaojie Mao, Chen-Wei Xie, Di Chen, Feiwu Yu, Haiming Zhao, Jianxiao Yang, Jianyuan Zeng, et al. Wan: Open and advanced large-scale video generative models."},{"citing_arxiv_id":"2605.03849","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Stream-R1: Reliability-Perplexity Aware Reward Distillation for Streaming Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-05T15:15:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Stream-R1 improves distillation of autoregressive streaming video diffusion models by adaptively weighting supervision with a reward model at both rollout and per-pixel levels.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"enabling streaming, frame-by-frame synthesis that can in principle extend to arbitrary temporal horizons [3, 5, 7, 12, 19, 40]. Pyramidal-Flow [17] employs multi-scale flow matching to reduce the computational burden of long sequences; SkyReels-V2 [4] integrates diffusion forcing with structural planning for scalable synthesis; FAR [8] combines short- and long-term contexts via flexible positional encoding; and MAGI-1 [30] adopts chunk-wise prediction for scalable autoregressive generation. A complementary line of work accelerates inference through distillation. Distribution matching distillation (DMD) [38] compresses multi-step teacher inference into few-step student generation by minimizing their output distribution divergence. CausVid [20] extends this framework to causal video generation by reformulating bidirectional diffusion as autoregressive"},{"citing_arxiv_id":"2605.02134","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Video Generation with Predictive Latents","primary_cat":"cs.CV","submitted_at":"2026-05-04T01:30:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PV-VAE improves video latent spaces for generation by unifying reconstruction with future-frame prediction, reporting 52% faster convergence and 34.42 FVD gain over Wan2.2 VAE on UCF101.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[41] Luming Tang, Menglin Jia, Qianqian Wang, Cheng Perng Phoo, and Bharath Hariharan. Emergent correspondence from image diffusion.Advancesin Neural Information Processing Systems, 36:1363-1389, 2023. [42] Zachary Teed and Jia Deng. Raft: Recurrent all-pairs field transforms for optical flow. InEuropean conference on computer vision, pages 402-419. Springer, 2020. [43] Hansi Teng, Hongyu Jia, Lei Sun, Lingzhi Li, Maolin Li, Mingqiu Tang, Shuai Han, Tianning Zhang, WQ Zhang, Weifeng Luo, et al. Magi-1: Autoregressive video generation at scale.arXiv preprint arXiv:2505.13211, 2025. [44] Shengbang Tong, Boyang Zheng, Ziteng Wang, Bingda Tang, Nanye Ma, Ellis Brown, Jihan Yang, Rob Fergus, Yann LeCun, and Saining Xie."},{"citing_arxiv_id":"2605.01725","ref_index":33,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Motion-Aware Caching for Efficient Autoregressive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-03T05:49:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MotionCache accelerates autoregressive video generation up to 6.28x by motion-weighted cache reuse based on inter-frame differences, with negligible quality loss on SkyReels-V2 and MAGI-1.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"features are subsequently scattered back to update the residual cacheRcache, while inactive tokens bypass computation by directly retrieving stored residuals for approximation. 6 Experiments 6.1 Experimental Setup Base Models.To evaluate the efficacy of our proposed method, we selected two representative diffusion models based on the autoregressive paradigm: MAGI-1-4.5B-distill [33] and SkyReels-V2-1.3B [6]. For MAGI-1, we generate videos at 720p resolution consisting of 7 chunks, where each chunk contains 24 frames at 24 FPS. For SkyReels-V2, the generation targets a resolution of 540p, producing videos composed of 2 chunks with 97 frames each at 24 FPS. Evaluation Metrics.Following established acceleration protocols such as FlowCache [1] and TeaCache [20], we"},{"citing_arxiv_id":"2604.25819","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Mutual Forcing: Dual-Mode Self-Evolution for Fast Autoregressive Audio-Video Character Generation","primary_cat":"cs.CV","submitted_at":"2026-04-28T16:28:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mutual Forcing trains a single native autoregressive audio-video model with mutually reinforcing few-step and multi-step modes via self-distillation to match 50-step baselines at 4-8 steps.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"IEEE/CVF Conference on Computer Vision and Pattern Recognition, pages 10219-10228, 2023. [38] Yang Song and Prafulla Dhariwal. Improved techniques for training consistency models.arXiv preprint arXiv:2310.14189, 2023. [39] Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. Roformer: enhanced transformer with rotary position embedding. arxiv.arXiv preprint arXiv:2104.09864, 2021. [40] Hansi Teng, Hongyu Jia, Lei Sun, Lingzhi Li, Maolin Li, Mingqiu Tang, Shuai Han, Tianning Zhang, WQ Zhang, Weifeng Luo, et al. Magi-1: Autoregressive video generation at scale.arXiv preprint arXiv:2505.13211, 2025. [41] Andros Tjandra, Yi-Chiao Wu, Baishan Guo, John Hoffman, Brian Ellis, Apoorv Vyas, Bowen Shi, Sanyuan Chen, Matt Le, Nick Zacharov, Carleigh Wood, Ann Lee, and Wei-Ning Hsu."},{"citing_arxiv_id":"2604.21221","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Sparse Forcing: Native Trainable Sparse Attention for Real-time Autoregressive Diffusion Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-23T02:22:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sparse Forcing adds a native trainable sparsity mechanism and PBSA kernel to autoregressive diffusion video models, yielding higher VBench scores and 1.1-1.27x speedups on 5s to 1min generations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18215","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Memorize When Needed: Decoupled Memory Control for Spatially Consistent Long-Horizon Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-20T13:00:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A decoupled memory branch with hybrid cues, cross-attention, and gating improves spatial consistency and data efficiency in long-horizon camera-trajectory video generation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Training-free approaches [32, 51,60] reschedule noise, rebalance temporal frequencies, or introduce sparse at- tention mechanisms to stretch pretrained models without additional learning. Diffusion Forcing [6] and History-Guidance [39] condition each denoising step on previously generated frames with decayed noise levels, scaled up by SkyReels- V2 [7] and Magi-1 [46]. Some works distill bidirectional diffusion models into causal generators [9,22,25,53], aiming to mitigate the error accumulation inher- ent in autoregressive rollouts and theoretically enable infinite-length generation. Alternatively, method in [18,59] augment pretrained models with memory mod- ules and generate long videos iteratively, achieving high visual quality."},{"citing_arxiv_id":"2604.13509","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DiT as Real-Time Rerenderer: Streaming Video Stylization with Autoregressive Diffusion Transformer","primary_cat":"cs.CV","submitted_at":"2026-04-15T05:52:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RTR-DiT distills a bidirectional DiT teacher into an autoregressive few-step model using Self Forcing and Distribution Matching Distillation, plus a reference-preserving KV cache, to enable stable real-time text- and reference-guided video stylization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09527","ref_index":100,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Envisioning the Future, One Step at a Time","primary_cat":"cs.CV","submitted_at":"2026-04-10T17:46:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An autoregressive diffusion model on sparse point trajectories predicts multi-modal future scene dynamics from single images with orders-of-magnitude faster sampling than dense video simulators while matching accuracy.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"leads to a desired outcome, in a fully zero-shot manner. In billiard terms: can it plan a shot? Unlike pure forward prediction compared to one observed future, this setting forces exploration of counterfactual futures - many possible Method Param Throughput (samples/min)↑ (a) OWM (b) PhysicsIQ [69] (c) Physion [11] BEST-5↓BEST-5MIN↓BEST-5↓BEST-5MIN↓BEST-5↓BEST-5MIN↓ MAGI-1 [100] 4.5B 0.303 0.037 0.066 0.126 0.169 0.061 0.081 Wan2.2 [112] 14B 0.141 0.039 DNF 0.116 DNF 0.069 DNF CogVideo-X 1.5 [118] 5B 0.051 0.051 DNF0.100DNF 0.063 DNF SkyReels V2 [24] 1.3B 0.304 0.058 0.068 0.128 0.137 0.069 0.084 SVD 1.1 [16] 1.5B 0.714 0.054 0.119 0.138 0.241 0.070 0.147 Myriad (Ours) 665M2200 0.029 0.0130.115 0.045 0.048 0.020 MyriadTrained on 3→2D Tracks665M 2200 0."},{"citing_arxiv_id":"2604.09415","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PhysInOne: Visual Physics Learning and Reasoning in One Suite","primary_cat":"cs.CV","submitted_at":"2026-04-10T15:27:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"PhysInOne is a new dataset of 2 million videos across 153,810 dynamic 3D scenes covering 71 physical phenomena, shown to improve AI performance on physics-aware video generation, prediction, property estimation, and motion transfer.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"ing of physical principles. This ability also serves as a key enabler for emerging applications in future-aware robot plan- ning and embodied AI [26]. To this end, we showcase this critical task on PhysInOne by training and evaluating a se- ries of recent models, including TiNeuV ox [31], DefGS [90], FreeGave [54], TRACE [53], ExtDM [98], and MAGI-1 [1]. 3) Physical Properties Estimation: Inferring physical properties from visual observations, a.k.a system identifi- Table 1. A comparison between PhysInOne and existing datasets that are relevant to learning physics from dynamic videos. PhysInOne provides massive 3D scenes with complex objects and backgrounds across diverse physical phenomena, surpassing all prior works."},{"citing_arxiv_id":"2604.08995","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Matrix-Game 3.0: Real-Time and Streaming Interactive World Model with Long-Horizon Memory","primary_cat":"cs.CV","submitted_at":"2026-04-10T06:00:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Matrix-Game 3.0 delivers 720p real-time video generation at 40 FPS with minute-scale memory consistency by combining residual self-correction training, camera-aware memory injection, and DMD-based autoregressive distillation on a 5B model.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"However, these approaches are primarily designed for offline generation and lack explicit modeling of actions and interaction. Moreover, their long-horizon consistency typically relies on implicit modeling mechanisms, making it difficult to maintain stable performance in extended sequence generation. In parallel, open-source models (e.g., Wan [41], Magi-1 [40], and LTX-2.3 [13]) aim to advance video generation research by improving openness and scalability. In particular, Wan [41] builds a DiT-based video generation system enhanced by large-scale data and training strategies; Magi-1 [40] adopts a chunk-wise autoregressive diffusion paradigm, decomposing videos into sequential segments to enable scalable long-horizon modeling;"},{"citing_arxiv_id":"2604.06966","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MAR-GRPO: Stabilized GRPO for AR-diffusion Hybrid Image Generation","primary_cat":"cs.CV","submitted_at":"2026-04-08T11:30:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAR-GRPO stabilizes GRPO for AR-diffusion hybrids via multi-trajectory expectation and uncertainty-based token selection, yielding better visual quality, stability, and spatial understanding than baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"2025. Nextstep-1: Toward autoregressive image generation with continuous tokens at scale.arXiv preprint arXiv:2508.10711(2025). [28] Hansi Teng, Hongyu Jia, Lei Sun, Lingzhi Li, Maolin Li, Mingqiu Tang, Shuai Han, Tianning Zhang, WQ Zhang, Weifeng Luo, et al. 2025. MAGI-1: Autoregressive Video Generation at Scale.arXiv preprint arXiv:2505.13211(2025). [29] Keyu Tian, Yi Jiang, Zehuan Yuan, Bingyue Peng, and Liwei Wang. 2024. Visual autoregressive modeling: Scalable image generation via next-scale prediction. Advances in neural information processing systems37 (2024), 84839-84865. Ma et al. [30] Junke Wang, Zhi Tian, Xun Wang, Xinyu Zhang, Weilin Huang, Zuxuan Wu, and Yu-Gang Jiang. 2025. Simplear: Pushing the frontier of autoregressive visual"},{"citing_arxiv_id":"2604.06939","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Grounded Forcing: Bridging Time-Independent Semantics and Proximal Dynamics in Autoregressive Video Synthesis","primary_cat":"cs.CV","submitted_at":"2026-04-08T11:03:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Grounded Forcing introduces dual memory caching, reference-based positional embeddings, and proximity-weighted recaching to bridge stable semantics with local dynamics, improving long-range consistency in autoregressive video synthesis.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"While non-autoregressive video synthe- sis [8,12,19,20,23,26] has achieved remarkable fidelity via Diffusion Transform- ers [18], their reliance on bidirectional attention precludes KV caching, resulting in redundant computation and prohibitive inference latency. Consequently, en- abling real-time streaming necessitates a paradigm shift toward autoregressive architectures [2,3,10,22,30,31]. Such causal frameworks inherently align with temporal progression, facilitating low-latency synthesis and dynamic interac- tion. However, extending generation from short clips to continuous, long-horizon streamsintroducesfundamentalinstabilitiesthatremainunresolved.Specifically, autoregressive video synthesis is hindered by three interconnected challenges:se-"},{"citing_arxiv_id":"2604.04859","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unified Vector Floorplan Generation via Markup Representation","primary_cat":"cs.CV","submitted_at":"2026-04-06T17:04:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A single transformer model using a new markup representation generates functional floorplans from diverse conditions and outperforms prior task-specific methods on the RPLAN dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03118","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Salt: Self-Consistent Distribution Matching with Cache-Aware Training for Fast Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-03T15:43:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Salt improves low-step video generation quality by adding endpoint-consistent regularization to distribution matching distillation and using cache-conditioned feature alignment for autoregressive models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ODE initialization before asymmetric DMD [42], leading to higher-quality real- time interactive generation; Reward Forcing [26] distills bidirectional video diffu- sion into a few step autoregressive student with rewarded distribution matching to improve efficient streaming generation. In addition, long-horizon AR genera- tion emphasize scalable causal designs and long-video interaction mechanisms: MAGI-1 [34] scales chunk-wise autoregressive denoising for strong temporal con- sistency and deployability, LongLive [39] targets real-time interactive long videos with mechanisms for stable prompt transitions and long-range consistency un- der causal decoding. On the other hand, training-free methods, such as Infinity- RoPE [40] modifies temporal encoding and KV-cache behaviors to unlock action-"},{"citing_arxiv_id":"2604.02979","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Not All Frames Deserve Full Computation: Accelerating Autoregressive Video Generation via Selective Computation and Predictive Extrapolation","primary_cat":"cs.CV","submitted_at":"2026-04-03T11:34:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SCOPE accelerates autoregressive video diffusion up to 4.73x by using a tri-modal cache-predict-recompute scheduler with Taylor extrapolation and selective active-frame computation while preserving output quality.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"chronous video generation with auto-regressive diffusion. InProceedings of the Computer Vision and Pattern Recognition Conference. 7364-7373. [47] Hansi Teng, Hongyu Jia, Lei Sun, Lingzhi Li, Maolin Li, Mingqiu Tang, Shuai Han, Tianning Zhang, WQ Zhang, Weifeng Luo, et al. 2025. Magi-1: Autoregressive video generation at scale.arXiv preprint arXiv:2505.13211(2025). [48] Keyu Tian, Yi Jiang, Zehuan Yuan, Bingyue Peng, and Liwei Wang. 2024. Visual autoregressive modeling: Scalable image generation via next-scale prediction. Advances in neural information processing systems37, 84839-84865. [49] Alexander Tong, Kilian Fatras, Nikolay Malkin, Guillaume Huguet, Yanlei Zhang, Jarrid Rector-Brooks, Guy Wolf, and Yoshua Bengio."},{"citing_arxiv_id":"2603.21210","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pretrained Video Models as Differentiable Physics Simulators for Urban Wind Flows","primary_cat":"cs.LG","submitted_at":"2026-03-22T13:08:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WinDiNet repurposes a 2B-parameter video diffusion model as a differentiable surrogate that generates 112-frame urban wind flow rollouts in under one second and enables direct gradient optimization of building positions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.23058","ref_index":73,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoWorld: Geometric World Models","primary_cat":"cs.CV","submitted_at":"2026-02-26T14:42:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoWorld applies hyperbolic geometry to JEPA world models and introduces geometric reinforcement learning, reporting modest success-rate gains of ~3% and ~2% on 3- and 4-step planning tasks versus V-JEPA 2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.15922","ref_index":78,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"World Action Models are Zero-shot Policies","primary_cat":"cs.RO","submitted_at":"2026-02-17T15:04:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DreamZero uses a 14B video diffusion model as a World Action Model to achieve over 2x better zero-shot generalization on real robots than state-of-the-art VLAs, real-time 7Hz closed-loop control, and cross-embodiment transfer with 10-30 minutes of data.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"com/blog/preview-uqlxvb-bb.html. 16 [76] Team Wan. Wan: Open and advanced large-scale video generative models. 2025. 2, 7, 11 [77] Hansi Teng, Hongyu Jia, Lei Sun, Lingzhi Li, Maolin Li, Mingqiu Tang, Shuai Han, Tianning Zhang, WQ Zhang, Weifeng Luo, et al. Magi-1: Autoregressive video generation at scale.arXiv preprint arXiv:2505.13211, 2025. 7, 8 [78] Homer Walke, Kevin Black, Abraham Lee, Moo Jin Kim, Max Du, Chongyi Zheng, Tony Zhao, Philippe Hansen-Estruch, Quan Vuong, Andre He, Vivek Myers, Kuan Fang, Chelsea Finn, and Sergey Levine. Bridgedata v2: A dataset for robot learning at scale. InConference on Robot Learning (CoRL), 2023. 10 [79] John Won, Kyungmin Lee, Huiwon Jang, Dongyoung Kim, and Jinwoo Shin."},{"citing_arxiv_id":"2602.13669","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EchoTorrent: Towards Swift, Sustained, and Streaming Multi-Modal Video Generation","primary_cat":"cs.CV","submitted_at":"2026-02-14T08:32:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"EchoTorrent combines multi-teacher distillation, adaptive CFG calibration, hybrid long-tail forcing, and VAE decoder refinement to enable few-pass autoregressive streaming video generation with improved temporal consistency and audio-lip sync.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.07775","ref_index":86,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rolling Sink: Bridging Limited-Horizon Training and Open-Ended Testing in Autoregressive Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-02-08T02:16:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Rolling Sink is a training-free cache adjustment technique that maintains visual consistency in autoregressive video diffusion models for ultra-long open-ended generation beyond training horizons.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"In contrast, AR models [1,10,75,87-89] can in principle, infinitely predict next- state conditioned on prior ones. To marry the best of both paradigms, a rapidly growing number of AR video diffusion models [11,12,16,18,25-27,37,38,42,48,55, 62,63,66,72,74,77,84,93-95,97,98,101,102,106,107,109,111,114] have emerged. Earlier methods, e.g., NOVA [17], SkyReels-V2 [13], and MAGI-1 [86] still rely on inefficient multi-step denoisingin eachAR generation step. Recently, Pyramid Flow [45] and CausVid [103-105] adopt few-step generation, making AR video generationtemporallyefficient. However, as the cached history grows longer, the demand of computational resources grows dramatically, which significantly con- stricts their generation length."},{"citing_arxiv_id":"2602.04939","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SynthForensics: Benchmarking and Evaluating People-Centric Synthetic Video Deepfakes","primary_cat":"cs.CV","submitted_at":"2026-02-04T16:47:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SynthForensics is a people-centric benchmark where face-based detectors lose 13-55 AUC points on modern synthetic videos compared to legacy manipulation sets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.02214","ref_index":37,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Causal Forcing: Autoregressive Diffusion Distillation Done Right for High-Quality Real-Time Interactive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-02-02T15:19:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Causal Forcing uses an autoregressive teacher for ODE initialization in diffusion distillation to close the causal attention gap and deliver better real-time video generation than Self Forcing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.20540","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Advancing Open-source World Models","primary_cat":"cs.CV","submitted_at":"2026-01-28T12:37:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LingBot-World is presented as an open-source world model that delivers high-fidelity simulation, minute-level contextual consistency, and real-time interactivity under one second latency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.04678","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reward Forcing: Efficient Streaming Video Generation with Rewarded Distribution Matching Distillation","primary_cat":"cs.CV","submitted_at":"2025-12-04T11:12:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reward Forcing combines EMA-Sink tokens and Rewarded Distribution Matching Distillation to deliver state-of-the-art streaming video generation at 23.1 FPS without copying initial frames.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.20714","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Inferix: A Block-Diffusion based Next-Generation Inference Engine for World Simulation","primary_cat":"cs.CV","submitted_at":"2025-11-25T01:45:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Inferix provides an optimized inference engine for semi-autoregressive block-diffusion decoding to support high-quality, variable-length video generation in world simulation applications.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}