{"total":239,"items":[{"citing_arxiv_id":"2606.01481","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SafeGen-Bench: Benchmarking Safety in Image-Conditioned Text-to-Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-31T22:46:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SafeGen-Bench is a benchmark with 10 malicious categories that evaluates conditional T2V models on paired start frames and text prompts, finding unsafety scores up to 44.5 and 80% guardrail failure rate.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00793","ref_index":88,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MBench: A Comprehensive Benchmark on Memory Capability for Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-30T16:17:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MBench is a new benchmark that quantifies long-term memory in video world models via three hierarchical consistency dimensions evaluated on curated real videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00299","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Real2SAM2Real: Generative 3D Caches as Complementary Context for Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-29T19:28:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Real2SAM2Real uses 3D caches from lifting models as complementary context for video diffusion models to enable precise decoupled control over camera trajectories and multi-entity motions while maintaining spatiotemporal consistency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31595","ref_index":105,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Global Motion with Compact Gaussians for Feed-Forward 4D Reconstruction","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:57:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"C4G introduces compact timestamp-conditioned Gaussian query tokens that aggregate full temporal context to decode 3D Gaussians with timestamp-modulated positions for feed-forward 4D reconstruction from monocular video, plus a diffusion-based rendering module and extension to 4D feature fields.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31590","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TunerDiT: Training-free Progressive Steering of Diffusion Transformer for Multi-Event Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:56:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TunerDiT adds event-partitioned masking and cross-event prompt fusion to diffusion transformers for training-free multi-event video generation, with gains scaling by event count on a new Meve benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31336","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DecMem: Towards Minute-Long Consistent World Generation with Decoupled Memory","primary_cat":"cs.CV","submitted_at":"2026-05-29T14:17:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DecMem proposes a decoupled memory system using sparse global and anchored local components to enable consistent minute-long controllable video generation in world models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30774","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CameraNoise: Enabling Faithful Camera Control in Video Diffusion through Geometry-Flow-Guided Noise Warping","primary_cat":"cs.CV","submitted_at":"2026-05-29T03:02:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CameraNoise embeds camera motion into the noise space of video diffusion via Geometry-guided Reprojection Flow and noise warping to achieve faithful trajectory control while preserving the diffusion prior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30519","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniMem: Scalable and Adaptive Memory Retrieval for Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T19:56:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniMem enables scalable long video generation via adaptive sparse KV retrieval that addresses local bias and union explosion while preserving explicit historical access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30431","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DTG-Restore: Training-Free Diffusion Refinement for Generative Video Super-Resolution","primary_cat":"cs.CV","submitted_at":"2026-05-28T18:00:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Presents Decoupled Time Guidance (DTG) for training-free generative video super-resolution by temporally decoupling conditional and unconditional diffusion signals.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30346","ref_index":124,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"YoCausal: How Far is Video Generation from World Model? A Causality Perspective","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:59:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"YoCausal benchmark shows video diffusion models detect the arrow of time but lack genuine causal understanding relative to humans.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30268","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PhyGenHOI: Physically-Aware 4D Generation of Dynamic Human-Object Interactions","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:29:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PhyGenHOI couples a motion diffusion model for humans with material point method simulation for objects on 3D Gaussians, using attraction loss, contact re-simulation, and masked video-SDS to produce physically consistent dynamic interactions from text.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30263","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"minWM: A Full-Stack Open-Source Framework for Real-Time Interactive Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:27:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"minWM supplies an end-to-end pipeline that fine-tunes bidirectional T2V/TI2V models with camera control then distills them via Causal Forcing into few-step autoregressive generators for low-latency rollout.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30045","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GenEraser: Generalizable Video Object Removal via Balanced Text-Mask Guidance and Decoupled Locator-Preserver","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:58:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GenEraser proposes MC-MoE with bipartite text guidance, LD-CFG fusion, and a decoupled locator-preserver architecture for generalizable video object and effect removal, claiming 2.16 dB and 1.44 dB gains on ROSE and VOR-Eval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28811","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HarmoVid: Relightful Video Portrait Harmonization","primary_cat":"cs.CV","submitted_at":"2026-05-27T17:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HarmoVid trains a video diffusion model on deflickered paired data from real and synthetic videos using asymmetric alpha mask conditioning to produce temporally coherent relightful portrait harmonization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28230","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Proprio: Latent Self-Scoring and Inference-Time Refinement for Physically Plausible Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-27T09:44:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proprio uses flow residuals from latent perturbations in frozen video generators as a self-scoring signal for physical plausibility, yielding reported gains of 16.5% on Physics-IQ and 20.6% on VideoPhy2-hard.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27891","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SmartDirector: Keyframe-Conditioned Cinematic Video Generation with Narrative Pacing Control","primary_cat":"cs.CV","submitted_at":"2026-05-27T03:16:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SmartDirector generates cinematic videos via Director-Gen for low-res keyframe-conditioned output followed by Director-SR refinement using high-res keyframes, trained on curated movie sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23903","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Geo-Align: Video Generation Alignment via Metric Geometry Reward","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:59:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Geo-Align applies RL with a perceptual reward derived from 3D camera trajectory estimation to improve controllability and fidelity in video generation without paired training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23891","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Smart-Insertion-V: Photorealistic Video Insertion via a Closed-Loop Feedback Dual-Stream Framework","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:54:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Smart-Insertion-V is a dual-stream closed-loop framework with Dual-World-View RoPE and a Decoupled Guidance Module that inserts reference objects into videos while achieving stylistic harmony despite domain gaps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23878","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LaMo: Self-Supervised Latent Motion Priors for Physical Realism in Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:34:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LaMo adds self-supervised latent motion priors via a motion drift loss during training and motion prior guidance during sampling to boost physical fidelity in video diffusion models like CogVideoX.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23699","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CRONOS: Benchmarking Counterfactual Physical Consistency in Video Models","primary_cat":"cs.CV","submitted_at":"2026-05-22T14:51:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CRONOS benchmark shows recent open-source video generators fail to preserve physical consistency under controlled changes to viewpoint, scene, object category, and appearance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23508","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DrawVideo: Generating Long Video from Storyboard Keyframe Sketches","primary_cat":"cs.GR","submitted_at":"2026-05-22T11:16:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DrawVideo is a sketch-guided framework that decomposes long videos into controllable shots using keyframe sketches, appearance prompts, and motion prompts, supported by a new SketchLongVideo dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23345","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCOPE: Simulating Cross-game Operations in Playable Environments for FPS World Models","primary_cat":"cs.CV","submitted_at":"2026-05-22T08:06:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCOPE adds per-pixel action conditioning to pretrained video diffusion models and releases the CrossFPS multi-game dataset to support cross-game FPS world model simulation with zero-shot transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23245","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SimInsert: Seamless Video Object Insertion via Regional Sparse Attention Fusion","primary_cat":"cs.CV","submitted_at":"2026-05-22T05:28:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SimInsert is a training-free video object insertion technique that decouples the task into single-frame editing and semantic motion description, using image-to-video diffusion models with non-invasive guidance to achieve spatio-temporal coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22996","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoMoGen: COntrollable MOtion Dynamics and Interactions with Mask-Guided Video GENeration","primary_cat":"cs.CV","submitted_at":"2026-05-21T19:51:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CoMoGen generates controllable interactive video from mask sequences and images by encoding masks into MMDiT via MaskAdapter and LoRA on motion layers, claiming SOTA motion fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22144","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One Sentence, One Drama: Personalized Short-Form Drama Generation via Multi-Agent Systems","primary_cat":"cs.CV","submitted_at":"2026-05-21T08:15:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A hierarchical multi-agent framework converts a single sentence into a short drama using debate-based scripting, 3D-grounded first frames for spatial consistency, and multi-stage reviewer loops.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22051","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EasyVFX: Frequency-Driven Decoupling for Resource-Efficient VFX Generation","primary_cat":"cs.CV","submitted_at":"2026-05-21T06:38:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EasyVFX decouples VFX generation via frequency-aware Mixture-of-Experts and test-time training to achieve realistic effects with limited resources.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22015","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ORBIS: Output-Guided Token Reduction with Distribution-Aware Matching for Video Diffusion Acceleration","primary_cat":"cs.CV","submitted_at":"2026-05-21T05:23:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ORBIS uses output-guided token reduction and DATM to achieve 2x higher token reduction than AsymRnR, with up to 4.5x speedup and 79.3% energy savings versus A100 GPU for video DiT models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22882","ref_index":65,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GEM-4D: Geometry-Enhanced Video World Models for Robot Manipulation","primary_cat":"cs.CV","submitted_at":"2026-05-20T21:36:44+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21489","ref_index":96,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Variance Reduction for Expectations with Diffusion Teachers","primary_cat":"cs.LG","submitted_at":"2026-05-20T17:59:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CARV amortizes upstream diffusion teacher costs over noise resamples with timestep importance sampling and stratified-inverse-CDF sampling, delivering 2-3x effective compute gains in text-to-3D experiments and order-of-magnitude variance cuts in single-step distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21431","ref_index":94,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"iTryOn: Mastering Interactive Video Virtual Try-On with Spatial-Semantic Guidance","primary_cat":"cs.CV","submitted_at":"2026-05-20T17:23:32+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20961","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Preserve, Reveal, Expand: Faithful 4D Video Editing with Region-Aware Conditioning","primary_cat":"cs.CV","submitted_at":"2026-05-20T09:47:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PREX decomposes target 4D video volumes into Preserve, Reveal, and Expand roles with a region-aware adapter on a frozen diffusion backbone, trained via proxy tasks, and introduces the PREBench benchmark to reduce region-structured editing failures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20708","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Cross-Layer Information Routing in Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-20T05:07:15+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19728","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Aero-World: Action-Conditioned Aerial Video Generation from Inertial Controls","primary_cat":"cs.CV","submitted_at":"2026-05-19T12:02:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Aero-World adapts a pretrained latent diffusion transformer for action-conditioned aerial video generation by injecting inertial action tokens and using a frozen latent-space Physics Probe for inertial consistency supervision during LoRA finetuning, with a new AeroBench benchmark showing improved AA","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19242","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PhyWorld: Physics-Faithful World Model for Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-19T01:28:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PhyWorld improves temporal consistency and physical plausibility in video world models via flow matching fine-tuning followed by DPO on physics preference pairs, with reported gains on VBench and a custom physical-faithfulness benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18678","ref_index":141,"ref_count":4,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Lance: Unified Multimodal Modeling by Multi-Task Synergy","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:18:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lance presents a dual-stream mixture-of-experts model with modality-aware positional encoding and staged multi-task training that outperforms prior open-source unified models on image and video generation while keeping strong understanding performance.","context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"More recent video-focused frameworks, including Omni-Video [99], UniVideo [119], and TV2TV [36], move closer to genuinely unified video models by jointly addressing video understanding, generation, editing, or interleaved language-video modeling under a more integrated architecture. Meanwhile, several task-unified video editing frameworks, such as AnyV2V [52], VACE [47], UNIC [141], EditVerse [48], and FullDiT [49], expand the controllability of video generation, but typically do not aim for full understanding-generation unification within a single multimodal model. Overall, multi-task synergy for image-video unified multimodal modeling remains to be further explored. 3 Methodology The core idea of Lance is that broad multi-task learning can further unlock the potential of unified multimodal"},{"citing_arxiv_id":"2605.18365","ref_index":86,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoFlow: Enforcing Implicit Geometric Consistency in Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T13:17:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoFlow adds a geometry-consistency reward based on rigid camera flow and object appearance preservation, integrated via reinforcement fine-tuning to improve geometric coherence in video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18346","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Focused Forcing: Content-Aware Per-Frame KV Selection for Efficient Autoregressive Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-18T12:58:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Focused Forcing is a training-free per-frame KV selection method that combines attention scores with diversity metrics and head-importance estimation to accelerate autoregressive video diffusion up to 1.48x while improving quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17912","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WorldArena 2.0: Extending Embodied World Model Benchmarking on Modality, Functionality and Platform","primary_cat":"cs.RO","submitted_at":"2026-05-18T06:18:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"WorldArena 2.0 extends embodied world model benchmarks to visuotactile perception, interactive policy training, and diverse real and simulated robotic platforms under a unified protocol.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17837","ref_index":166,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Temporal Aware Pruning for Efficient Diffusion-based Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T04:18:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TAPE applies temporal-aware token pruning with smoothing, reselection, and timestep scheduling to speed up video diffusion models while preserving visual fidelity and coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17184","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Substantial, Decomposable, and Invisible: Visual Context Misalignment in Instructional Videos for Physical Tasks","primary_cat":"cs.HC","submitted_at":"2026-05-16T22:42:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fully aligned instructional videos for physical tasks yield 11.1% better completion quality and 15.5% faster times, with four decomposable visual attributes whose isolated misalignments degrade performance without users noticing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17019","ref_index":70,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StreamingEffect: Real-Time Human-Centric Video Effect Generation","primary_cat":"cs.CV","submitted_at":"2026-05-16T14:45:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StreamingEffect enables real-time 720p human-centric video effect generation on one GPU via teacher-student distillation, keyframe control, and a new 130K video dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16713","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoWorld-VLM: Geometry from World Models for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-15T23:52:11+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16003","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Echo-Forcing: A Scene Memory Framework for Interactive Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-15T14:33:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Echo-Forcing decouples stable anchors, compressed history, and recent dynamics in video diffusion KV caches using hierarchical memory, scene recall frames, and difference-aware decay to support interactive long video generation under bounded cache.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15980","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Flash-GRPO: Efficient Alignment for Video Diffusion via One-Step Policy Optimization","primary_cat":"cs.CV","submitted_at":"2026-05-15T14:13:39+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15964","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WorldVLN: Autoregressive World Action Model for Aerial Vision-Language Navigation","primary_cat":"cs.RO","submitted_at":"2026-05-15T13:55:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WorldVLN proposes the first autoregressive world action model for aerial vision-language navigation that predicts short-horizon latent world states, decodes them to waypoints in closed loop, and uses two-stage training with Action-aware GRPO to achieve over 12% success-rate gains on benchmarks plus零","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15824","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization","primary_cat":"cs.CV","submitted_at":"2026-05-15T10:25:06+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15199","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EntityBench: Towards Entity-Consistent Long-Range Multi-Shot Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:59:55+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EntityBench is a new benchmark with detailed per-shot entity schedules from real media, and the EntityMem baseline using persistent per-entity memory achieves the highest character fidelity with Cohen's d of +2.33.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15182","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Warp-as-History: Generalizable Camera-Controlled Video Generation from One Training Video","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Warp-as-History enables zero-shot camera trajectory following in frozen video models by supplying camera-warped pseudo-history, with single-video LoRA fine-tuning improving generalization to unseen videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15178","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SANA-WM is a 2.6B-parameter efficient world model that synthesizes minute-scale 720p videos with 6-DoF camera control, trained on 213K public clips in 15 days on 64 H100s and runnable on single GPUs at 36x higher throughput than prior open baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15256","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReactiveGWM: Steering NPC in Reactive Game World Models","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:52:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReactiveGWM introduces a decoupled diffusion architecture for player-NPC interactions that learns game-agnostic response logic for zero-shot strategy transfer across games.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}