{"total":260,"items":[{"citing_arxiv_id":"2606.01027","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"$\\tau_0$-WM: A Unified Video-Action World Model for Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-31T05:35:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A shared video diffusion backbone jointly predicts future latents and continuous actions while also rolling out candidate actions to predict dense task-progress scores, trained on 27,300 hours of mixed robot and human data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00793","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MBench: A Comprehensive Benchmark on Memory Capability for Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-30T16:17:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MBench is a new benchmark that quantifies long-term memory in video world models via three hierarchical consistency dimensions evaluated on curated real videos.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00499","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OptiWorld: Optimal Control for Video World Generation under Physical Constraints","primary_cat":"cs.CV","submitted_at":"2026-05-30T03:13:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OptiWorld inserts a classical optimal-control layer that extracts a world state, plans an optimal trajectory on a geometric manifold under physical constraints, and renders the video conditioned on that trajectory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00299","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Real2SAM2Real: Generative 3D Caches as Complementary Context for Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-29T19:28:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Real2SAM2Real uses 3D caches from lifting models as complementary context for video diffusion models to enable precise decoupled control over camera trajectories and multi-entity motions while maintaining spatiotemporal consistency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31336","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DecMem: Towards Minute-Long Consistent World Generation with Decoupled Memory","primary_cat":"cs.CV","submitted_at":"2026-05-29T14:17:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DecMem proposes a decoupled memory system using sparse global and anchored local components to enable consistent minute-long controllable video generation in world models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30774","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CameraNoise: Enabling Faithful Camera Control in Video Diffusion through Geometry-Flow-Guided Noise Warping","primary_cat":"cs.CV","submitted_at":"2026-05-29T03:02:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CameraNoise embeds camera motion into the noise space of video diffusion via Geometry-guided Reprojection Flow and noise warping to achieve faithful trajectory control while preserving the diffusion prior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30519","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniMem: Scalable and Adaptive Memory Retrieval for Long Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T19:56:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniMem enables scalable long video generation via adaptive sparse KV retrieval that addresses local bias and union explosion while preserving explicit historical access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30346","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"YoCausal: How Far is Video Generation from World Model? A Causality Perspective","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:59:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"YoCausal benchmark shows video diffusion models detect the arrow of time but lack genuine causal understanding relative to humans.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30317","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VPG: Visual Prefix Guidance for Autoregressive Image and Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:55:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VPG is a training-free inference-time guidance technique that improves autoregressive image and video generation by contrasting model outputs under generated versus corrupted prefixes to strengthen next-step support for the prefix.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30263","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"minWM: A Full-Stack Open-Source Framework for Real-Time Interactive Video World Models","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:27:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"minWM supplies an end-to-end pipeline that fine-tunes bidirectional T2V/TI2V models with camera control then distills them via Causal Forcing into few-step autoregressive generators for low-latency rollout.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30083","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Future Forcing: Future-aware Training-free KV Cache Policy for Autoregressive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T15:30:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Future Forcing constructs a future query proxy from historical pre-RoPE statistics to score and merge KV tokens, improving subject consistency by up to 1.49 on VBench-Long for 60s AR video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30045","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GenEraser: Generalizable Video Object Removal via Balanced Text-Mask Guidance and Decoupled Locator-Preserver","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:58:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GenEraser proposes MC-MoE with bipartite text guidance, LD-CFG fusion, and a decoupled locator-preserver architecture for generalizable video object and effect removal, claiming 2.16 dB and 1.44 dB gains on ROSE and VOR-Eval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29509","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KGEdit: Ambiguity-Aware Knowledge Graphs for Training-Free Precise Video Generation and Editing","primary_cat":"cs.CV","submitted_at":"2026-05-28T07:31:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"KGEdit uses an ambiguity-aware knowledge graph and structured injection modules to improve semantic control and temporal consistency in training-free text-to-video diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23903","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Geo-Align: Video Generation Alignment via Metric Geometry Reward","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:59:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Geo-Align applies RL with a perceptual reward derived from 3D camera trajectory estimation to improve controllability and fidelity in video generation without paired training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23878","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LaMo: Self-Supervised Latent Motion Priors for Physical Realism in Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:34:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LaMo adds self-supervised latent motion priors via a motion drift loss during training and motion prior guidance during sampling to boost physical fidelity in video diffusion models like CogVideoX.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23699","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CRONOS: Benchmarking Counterfactual Physical Consistency in Video Models","primary_cat":"cs.CV","submitted_at":"2026-05-22T14:51:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CRONOS benchmark shows recent open-source video generators fail to preserve physical consistency under controlled changes to viewpoint, scene, object category, and appearance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23610","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EM-Vid: Training-Free Entity-Centric Memory for Efficient and Consistent Multi-Shot Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T13:20:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EM-Vid introduces an entity-centric latent patch memory bank with sparse token conditioning and budgeted updates for training-free consistent multi-shot video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23522","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Precise: SDE-Consistent Stochastic Sampling for RL Post-Training of Flow-Matching Models","primary_cat":"cs.LG","submitted_at":"2026-05-22T11:37:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Precise is a new SDE-consistent stochastic sampler that balances exploration and stability for RL post-training of flow-matching models via a novel posterior-mean approximation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23458","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One-Forcing: Towards Stable One-Step Autoregressive Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T10:16:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"One-Forcing augments DMD with a GAN loss to enable stable one-step causal autoregressive video generation, reporting a VBench score of 83.76 as SOTA among one-step methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23445","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DFSAttn: Dynamic Fine-grained Sparse Attention for Efficient Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T09:58:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DFSAttn is a training-free framework for dynamic fine-grained sparse attention in video DiTs that achieves up to 2.1x speedup while preserving generation quality via Hilbert reordering, hierarchical scoring, and adaptive caching.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23381","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VDE: Training-Free Accelerating Rectified Flow Model via Velocity Decomposition and Estimation","primary_cat":"cs.CV","submitted_at":"2026-05-22T08:50:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VDE accelerates rectified flow models like Flux by 3.22x with LPIPS of 0.069 via velocity decomposition into parallel/orthogonal components plus periodic full-pass anchoring.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23345","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCOPE: Simulating Cross-game Operations in Playable Environments for FPS World Models","primary_cat":"cs.CV","submitted_at":"2026-05-22T08:06:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCOPE adds per-pixel action conditioning to pretrained video diffusion models and releases the CrossFPS multi-game dataset to support cross-game FPS world model simulation with zero-shot transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23271","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvalVerse: Pipeline-Aware and Expert-Calibrated Benchmarking for Professional Cinematic Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-22T06:22:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EvalVerse is a pipeline-aware benchmark that distills expert cinematic judgments into VLMs to assess 'goodness' metrics like aesthetics and multi-shot coherence alongside basic prompt adherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22996","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoMoGen: COntrollable MOtion Dynamics and Interactions with Mask-Guided Video GENeration","primary_cat":"cs.CV","submitted_at":"2026-05-21T19:51:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CoMoGen generates controllable interactive video from mask sequences and images by encoding masks into MMDiT via MaskAdapter and LoRA on motion layers, claiming SOTA motion fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22344","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bernini: Latent Semantic Planning for Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-21T11:30:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Bernini is a framework that uses an MLLM planner to output semantic representations for a DiT renderer to generate or edit videos, reporting SOTA benchmark performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22144","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One Sentence, One Drama: Personalized Short-Form Drama Generation via Multi-Agent Systems","primary_cat":"cs.CV","submitted_at":"2026-05-21T08:15:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A hierarchical multi-agent framework converts a single sentence into a short drama using debate-based scripting, 3D-grounded first frames for spatial consistency, and multi-stage reviewer loops.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22051","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EasyVFX: Frequency-Driven Decoupling for Resource-Efficient VFX Generation","primary_cat":"cs.CV","submitted_at":"2026-05-21T06:38:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EasyVFX decouples VFX generation via frequency-aware Mixture-of-Experts and test-time training to achieve realistic effects with limited resources.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22015","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ORBIS: Output-Guided Token Reduction with Distribution-Aware Matching for Video Diffusion Acceleration","primary_cat":"cs.CV","submitted_at":"2026-05-21T05:23:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ORBIS uses output-guided token reduction and DATM to achieve 2x higher token reduction than AsymRnR, with up to 4.5x speedup and 79.3% energy savings versus A100 GPU for video DiT models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21072","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Q-ARVD: Quantizing Autoregressive Video Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T11:58:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Q-ARVD introduces final-quality-aware frame weighting and outlier-aware adaptive dual-scale quantization to enable accurate low-bit inference for autoregressive video diffusion models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21042","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Video Generation: Shaping Video Generation Across Time and Space","primary_cat":"cs.CV","submitted_at":"2026-05-20T11:24:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DVG dynamically selects content-aware spatio-temporal acceleration strategies for diffusion-based video generation, delivering up to 7x speedup with near-lossless quality on models like HunyuanVideo.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20910","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FlowLong: Inference-time Long Video Generation via Manifold-constrained Tweedie Matching","primary_cat":"cs.CV","submitted_at":"2026-05-20T08:55:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlowLong generates videos several times longer than native model windows by blending adjacent predictions with Tweedie matching to enforce manifold and temporal consistency while using stochastic noise injection early and deterministic sampling later.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20795","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Semantics Survive the Connector? Diagnosing VLM-to-DiT Alignment in Video Editing","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:42:15+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20708","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Cross-Layer Information Routing in Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-20T05:07:15+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20624","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Accelerating Video Inverse Problem Solvers with Autoregressive Diffusion Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T02:16:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AVIS applies autoregressive diffusion models to video inverse problems by streaming restoration with measurement-consistent initialization, reducing latency from 114s to 4s and raising throughput to 1.18 FPS (or 5.91 FPS in the Flash variant).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20183","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MSAVBench: Towards Comprehensive and Reliable Evaluation of Multi-Shot Audio-Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-19T17:59:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19957","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"World-Ego Modeling for Long-Horizon Evolution in Hybrid Embodied Tasks","primary_cat":"cs.CV","submitted_at":"2026-05-19T15:10:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes World-Ego Modeling with WEM using CP-MoE diffusion and a new HTEWorld benchmark, claiming SOTA on hybrid navigation-manipulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19728","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Aero-World: Action-Conditioned Aerial Video Generation from Inertial Controls","primary_cat":"cs.CV","submitted_at":"2026-05-19T12:02:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Aero-World adapts a pretrained latent diffusion transformer for action-conditioned aerial video generation by injecting inertial action tokens and using a frozen latent-space Physics Probe for inertial consistency supervision during LoRA finetuning, with a new AeroBench benchmark showing improved AA","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19382","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PRISM: A Benchmark for Programmatic Spatial-Temporal Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-19T05:28:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PRISM benchmark of over 10k pairs shows LLMs have a 41% average drop from code execution success to spatial correctness in programmatic video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19320","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TextAlign: Preference Alignment for Text Rendering with Hierarchical Rewards","primary_cat":"cs.CV","submitted_at":"2026-05-19T03:55:59+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18467","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"InstructAV2AV: Instruction-Guided Audio-Video Joint Editing","primary_cat":"cs.CV","submitted_at":"2026-05-18T14:27:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"InstructAV2AV is an end-to-end instruction-guided audio-video joint editing model that adapts a pre-trained backbone with gated attention and two-stage training, outperforming prior methods on 11 metrics after building the InsAVE-80K dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18396","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NEWTON: Agentic Planning for Physically Grounded Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T13:42:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NEWTON improves physical accuracy in video generation by deploying a trainable planner that coordinates physics-aware tools and a verifier, raising joint accuracy on VideoPhy-2 without altering the base generators.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18365","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoFlow: Enforcing Implicit Geometric Consistency in Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T13:17:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GeoFlow adds a geometry-consistency reward based on rigid camera flow and object appearance preservation, integrated via reinforcement fine-tuning to improve geometric coherence in video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18233","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Train-Free Infinite-Frame Generation for Consistent Long Videos","primary_cat":"cs.CV","submitted_at":"2026-05-18T11:28:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIGA introduces two-stage alignment to close train-inference gaps and dual consistency enhancement via self-reflection and long-range guidance to achieve SOTA temporal consistency in infinite-frame video generation on VBench and NarrLV.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17837","ref_index":149,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Temporal Aware Pruning for Efficient Diffusion-based Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-18T04:18:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TAPE applies temporal-aware token pruning with smoothing, reselection, and timestep scheduling to speed up video diffusion models while preserving visual fidelity and coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17488","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Omni-Customizer: End-to-End MultiModal Customization for Joint Audio-Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-17T14:56:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Omni-Customizer proposes an end-to-end framework using Omni-Context Fusion, Masked TTS Cross-Attention, Semantic-Anchored Multimodal RoPE, and specialized training curricula to achieve precise multimodal identity binding in joint audio-video generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17248","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Image-to-Video Diffusion: From Foundations to Open Frontiers","primary_cat":"cs.CV","submitted_at":"2026-05-17T04:10:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey that organizes diffusion image-to-video methods into a taxonomy, distills core designs in condition encoding, temporal modeling, noise prior, and upsampling, and discusses applications plus challenges.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"VideoCrafter2 [63] provide toolkits supporting both text and image conditions. For large models, Open-Sora [64] and Open-Sora Plan [65] demonstrate that DiT can in- tegrate text conditioning seamlessly across billions of parameters. Industrial-scale implementations further ver- ify this route, with CogVideoX [66] employing expert adaptive layer normalization for text-video fusion, Hun- yuanVideo [67] reaching 13 billion parameters through dual-stream processing, and Wan [54] and Seedance [40] pushing model scale even further. ◆Image+Motion:Motion provides an explicit motion specification beyond appearance (e.g., motion fields, op- 2 Reference image Image only Image + Text Image + Motion Image + Audio Image + Camera … … … … … Multi-condition …"},{"citing_arxiv_id":"2605.17019","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StreamingEffect: Real-Time Human-Centric Video Effect Generation","primary_cat":"cs.CV","submitted_at":"2026-05-16T14:45:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StreamingEffect enables real-time 720p human-centric video effect generation on one GPU via teacher-student distillation, keyframe control, and a new 130K video dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16789","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Accelerating Rectified Flow Models via Trajectory-Aware Caching","primary_cat":"cs.CV","submitted_at":"2026-05-16T03:44:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TACache accelerates rectified flow sampling up to 4.14x for text-to-image and 2.11x for text-to-video via offline skip scheduling from cumulative variation thresholds and online velocity reconstruction using historical orthogonal directions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16736","ref_index":42,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CAB: Accelerating Flow and Diffusion Sampling via Rectification and Corrected Adams-Bashforth","primary_cat":"cs.CV","submitted_at":"2026-05-16T01:16:18+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAB accelerates flow and diffusion sampling via rectification to a common coordinate system followed by a corrected Adams-Bashforth multistep method that achieves third-order local truncation error while improving quality at low NFEs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16713","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GeoWorld-VLM: Geometry from World Models for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-15T23:52:11+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}