{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:ADJSTXNEHHQEQFZIW2RD2VL445","short_pith_number":"pith:ADJSTXNE","schema_version":"1.0","canonical_sha256":"00d329dda439e0481728b6a23d557ce7605d4efdcd66dadac49cb2680477fd3f","source":{"kind":"arxiv","id":"2509.22622","version":2},"attestation_state":"computed","paper":{"title":"LongLive: Real-time Interactive Long Video Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"LongLive turns a short-clip autoregressive model into a real-time system that generates up to 240-second videos at 20.7 FPS while accepting streaming prompt changes.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Enze Xie, Muyang Li, Ruihang Chu, Shuai Yang, Song Han, Wei Huang, Xianbang Wang, Yao Lu, Yicheng Xiao, Yingcong Chen, Yukang Chen, Yuyang Zhao","submitted_at":"2025-09-26T17:48:24Z","abstract_excerpt":"We present LongLive, a frame-level autoregressive (AR) framework for real-time and interactive long video generation. Long video generation presents challenges in both efficiency and quality. Diffusion and Diffusion-Forcing models can produce high-quality videos but suffer from low efficiency due to bidirectional attention. Causal attention AR models support KV caching for faster inference, but often degrade in quality on long videos due to memory challenges during long-video training. In addition, beyond static prompt-based generation, interactive capabilities, such as streaming prompt inputs"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2509.22622","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-09-26T17:48:24Z","cross_cats_sorted":[],"title_canon_sha256":"638303bcc893c10be0132226122bda9d4b5bc7db58ce39c77753e57801d03740","abstract_canon_sha256":"c515fbbf9e9c41cccec4793b0ed0a083e20133de3c46427cba0e8bf131f96a66"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.619882Z","signature_b64":"AOOi4ktb1AHP3M9sUDCfsHbyUG2FdU+qiCBOlyYovZiO+P4RM2MaM9IFRiXzyCdkAMDgJEDBO1nDe7KvCsojAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"00d329dda439e0481728b6a23d557ce7605d4efdcd66dadac49cb2680477fd3f","last_reissued_at":"2026-05-17T23:38:53.619263Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.619263Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LongLive: Real-time Interactive Long Video Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"LongLive turns a short-clip autoregressive model into a real-time system that generates up to 240-second videos at 20.7 FPS while accepting streaming prompt changes.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Enze Xie, Muyang Li, Ruihang Chu, Shuai Yang, Song Han, Wei Huang, Xianbang Wang, Yao Lu, Yicheng Xiao, Yingcong Chen, Yukang Chen, Yuyang Zhao","submitted_at":"2025-09-26T17:48:24Z","abstract_excerpt":"We present LongLive, a frame-level autoregressive (AR) framework for real-time and interactive long video generation. Long video generation presents challenges in both efficiency and quality. Diffusion and Diffusion-Forcing models can produce high-quality videos but suffer from low efficiency due to bidirectional attention. Causal attention AR models support KV caching for faster inference, but often degrade in quality on long videos due to memory challenges during long-video training. In addition, beyond static prompt-based generation, interactive capabilities, such as streaming prompt inputs"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"With these key designs, LongLive fine-tunes a 1.3B-parameter short-clip model to minute-long generation in just 32 GPU-days. At inference, LongLive sustains 20.7 FPS on a single NVIDIA H100, achieves strong performance on VBench in both short and long videos. LongLive supports up to 240-second videos on a single H100 GPU.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that KV-recache combined with short-window attention and frame sink maintains visual consistency and semantic adherence across prompt transitions and long sequences without introducing cumulative artifacts or drift, as this is presented as sufficient based on the described training alignment.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LongLive is a causal autoregressive video generator that produces up to 240-second interactive videos at 20.7 FPS on one H100 GPU after 32 GPU-days of fine-tuning from a 1.3B short-clip model.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"LongLive turns a short-clip autoregressive model into a real-time system that generates up to 240-second videos at 20.7 FPS while accepting streaming prompt changes.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"6da6b3874afa7c3e10e39fc97bb34afa6d505f8e14dc39375d2caecbe5845c07"},"source":{"id":"2509.22622","kind":"arxiv","version":2},"verdict":{"id":"dac721f2-e321-40a9-90f5-04b94f365d3a","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T03:48:41.959401Z","strongest_claim":"With these key designs, LongLive fine-tunes a 1.3B-parameter short-clip model to minute-long generation in just 32 GPU-days. At inference, LongLive sustains 20.7 FPS on a single NVIDIA H100, achieves strong performance on VBench in both short and long videos. LongLive supports up to 240-second videos on a single H100 GPU.","one_line_summary":"LongLive is a causal autoregressive video generator that produces up to 240-second interactive videos at 20.7 FPS on one H100 GPU after 32 GPU-days of fine-tuning from a 1.3B short-clip model.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that KV-recache combined with short-window attention and frame sink maintains visual consistency and semantic adherence across prompt transitions and long sequences without introducing cumulative artifacts or drift, as this is presented as sufficient based on the described training alignment.","pith_extraction_headline":"LongLive turns a short-clip autoregressive model into a real-time system that generates up to 240-second videos at 20.7 FPS while accepting streaming prompt changes."},"references":{"count":108,"sample":[{"doi":"","year":2024,"title":"Diffusion forcing: Next-token prediction meets full-sequence diffusion","work_id":"1fb5fc55-61a5-425d-b272-e8246265015a","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"SkyReels-V2: Infinite-length Film Generative Model","work_id":"2ce11350-273e-4f0d-ae78-292aa3151060","ref_index":2,"cited_arxiv_id":"2504.13074","is_internal_anchor":true},{"doi":"","year":2025,"title":"Sana-video: Efficient video generation with block linear diffusion transformer","work_id":"37d92c42-ae43-4ef7-adfa-5faf1c48d1a0","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"SEINE: short-to-long video diffusion model for generative transition and prediction","work_id":"fc5b4ca4-5045-4f2e-b2c8-261b55f226f7","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Longlora: Efficient fine-tuning of long-context large language models","work_id":"7f84c8d7-2e59-4005-8191-68c6a956a564","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":108,"snapshot_sha256":"939a7220e119ca88778637881b74486a777bfbd7ac7e91910b324b95787baf54","internal_anchors":9},"formal_canon":{"evidence_count":2,"snapshot_sha256":"bf6fc593f4075a74190b85611e58d722feecf5b8e6f6537fe49f9a0634ba4ae8"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.22622","created_at":"2026-05-17T23:38:53.619383+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.22622v2","created_at":"2026-05-17T23:38:53.619383+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.22622","created_at":"2026-05-17T23:38:53.619383+00:00"},{"alias_kind":"pith_short_12","alias_value":"ADJSTXNEHHQE","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"ADJSTXNEHHQEQFZI","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"ADJSTXNE","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":45,"internal_anchor_count":45,"sample":[{"citing_arxiv_id":"2605.11596","citing_title":"HorizonDrive: Self-Corrective Autoregressive World Model for Long-horizon Driving Simulation","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2602.02214","citing_title":"Causal Forcing: Autoregressive Diffusion Distillation Done Right for High-Quality Real-Time Interactive Video Generation","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2603.08403","citing_title":"SPIRAL: Self-Evolving Action-Conditioned Video Generation via Reflective Planning Agents","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22144","citing_title":"One Sentence, One Drama: Personalized Short-Form Drama Generation via Multi-Agent Systems","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22718","citing_title":"WorldKV: Efficient World Memory with World Retrieval and Compression","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2602.02214","citing_title":"Causal Forcing: Autoregressive Diffusion Distillation Done Right for High-Quality Real-Time Interactive Video Generation","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2602.23058","citing_title":"GeoWorld: Geometric World Models","ref_index":82,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20476","citing_title":"Goodbye Drift: Anchored Tree Sampling for Long-Horizon Video-to-Video Generation","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21028","citing_title":"DySink: Dynamic Frame Sinks for Autoregressive Long Video Generation","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20910","citing_title":"FlowLong: Inference-time Long Video Generation via Manifold-constrained Tweedie Matching","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21072","citing_title":"Q-ARVD: Quantizing Autoregressive Video Diffusion Models","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21466","citing_title":"StreamGVE: Training-Free Video Editing via Few-Step Streaming Video Generation","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15824","citing_title":"FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16003","citing_title":"Echo-Forcing: A Scene Memory Framework for Interactive Long Video Generation","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18733","citing_title":"Advancing Narrative Long Video Generation via Training-Free Identity-Aware Memory","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18601","citing_title":"Incantation: Natural Language as the Action Interface for Multi-Entity Video World Models","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19242","citing_title":"PhyWorld: Physics-Faithful World Model for Video Generation","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2512.04677","citing_title":"Live Avatar: Streaming Real-time Audio-Driven Avatar Generation with Infinite Length","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2512.04678","citing_title":"Reward Forcing: Efficient Streaming Video Generation with Rewarded Distribution Matching Distillation","ref_index":82,"is_internal_anchor":true},{"citing_arxiv_id":"2601.20540","citing_title":"Advancing Open-source World Models","ref_index":85,"is_internal_anchor":true},{"citing_arxiv_id":"2602.07775","citing_title":"Rolling Sink: Bridging Limited-Horizon Training and Open-Ended Testing in Autoregressive Video Diffusion","ref_index":99,"is_internal_anchor":true},{"citing_arxiv_id":"2602.13669","citing_title":"EchoTorrent: Towards Swift, Sustained, and Streaming Multi-Modal Video Generation","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2512.14614","citing_title":"WorldPlay: Towards Long-Term Geometric Consistency for Real-Time Interactive World Modeling","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15178","citing_title":"SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01725","citing_title":"Motion-Aware Caching for Efficient Autoregressive Video Generation","ref_index":71,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445","json":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445.json","graph_json":"https://pith.science/api/pith-number/ADJSTXNEHHQEQFZIW2RD2VL445/graph.json","events_json":"https://pith.science/api/pith-number/ADJSTXNEHHQEQFZIW2RD2VL445/events.json","paper":"https://pith.science/paper/ADJSTXNE"},"agent_actions":{"view_html":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445","download_json":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445.json","view_paper":"https://pith.science/paper/ADJSTXNE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.22622&json=true","fetch_graph":"https://pith.science/api/pith-number/ADJSTXNEHHQEQFZIW2RD2VL445/graph.json","fetch_events":"https://pith.science/api/pith-number/ADJSTXNEHHQEQFZIW2RD2VL445/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445/action/storage_attestation","attest_author":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445/action/author_attestation","sign_citation":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445/action/citation_signature","submit_replication":"https://pith.science/pith/ADJSTXNEHHQEQFZIW2RD2VL445/action/replication_record"}},"created_at":"2026-05-17T23:38:53.619383+00:00","updated_at":"2026-05-17T23:38:53.619383+00:00"}