{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:7ZPEYLCNXN25DBLLRY7K3WOPVX","short_pith_number":"pith:7ZPEYLCN","schema_version":"1.0","canonical_sha256":"fe5e4c2c4dbb75d1856b8e3eadd9cfadea2a131a47c7adf1e52372c690a70f27","source":{"kind":"arxiv","id":"2412.14169","version":2},"attestation_state":"computed","paper":{"title":"Autoregressive Video Generation without Vector Quantization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Video generation can be done autoregressively without vector quantization by predicting frames sequentially in time and sets spatially within each frame.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Haiwen Diao, Haoge Deng, Huchuan Lu, Shiguang Shan, Ting Pan, Xinlong Wang, Yonggang Qi, Yufeng Cui, Zhengxiong Luo","submitted_at":"2024-12-18T18:59:53Z","abstract_excerpt":"This paper presents a novel approach that enables autoregressive video generation with high efficiency. We propose to reformulate the video generation problem as a non-quantized autoregressive modeling of temporal frame-by-frame prediction and spatial set-by-set prediction. Unlike raster-scan prediction in prior autoregressive models or joint distribution modeling of fixed-length tokens in diffusion models, our approach maintains the causal property of GPT-style models for flexible in-context capabilities, while leveraging bidirectional modeling within individual frames for efficiency. With th"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2412.14169","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:59:53Z","cross_cats_sorted":[],"title_canon_sha256":"b9c8d759d1cd8ab10e1d038f59278a2d6078dc29101a1a8ed9c5069dd200ab32","abstract_canon_sha256":"42d3b0a85d1c104df17025883ce4255539ab4b297994ec1e168e725c9d51b1b8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.741722Z","signature_b64":"OYayKDz17FFpzA9Irge8OCss3OpXlhAAUljGSxg6pEZAzklPyYrRfppDty2Sz3BJcOwYI2zgWO28Cv8YBZ4MDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fe5e4c2c4dbb75d1856b8e3eadd9cfadea2a131a47c7adf1e52372c690a70f27","last_reissued_at":"2026-05-17T23:38:13.741021Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.741021Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Autoregressive Video Generation without Vector Quantization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Video generation can be done autoregressively without vector quantization by predicting frames sequentially in time and sets spatially within each frame.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Haiwen Diao, Haoge Deng, Huchuan Lu, Shiguang Shan, Ting Pan, Xinlong Wang, Yonggang Qi, Yufeng Cui, Zhengxiong Luo","submitted_at":"2024-12-18T18:59:53Z","abstract_excerpt":"This paper presents a novel approach that enables autoregressive video generation with high efficiency. We propose to reformulate the video generation problem as a non-quantized autoregressive modeling of temporal frame-by-frame prediction and spatial set-by-set prediction. Unlike raster-scan prediction in prior autoregressive models or joint distribution modeling of fixed-length tokens in diffusion models, our approach maintains the causal property of GPT-style models for flexible in-context capabilities, while leveraging bidirectional modeling within individual frames for efficiency. With th"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"NOVA surpasses prior autoregressive video models in data efficiency, inference speed, visual fidelity, and video fluency, even with a much smaller model capacity, i.e., 0.6B parameters. NOVA also outperforms state-of-the-art image diffusion models in text-to-image generation tasks, with a significantly lower training cost.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That non-quantized autoregressive modeling via temporal frame-by-frame prediction and spatial set-by-set prediction can preserve sufficient visual information and coherence without the discretization step of vector quantization.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"NOVA reformulates video generation as non-quantized autoregressive frame-by-frame temporal prediction combined with set-by-set spatial prediction, outperforming prior AR video models and some diffusion models in efficiency and quality.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Video generation can be done autoregressively without vector quantization by predicting frames sequentially in time and sets spatially within each frame.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"6a28aa8ea4d1ca335decc9c745608a854e465cdbcbdd327cb0a7ee77b3ee2a9e"},"source":{"id":"2412.14169","kind":"arxiv","version":2},"verdict":{"id":"786057d5-cf12-45f8-86eb-b80190d51198","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T15:02:54.272971Z","strongest_claim":"NOVA surpasses prior autoregressive video models in data efficiency, inference speed, visual fidelity, and video fluency, even with a much smaller model capacity, i.e., 0.6B parameters. NOVA also outperforms state-of-the-art image diffusion models in text-to-image generation tasks, with a significantly lower training cost.","one_line_summary":"NOVA reformulates video generation as non-quantized autoregressive frame-by-frame temporal prediction combined with set-by-set spatial prediction, outperforming prior AR video models and some diffusion models in efficiency and quality.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That non-quantized autoregressive modeling via temporal frame-by-frame prediction and spatial set-by-set prediction can preserve sufficient visual information and coherence without the discretization step of vector quantization.","pith_extraction_headline":"Video generation can be done autoregressively without vector quantization by predicting frames sequentially in time and sets spatially within each frame."},"references":{"count":36,"sample":[{"doi":"","year":null,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","ref_index":1,"cited_arxiv_id":"2305.10403","is_internal_anchor":true},{"doi":"","year":null,"title":"Imagen 3.arXiv preprint arXiv:2408.07009, 2024","work_id":"a1dd317f-8300-4a79-a1d0-92ddd93fa983","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","ref_index":3,"cited_arxiv_id":"2311.15127","is_internal_anchor":true},{"doi":"","year":null,"title":"Chameleon: Mixed-Modal Early-Fusion Foundation Models","work_id":"2661b9a6-25cc-41a1-8100-612d2b801289","ref_index":4,"cited_arxiv_id":"2405.09818","is_internal_anchor":true},{"doi":"","year":null,"title":"Muse: Text-to-image generation via masked generative transformers","work_id":"ad8925f8-72d8-4ac4-88b8-027e08b46103","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":36,"snapshot_sha256":"af959daab140b38a113ec4657b3b0246e069b569d8217348bd3e76a38c0b96ee","internal_anchors":21},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2412.14169","created_at":"2026-05-17T23:38:13.741142+00:00"},{"alias_kind":"arxiv_version","alias_value":"2412.14169v2","created_at":"2026-05-17T23:38:13.741142+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.14169","created_at":"2026-05-17T23:38:13.741142+00:00"},{"alias_kind":"pith_short_12","alias_value":"7ZPEYLCNXN25","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"7ZPEYLCNXN25DBLL","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"7ZPEYLCN","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":17,"sample":[{"citing_arxiv_id":"2505.05472","citing_title":"Mogao: An Omni Foundation Model for Interleaved Multi-Modal Generation","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2512.04678","citing_title":"Reward Forcing: Efficient Streaming Video Generation with Rewarded Distribution Matching Distillation","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2602.07775","citing_title":"Rolling Sink: Bridging Limited-Horizon Training and Open-Ended Testing in Autoregressive Video Diffusion","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2510.02283","citing_title":"Self-Forcing++: Towards Minute-Scale High-Quality Video Generation","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2602.13669","citing_title":"EchoTorrent: Towards Swift, Sustained, and Streaming Multi-Modal Video Generation","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2603.00110","citing_title":"Learning Physics from Pretrained Video Models: A Multimodal Continuous and Sequential World Interaction Models for Robotic Manipulation","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2503.00200","citing_title":"Unified Video Action Model","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06356","citing_title":"SwiftI2V: Efficient High-Resolution Image-to-Video Generation via Conditional Segment-wise Generation","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03849","citing_title":"Stream-R1: Reliability-Perplexity Aware Reward Distillation for Streaming Video Generation","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06356","citing_title":"SwiftI2V: Efficient High-Resolution Image-to-Video Generation via Conditional Segment-wise Generation","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13030","citing_title":"Generative Refinement Networks for Visual Synthesis","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10103","citing_title":"Long-Horizon Streaming Video Generation via Hybrid Attention with Decoupled Distillation","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06966","citing_title":"MAR-GRPO: Stabilized GRPO for AR-diffusion Hybrid Image Generation","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2501.03575","citing_title":"Cosmos World Foundation Model Platform for Physical AI","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15911","citing_title":"Efficient Video Diffusion Models: Advancements and Challenges","ref_index":252,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18518","citing_title":"UDM-GRPO: Stable and Efficient Group Relative Policy Optimization for Uniform Discrete Diffusion Models","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04461","citing_title":"Stream-T1: Test-Time Scaling for Streaming Video Generation","ref_index":4,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX","json":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX.json","graph_json":"https://pith.science/api/pith-number/7ZPEYLCNXN25DBLLRY7K3WOPVX/graph.json","events_json":"https://pith.science/api/pith-number/7ZPEYLCNXN25DBLLRY7K3WOPVX/events.json","paper":"https://pith.science/paper/7ZPEYLCN"},"agent_actions":{"view_html":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX","download_json":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX.json","view_paper":"https://pith.science/paper/7ZPEYLCN","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2412.14169&json=true","fetch_graph":"https://pith.science/api/pith-number/7ZPEYLCNXN25DBLLRY7K3WOPVX/graph.json","fetch_events":"https://pith.science/api/pith-number/7ZPEYLCNXN25DBLLRY7K3WOPVX/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX/action/storage_attestation","attest_author":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX/action/author_attestation","sign_citation":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX/action/citation_signature","submit_replication":"https://pith.science/pith/7ZPEYLCNXN25DBLLRY7K3WOPVX/action/replication_record"}},"created_at":"2026-05-17T23:38:13.741142+00:00","updated_at":"2026-05-17T23:38:13.741142+00:00"}