{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:7ZPEYLCNXN25DBLLRY7K3WOPVX","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"42d3b0a85d1c104df17025883ce4255539ab4b297994ec1e168e725c9d51b1b8","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:59:53Z","title_canon_sha256":"b9c8d759d1cd8ab10e1d038f59278a2d6078dc29101a1a8ed9c5069dd200ab32"},"schema_version":"1.0","source":{"id":"2412.14169","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.14169","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2412.14169v2","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.14169","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"7ZPEYLCNXN25","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"7ZPEYLCNXN25DBLL","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"7ZPEYLCN","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:91a1f75a6a577fb478e6f63ecca50eb3572fb5610b8705b502c15ea31e60afef","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"NOVA surpasses prior autoregressive video models in data efficiency, inference speed, visual fidelity, and video fluency, even with a much smaller model capacity, i.e., 0.6B parameters. NOVA also outperforms state-of-the-art image diffusion models in text-to-image generation tasks, with a significantly lower training cost."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That non-quantized autoregressive modeling via temporal frame-by-frame prediction and spatial set-by-set prediction can preserve sufficient visual information and coherence without the discretization step of vector quantization."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"NOVA reformulates video generation as non-quantized autoregressive frame-by-frame temporal prediction combined with set-by-set spatial prediction, outperforming prior AR video models and some diffusion models in efficiency and quality."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Video generation can be done autoregressively without vector quantization by predicting frames sequentially in time and sets spatially within each frame."}],"snapshot_sha256":"6a28aa8ea4d1ca335decc9c745608a854e465cdbcbdd327cb0a7ee77b3ee2a9e"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"This paper presents a novel approach that enables autoregressive video generation with high efficiency. We propose to reformulate the video generation problem as a non-quantized autoregressive modeling of temporal frame-by-frame prediction and spatial set-by-set prediction. Unlike raster-scan prediction in prior autoregressive models or joint distribution modeling of fixed-length tokens in diffusion models, our approach maintains the causal property of GPT-style models for flexible in-context capabilities, while leveraging bidirectional modeling within individual frames for efficiency. With th","authors_text":"Haiwen Diao, Haoge Deng, Huchuan Lu, Shiguang Shan, Ting Pan, Xinlong Wang, Yonggang Qi, Yufeng Cui, Zhengxiong Luo","cross_cats":[],"headline":"Video generation can be done autoregressively without vector quantization by predicting frames sequentially in time and sets spatially within each frame.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:59:53Z","title":"Autoregressive Video Generation without Vector Quantization"},"references":{"count":36,"internal_anchors":21,"resolved_work":36,"sample":[{"cited_arxiv_id":"2305.10403","doi":"","is_internal_anchor":true,"ref_index":1,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Imagen 3.arXiv preprint arXiv:2408.07009, 2024","work_id":"a1dd317f-8300-4a79-a1d0-92ddd93fa983","year":null},{"cited_arxiv_id":"2311.15127","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets","work_id":"4f68eada-27e3-437a-a2fe-6e4ca524d0d3","year":null},{"cited_arxiv_id":"2405.09818","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Chameleon: Mixed-Modal Early-Fusion Foundation Models","work_id":"2661b9a6-25cc-41a1-8100-612d2b801289","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Muse: Text-to-image generation via masked generative transformers","work_id":"ad8925f8-72d8-4ac4-88b8-027e08b46103","year":null}],"snapshot_sha256":"af959daab140b38a113ec4657b3b0246e069b569d8217348bd3e76a38c0b96ee"},"source":{"id":"2412.14169","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T15:02:54.272971Z","id":"786057d5-cf12-45f8-86eb-b80190d51198","model_set":{"reader":"grok-4.3"},"one_line_summary":"NOVA reformulates video generation as non-quantized autoregressive frame-by-frame temporal prediction combined with set-by-set spatial prediction, outperforming prior AR video models and some diffusion models in efficiency and quality.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Video generation can be done autoregressively without vector quantization by predicting frames sequentially in time and sets spatially within each frame.","strongest_claim":"NOVA surpasses prior autoregressive video models in data efficiency, inference speed, visual fidelity, and video fluency, even with a much smaller model capacity, i.e., 0.6B parameters. NOVA also outperforms state-of-the-art image diffusion models in text-to-image generation tasks, with a significantly lower training cost.","weakest_assumption":"That non-quantized autoregressive modeling via temporal frame-by-frame prediction and spatial set-by-set prediction can preserve sufficient visual information and coherence without the discretization step of vector quantization."}},"verdict_id":"786057d5-cf12-45f8-86eb-b80190d51198"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3d9fa5eac12165b85f5feb4427a0c117595f40276715c3d450e2ddfa403b3f69","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"42d3b0a85d1c104df17025883ce4255539ab4b297994ec1e168e725c9d51b1b8","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-12-18T18:59:53Z","title_canon_sha256":"b9c8d759d1cd8ab10e1d038f59278a2d6078dc29101a1a8ed9c5069dd200ab32"},"schema_version":"1.0","source":{"id":"2412.14169","kind":"arxiv","version":2}},"canonical_sha256":"fe5e4c2c4dbb75d1856b8e3eadd9cfadea2a131a47c7adf1e52372c690a70f27","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"fe5e4c2c4dbb75d1856b8e3eadd9cfadea2a131a47c7adf1e52372c690a70f27","first_computed_at":"2026-05-17T23:38:13.741021Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.741021Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"OYayKDz17FFpzA9Irge8OCss3OpXlhAAUljGSxg6pEZAzklPyYrRfppDty2Sz3BJcOwYI2zgWO28Cv8YBZ4MDg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.741722Z","signed_message":"canonical_sha256_bytes"},"source_id":"2412.14169","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3d9fa5eac12165b85f5feb4427a0c117595f40276715c3d450e2ddfa403b3f69","sha256:91a1f75a6a577fb478e6f63ecca50eb3572fb5610b8705b502c15ea31e60afef"],"state_sha256":"87575ec6988a17d54c612ca4d62e875f8c358cbe9c7de232b88ff806d44e57ab"}