{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:E7QWGGV6CXMALJDIMCEO2VEJ4Y","short_pith_number":"pith:E7QWGGV6","schema_version":"1.0","canonical_sha256":"27e1631abe15d805a4686088ed5489e62adfb3e99fe8837159d27d3fe5014963","source":{"kind":"arxiv","id":"2503.19325","version":3},"attestation_state":"computed","paper":{"title":"Long-Context Autoregressive Video Modeling with Next-Frame Prediction","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"Asymmetric patchify kernels enable efficient long-context autoregressive video modeling by exploiting context redundancy.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Mike Zheng Shou, Weijia Mao, Yuchao Gu","submitted_at":"2025-03-25T03:38:06Z","abstract_excerpt":"Long-context video modeling is essential for enabling generative models to function as world simulators, as they must maintain temporal coherence over extended time spans. However, most existing models are trained on short clips, limiting their ability to capture long-range dependencies, even with test-time extrapolation. While training directly on long videos is a natural solution, the rapid growth of vision tokens makes it computationally prohibitive. To support exploring efficient long-context video modeling, we first establish a strong autoregressive baseline called Frame AutoRegressive (F"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2503.19325","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2025-03-25T03:38:06Z","cross_cats_sorted":[],"title_canon_sha256":"811f298b4d32c4e37375c796400b4703c34b61cfe15b69fa4277c08bd042ac0a","abstract_canon_sha256":"3d120646992c01d05c98f54ff776b534a9c790a9cda95b2d1c11ee23ae4f44ec"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:46.305577Z","signature_b64":"y0Gzs2eVjgyfcUOTf134WYv/hmk3H1K+8MFthTTkF4mxSqd6Dv+eIY23eR+D5g8FkJ9t6/xoG5PFJ70mhgR8Bg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"27e1631abe15d805a4686088ed5489e62adfb3e99fe8837159d27d3fe5014963","last_reissued_at":"2026-05-17T23:38:46.305077Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:46.305077Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Long-Context Autoregressive Video Modeling with Next-Frame Prediction","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"Asymmetric patchify kernels enable efficient long-context autoregressive video modeling by exploiting context redundancy.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Mike Zheng Shou, Weijia Mao, Yuchao Gu","submitted_at":"2025-03-25T03:38:06Z","abstract_excerpt":"Long-context video modeling is essential for enabling generative models to function as world simulators, as they must maintain temporal coherence over extended time spans. However, most existing models are trained on short clips, limiting their ability to capture long-range dependencies, even with test-time extrapolation. While training directly on long videos is a natural solution, the rapid growth of vision tokens makes it computationally prohibitive. To support exploring efficient long-context video modeling, we first establish a strong autoregressive baseline called Frame AutoRegressive (F"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our method achieves state-of-the-art results on both short and long video generation, providing an effective baseline for long-context autoregressive video modeling.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that video autoregression exhibits exploitable context redundancy where distant frames can safely use large asymmetric patchify kernels without losing critical temporal information needed for coherence.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"FAR baseline plus asymmetric kernels for long short-term context modeling achieves SOTA short and long video generation in autoregressive setups.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Asymmetric patchify kernels enable efficient long-context autoregressive video modeling by exploiting context redundancy.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"65a74ae5bf4ee906459b1583a6ecc373b11f2aa0103b2aa0f9a3707363d1e5c6"},"source":{"id":"2503.19325","kind":"arxiv","version":3},"verdict":{"id":"fdbbb3af-5f6b-4b67-9121-70a825cda7fb","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T23:01:10.067285Z","strongest_claim":"Our method achieves state-of-the-art results on both short and long video generation, providing an effective baseline for long-context autoregressive video modeling.","one_line_summary":"FAR baseline plus asymmetric kernels for long short-term context modeling achieves SOTA short and long video generation in autoregressive setups.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that video autoregression exhibits exploitable context redundancy where distant frames can safely use large asymmetric patchify kernels without losing critical temporal information needed for coherence.","pith_extraction_headline":"Asymmetric patchify kernels enable efficient long-context autoregressive video modeling by exploiting context redundancy."},"references":{"count":60,"sample":[{"doi":"","year":2024,"title":"Video generation models as world simulators,","work_id":"36411502-be32-4aca-bb2e-6e69ad8e9542","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","ref_index":2,"cited_arxiv_id":"2503.20314","is_internal_anchor":true},{"doi":"","year":2025,"title":"Cosmos World Foundation Model Platform for Physical AI","work_id":"a2dba24c-318d-476a-8b21-4289c265810c","ref_index":3,"cited_arxiv_id":"2501.03575","is_internal_anchor":true},{"doi":"","year":2024,"title":"Freelong: Training-free long video generation with spectralblend temporal attention,","work_id":"2fe5a21b-1a98-4980-90ac-dcfa54e0e135","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Riflex: A free lunch for length extrapolation in video diffusion transformers","work_id":"027bc19e-1d61-407f-ae74-3f5cd543fa53","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":60,"snapshot_sha256":"d75ac10567aebd9e8e9460fe879d34a28cc0faca46b528215f2667e2fceaa7c1","internal_anchors":20},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b53828b0889de6d19908a9a70984139ee92bb15b8bcfaf408317a489bebf1e61"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2503.19325","created_at":"2026-05-17T23:38:46.305161+00:00"},{"alias_kind":"arxiv_version","alias_value":"2503.19325v3","created_at":"2026-05-17T23:38:46.305161+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2503.19325","created_at":"2026-05-17T23:38:46.305161+00:00"},{"alias_kind":"pith_short_12","alias_value":"E7QWGGV6CXMA","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"E7QWGGV6CXMALJDI","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"E7QWGGV6","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":22,"internal_anchor_count":22,"sample":[{"citing_arxiv_id":"2507.01099","citing_title":"Geometry-aware 4D Video Generation for Robot Manipulation","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20476","citing_title":"Goodbye Drift: Anchored Tree Sampling for Long-Horizon Video-to-Video Generation","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18733","citing_title":"Advancing Narrative Long Video Generation via Training-Free Identity-Aware Memory","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2510.26782","citing_title":"Cloning Deterministic Worlds: The Critical Role of Latent Geometry in Long-Horizon World Models","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2512.11321","citing_title":"KeyframeFace: Language-Driven Facial Animation via Semantic Keyframes","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2512.04678","citing_title":"Reward Forcing: Efficient Streaming Video Generation with Rewarded Distribution Matching Distillation","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2509.25161","citing_title":"Rolling Forcing: Autoregressive Long Video Diffusion in Real Time","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"2602.07775","citing_title":"Rolling Sink: Bridging Limited-Horizon Training and Open-Ended Testing in Autoregressive Video Diffusion","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2509.22622","citing_title":"LongLive: Real-time Interactive Long Video Generation","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14487","citing_title":"Head Forcing: Long Autoregressive Video Generation via Head Heterogeneity","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13724","citing_title":"AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12496","citing_title":"CausalCine: Real-Time Autoregressive Generation for Multi-Shot Video Narratives","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09442","citing_title":"SWIFT: Prompt-Adaptive Memory for Efficient Interactive Long Video Generation","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03849","citing_title":"Stream-R1: Reliability-Perplexity Aware Reward Distillation for Streaming Video Generation","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01929","citing_title":"Exploring Data-Free LoRA Transferability for Video Diffusion Models","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13036","citing_title":"Lyra 2.0: Explorable Generative 3D Worlds","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2506.08009","citing_title":"Self Forcing: Bridging the Train-Test Gap in Autoregressive Video Diffusion","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07209","citing_title":"INSPATIO-WORLD: A Real-Time 4D World Simulator via Spatiotemporal Autoregressive Modeling","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15911","citing_title":"Efficient Video Diffusion Models: Advancements and Challenges","ref_index":281,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16299","citing_title":"Repurposing 3D Generative Model for Autoregressive Layout Generation","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21221","citing_title":"Sparse Forcing: Native Trainable Sparse Attention for Real-time Autoregressive Diffusion Video Generation","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04461","citing_title":"Stream-T1: Test-Time Scaling for Streaming Video Generation","ref_index":6,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y","json":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y.json","graph_json":"https://pith.science/api/pith-number/E7QWGGV6CXMALJDIMCEO2VEJ4Y/graph.json","events_json":"https://pith.science/api/pith-number/E7QWGGV6CXMALJDIMCEO2VEJ4Y/events.json","paper":"https://pith.science/paper/E7QWGGV6"},"agent_actions":{"view_html":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y","download_json":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y.json","view_paper":"https://pith.science/paper/E7QWGGV6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2503.19325&json=true","fetch_graph":"https://pith.science/api/pith-number/E7QWGGV6CXMALJDIMCEO2VEJ4Y/graph.json","fetch_events":"https://pith.science/api/pith-number/E7QWGGV6CXMALJDIMCEO2VEJ4Y/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y/action/timestamp_anchor","attest_storage":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y/action/storage_attestation","attest_author":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y/action/author_attestation","sign_citation":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y/action/citation_signature","submit_replication":"https://pith.science/pith/E7QWGGV6CXMALJDIMCEO2VEJ4Y/action/replication_record"}},"created_at":"2026-05-17T23:38:46.305161+00:00","updated_at":"2026-05-17T23:38:46.305161+00:00"}