{"work":{"id":"703c74c3-fa5e-455c-8c00-697c83511fcf","openalex_id":null,"doi":null,"arxiv_id":"2104.10157","raw_key":null,"title":"VideoGPT: Video Generation using VQ-VAE and Transformers","authors":null,"authors_text":"Wilson Yan, Yunzhi Zhang, Pieter Abbeel, Aravind Srinivas","year":2021,"venue":"cs.CV","abstract":"We present VideoGPT: a conceptually simple architecture for scaling likelihood based generative modeling to natural videos. VideoGPT uses VQ-VAE that learns downsampled discrete latent representations of a raw video by employing 3D convolutions and axial self-attention. A simple GPT-like architecture is then used to autoregressively model the discrete latents using spatio-temporal position encodings. Despite the simplicity in formulation and ease of training, our architecture is able to generate samples competitive with state-of-the-art GAN models for video generation on the BAIR Robot dataset, and generate high fidelity natural videos from UCF-101 and Tumbler GIF Dataset (TGIF). We hope our proposed architecture serves as a reproducible reference for a minimalistic implementation of transformer based video generation models. Samples and code are available at https://wilson1yan.github.io/videogpt/index.html","external_url":"https://arxiv.org/abs/2104.10157","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-23T17:23:14.982783+00:00","pith_arxiv_id":"2104.10157","created_at":"2026-05-09T06:40:39.456394+00:00","updated_at":"2026-05-23T17:23:14.982783+00:00","title_quality_ok":true,"display_title":"VideoGPT: Video Generation using VQ-VAE and Transformers","render_title":"VideoGPT: Video Generation using VQ-VAE and Transformers"},"hub":{"state":{"work_id":"703c74c3-fa5e-455c-8c00-697c83511fcf","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":42,"external_cited_by_count":null,"distinct_field_count":8,"first_pith_cited_at":"2021-12-20T18:55:25+00:00","last_pith_cited_at":"2026-05-20T17:59:10+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-28T17:38:49.616545+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":12},{"context_role":"baseline","n":2}],"polarity_counts":[{"context_polarity":"background","n":11},{"context_polarity":"baseline","n":2},{"context_polarity":"unclear","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}