{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:3SHOO2AF5NF3QVQJG53NMPI7Q5","short_pith_number":"pith:3SHOO2AF","schema_version":"1.0","canonical_sha256":"dc8ee76805eb4bb856093776d63d1f874ed946dd2752342b375e28402104b229","source":{"kind":"arxiv","id":"1711.00937","version":2},"attestation_state":"computed","paper":{"title":"Neural Discrete Representation Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Aaron van den Oord, Koray Kavukcuoglu, Oriol Vinyals","submitted_at":"2017-11-02T21:14:44Z","abstract_excerpt":"Learning useful representations without supervision remains a key challenge in machine learning. In this paper, we propose a simple yet powerful generative model that learns such discrete representations. Our model, the Vector Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways: the encoder network outputs discrete, rather than continuous, codes; and the prior is learnt rather than static. In order to learn a discrete latent representation, we incorporate ideas from vector quantisation (VQ). Using the VQ method allows the model to circumvent issues of \"posterior colla"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1711.00937","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2017-11-02T21:14:44Z","cross_cats_sorted":[],"title_canon_sha256":"4b02e7f75d8fb0a0110d8f7284cfeb667abde7dff14bfc151ea589e660ef7aa3","abstract_canon_sha256":"aabf5350c2855ebd931aa448ceb98e96ce9c512bc4a1a3564b8ce30e166d8a4a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:14:40.798036Z","signature_b64":"N6CibVAUIRG00C4di8Svb3G7yPDL+ARm0iXbsQHEH4IKihG9gTVWlZIluHpQjTv6NDun3EELOozkXzLgVwFPDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dc8ee76805eb4bb856093776d63d1f874ed946dd2752342b375e28402104b229","last_reissued_at":"2026-05-18T00:14:40.797504Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:14:40.797504Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Neural Discrete Representation Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Aaron van den Oord, Koray Kavukcuoglu, Oriol Vinyals","submitted_at":"2017-11-02T21:14:44Z","abstract_excerpt":"Learning useful representations without supervision remains a key challenge in machine learning. In this paper, we propose a simple yet powerful generative model that learns such discrete representations. Our model, the Vector Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways: the encoder network outputs discrete, rather than continuous, codes; and the prior is learnt rather than static. In order to learn a discrete latent representation, we incorporate ideas from vector quantisation (VQ). Using the VQ method allows the model to circumvent issues of \"posterior colla"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1711.00937","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1711.00937","created_at":"2026-05-18T00:14:40.797596+00:00"},{"alias_kind":"arxiv_version","alias_value":"1711.00937v2","created_at":"2026-05-18T00:14:40.797596+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1711.00937","created_at":"2026-05-18T00:14:40.797596+00:00"},{"alias_kind":"pith_short_12","alias_value":"3SHOO2AF5NF3","created_at":"2026-05-18T12:30:58.224056+00:00"},{"alias_kind":"pith_short_16","alias_value":"3SHOO2AF5NF3QVQJ","created_at":"2026-05-18T12:30:58.224056+00:00"},{"alias_kind":"pith_short_8","alias_value":"3SHOO2AF","created_at":"2026-05-18T12:30:58.224056+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":28,"internal_anchor_count":14,"sample":[{"citing_arxiv_id":"1907.06286","citing_title":"Autoencoding sensory substitution","ref_index":196,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21977","citing_title":"Video as Natural Augmentation: Towards Unified AI-Generated Image and Video Detection","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16165","citing_title":"Second-Order Multi-Level Variance Correction for Modality Competition in Multimodal Models","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16899","citing_title":"LASAR: Towards Spatio-temporal Reasoning with Latent Cognitive Map","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2508.09691","citing_title":"PaCo-FR: Patch-Pixel Aligned End-to-End Codebook Learning for Facial Representation Pre-training","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2110.04627","citing_title":"Vector-quantized Image Modeling with Improved VQGAN","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2305.02463","citing_title":"Shap-E: Generating Conditional 3D Implicit Functions","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2211.15657","citing_title":"Is Conditional Generative Modeling all you need for Decision-Making?","ref_index":207,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13789","citing_title":"ENSEMBITS: an alphabet of protein conformational ensembles","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2211.13221","citing_title":"Latent Video Diffusion Models for High-Fidelity Long Video Generation","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28816","citing_title":"ASTRA: Mapping Art-Technology Institutions via Conceptual Axes, Text Embeddings, and Unsupervised Clustering","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13789","citing_title":"ENSEMBITS: an alphabet of protein conformational ensembles","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2310.16828","citing_title":"TD-MPC2: Scalable, Robust World Models for Continuous Control","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2604.04974","citing_title":"From Video to Control: A Survey of Learning Manipulation Interfaces from Temporal Visual Data","ref_index":89,"is_internal_anchor":false},{"citing_arxiv_id":"2105.05233","citing_title":"Diffusion Models Beat GANs on Image Synthesis","ref_index":65,"is_internal_anchor":false},{"citing_arxiv_id":"2010.14701","citing_title":"Scaling Laws for Autoregressive Generative Modeling","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10046","citing_title":"PixelFlowCast: Latent-Free Precipitation Nowcasting via Pixel Mean Flows","ref_index":39,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09886","citing_title":"Network-Efficient World Model Token Streaming","ref_index":3,"is_internal_anchor":false},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":56,"is_internal_anchor":false},{"citing_arxiv_id":"2604.21035","citing_title":"Masked-Token Prediction for Anomaly Detection at the Large Hadron Collider","ref_index":16,"is_internal_anchor":false},{"citing_arxiv_id":"2604.10471","citing_title":"SID-Coord: Coordinating Semantic IDs for ID-based Ranking in Short-Video Search","ref_index":16,"is_internal_anchor":false},{"citing_arxiv_id":"2501.09747","citing_title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","ref_index":60,"is_internal_anchor":false},{"citing_arxiv_id":"2112.10741","citing_title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models","ref_index":28,"is_internal_anchor":false},{"citing_arxiv_id":"2605.07230","citing_title":"CASCADE: Context-Aware Relaxation for Speculative Image Decoding","ref_index":45,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5","json":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5.json","graph_json":"https://pith.science/api/pith-number/3SHOO2AF5NF3QVQJG53NMPI7Q5/graph.json","events_json":"https://pith.science/api/pith-number/3SHOO2AF5NF3QVQJG53NMPI7Q5/events.json","paper":"https://pith.science/paper/3SHOO2AF"},"agent_actions":{"view_html":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5","download_json":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5.json","view_paper":"https://pith.science/paper/3SHOO2AF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1711.00937&json=true","fetch_graph":"https://pith.science/api/pith-number/3SHOO2AF5NF3QVQJG53NMPI7Q5/graph.json","fetch_events":"https://pith.science/api/pith-number/3SHOO2AF5NF3QVQJG53NMPI7Q5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5/action/storage_attestation","attest_author":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5/action/author_attestation","sign_citation":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5/action/citation_signature","submit_replication":"https://pith.science/pith/3SHOO2AF5NF3QVQJG53NMPI7Q5/action/replication_record"}},"created_at":"2026-05-18T00:14:40.797596+00:00","updated_at":"2026-05-18T00:14:40.797596+00:00"}