{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2020:YGSCZ6HP7THERR5I76AYIWH2JK","short_pith_number":"pith:YGSCZ6HP","schema_version":"1.0","canonical_sha256":"c1a42cf8effcce48c7a8ff818458fa4a8ef6bb986ef5203b36d9700e2d24e41f","source":{"kind":"arxiv","id":"2005.00341","version":1},"attestation_state":"computed","paper":{"title":"Jukebox: A Generative Model for Music","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","cs.SD","stat.ML"],"primary_cat":"eess.AS","authors_text":"Alec Radford, Christine Payne, Heewoo Jun, Ilya Sutskever, Jong Wook Kim, Prafulla Dhariwal","submitted_at":"2020-04-30T09:02:45Z","abstract_excerpt":"We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multi-scale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples at https://jukebox.openai.com, along "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2005.00341","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"eess.AS","submitted_at":"2020-04-30T09:02:45Z","cross_cats_sorted":["cs.LG","cs.SD","stat.ML"],"title_canon_sha256":"21679c906f6aa7884006a458a5dcffcad1679832075f09590e56e6c46b508463","abstract_canon_sha256":"fd9e76e17eb4a9535737c3466ddd7f7102684fa5c1e83cba7b2986a4268bcdfd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:22.060939Z","signature_b64":"RS923LwBM85ZXj7bzBZ9CWPgLFVLKBI8eGZ5UcdG633bun5d9VSg8T+kq2UJ4cZiigZE5vLeasWl93h9LT2qDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c1a42cf8effcce48c7a8ff818458fa4a8ef6bb986ef5203b36d9700e2d24e41f","last_reissued_at":"2026-05-17T23:39:22.060196Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:22.060196Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Jukebox: A Generative Model for Music","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","cs.SD","stat.ML"],"primary_cat":"eess.AS","authors_text":"Alec Radford, Christine Payne, Heewoo Jun, Ilya Sutskever, Jong Wook Kim, Prafulla Dhariwal","submitted_at":"2020-04-30T09:02:45Z","abstract_excerpt":"We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multi-scale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples at https://jukebox.openai.com, along "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2005.00341","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2005.00341","created_at":"2026-05-17T23:39:22.060329+00:00"},{"alias_kind":"arxiv_version","alias_value":"2005.00341v1","created_at":"2026-05-17T23:39:22.060329+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2005.00341","created_at":"2026-05-17T23:39:22.060329+00:00"},{"alias_kind":"pith_short_12","alias_value":"YGSCZ6HP7THE","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"YGSCZ6HP7THERR5I","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"YGSCZ6HP","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":40,"internal_anchor_count":15,"sample":[{"citing_arxiv_id":"2502.18309","citing_title":"GCDance: Genre-Controlled Music-Driven 3D Full Body Dance Generation","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2505.08203","citing_title":"Not that Groove: Zero-Shot Symbolic Music Editing","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2512.01537","citing_title":"Two-Dimensional Quantization for Geometry-Aware Audio Coding","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2603.03190","citing_title":"Expectation and Acoustic Neural Network Representations Enhance Music Identification from Brain Activity","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2505.24437","citing_title":"SwitchCodec: A High-Fidelity Nerual Audio Codec With Sparse Quantization","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2211.15089","citing_title":"Continuous diffusion for categorical data","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2309.15505","citing_title":"Finite Scalar Quantization: VQ-VAE Made Simple","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2601.03612","citing_title":"Mathematical Foundations of Polyphonic Music Generation via Structural Inductive Bias","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2305.02463","citing_title":"Shap-E: Generating Conditional 3D Implicit Functions","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2412.02612","citing_title":"GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2602.22029","citing_title":"MIDI-Informed Singing Accompaniment Generation in a Compositional Song Pipeline","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2603.07956","citing_title":"From Daily Song to Daily Self: Supporting Reflective Songwriting of Deaf and Hard-of-Hearing Individuals through Generative Music AI","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2211.15657","citing_title":"Is Conditional Generative Modeling all you need for Decision-Making?","ref_index":195,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13789","citing_title":"ENSEMBITS: an alphabet of protein conformational ensembles","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13789","citing_title":"ENSEMBITS: an alphabet of protein conformational ensembles","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2604.03310","citing_title":"Diffusion Path Alignment for Long-Range Motion Generation and Domain Transitions","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2210.13438","citing_title":"High Fidelity Neural Audio Compression","ref_index":11,"is_internal_anchor":false},{"citing_arxiv_id":"2104.10157","citing_title":"VideoGPT: Video Generation using VQ-VAE and Transformers","ref_index":11,"is_internal_anchor":false},{"citing_arxiv_id":"2301.11325","citing_title":"MusicLM: Generating Music From Text","ref_index":6,"is_internal_anchor":false},{"citing_arxiv_id":"2105.05233","citing_title":"Diffusion Models Beat GANs on Image Synthesis","ref_index":13,"is_internal_anchor":false},{"citing_arxiv_id":"2010.14701","citing_title":"Scaling Laws for Autoregressive Generative Modeling","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06870","citing_title":"Continuous First, Discrete Later: VQ-VAEs Without Dimensional Collapse","ref_index":2,"is_internal_anchor":false},{"citing_arxiv_id":"2207.04672","citing_title":"No Language Left Behind: Scaling Human-Centered Machine Translation","ref_index":8,"is_internal_anchor":false},{"citing_arxiv_id":"2605.03929","citing_title":"PHALAR: Phasors for Learned Musical Audio Representations","ref_index":39,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK","json":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK.json","graph_json":"https://pith.science/api/pith-number/YGSCZ6HP7THERR5I76AYIWH2JK/graph.json","events_json":"https://pith.science/api/pith-number/YGSCZ6HP7THERR5I76AYIWH2JK/events.json","paper":"https://pith.science/paper/YGSCZ6HP"},"agent_actions":{"view_html":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK","download_json":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK.json","view_paper":"https://pith.science/paper/YGSCZ6HP","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2005.00341&json=true","fetch_graph":"https://pith.science/api/pith-number/YGSCZ6HP7THERR5I76AYIWH2JK/graph.json","fetch_events":"https://pith.science/api/pith-number/YGSCZ6HP7THERR5I76AYIWH2JK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK/action/storage_attestation","attest_author":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK/action/author_attestation","sign_citation":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK/action/citation_signature","submit_replication":"https://pith.science/pith/YGSCZ6HP7THERR5I76AYIWH2JK/action/replication_record"}},"created_at":"2026-05-17T23:39:22.060329+00:00","updated_at":"2026-05-17T23:39:22.060329+00:00"}