{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:LKVB4U57LAQJSHMPPXTP3IJP7V","short_pith_number":"pith:LKVB4U57","schema_version":"1.0","canonical_sha256":"5aaa1e53bf5820991d8f7de6fda12ffd5cfecb6fa8642e8ece94b926dded7a67","source":{"kind":"arxiv","id":"2402.15391","version":1},"attestation_state":"computed","paper":{"title":"Genie: Generative Interactive Environments","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CV"],"primary_cat":"cs.LG","authors_text":"Aditi Mavalankar, Ashley Edwards, Chris Apps, Edward Hughes, Feryal Behbahani, Jack Parker-Holder, Jake Bruce, Jeff Clune, Jingwei Zhang, Konrad Zolna, Lucy Gonzalez, Matthew Lai, Michael Dennis, Nando de Freitas, Nicolas Heess, Richie Steigerwald, Sarah Bechtle, Satinder Singh, Scott Reed, Sherjil Ozair, Simon Osindero, Stephanie Chan, Tim Rockt\\\"aschel, Yuge Shi, Yusuf Aytar","submitted_at":"2024-02-23T15:47:26Z","abstract_excerpt":"We introduce Genie, the first generative interactive environment trained in an unsupervised manner from unlabelled Internet videos. The model can be prompted to generate an endless variety of action-controllable virtual worlds described through text, synthetic images, photographs, and even sketches. At 11B parameters, Genie can be considered a foundation world model. It is comprised of a spatiotemporal video tokenizer, an autoregressive dynamics model, and a simple and scalable latent action model. Genie enables users to act in the generated environments on a frame-by-frame basis despite train"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2402.15391","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-02-23T15:47:26Z","cross_cats_sorted":["cs.AI","cs.CV"],"title_canon_sha256":"ceb1d8e544bf068d31145289572b33990ace023caf700f08f2fc0fb95aa81f14","abstract_canon_sha256":"f00f7109f40f4d8604d74af369e761fc4e67230f3a4f2937e01b5284e1bf71dc"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T07:48:39.886202Z","signature_b64":"ENN/Vtot0/qx51opVh7a+TEHbaVjp8nUlAgmhS3NfS/I8pd26prAfBauQlLPakB5EeE+ptx73u/xWnqohlnsDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5aaa1e53bf5820991d8f7de6fda12ffd5cfecb6fa8642e8ece94b926dded7a67","last_reissued_at":"2026-07-05T07:48:39.885834Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T07:48:39.885834Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Genie: Generative Interactive Environments","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CV"],"primary_cat":"cs.LG","authors_text":"Aditi Mavalankar, Ashley Edwards, Chris Apps, Edward Hughes, Feryal Behbahani, Jack Parker-Holder, Jake Bruce, Jeff Clune, Jingwei Zhang, Konrad Zolna, Lucy Gonzalez, Matthew Lai, Michael Dennis, Nando de Freitas, Nicolas Heess, Richie Steigerwald, Sarah Bechtle, Satinder Singh, Scott Reed, Sherjil Ozair, Simon Osindero, Stephanie Chan, Tim Rockt\\\"aschel, Yuge Shi, Yusuf Aytar","submitted_at":"2024-02-23T15:47:26Z","abstract_excerpt":"We introduce Genie, the first generative interactive environment trained in an unsupervised manner from unlabelled Internet videos. The model can be prompted to generate an endless variety of action-controllable virtual worlds described through text, synthetic images, photographs, and even sketches. At 11B parameters, Genie can be considered a foundation world model. It is comprised of a spatiotemporal video tokenizer, an autoregressive dynamics model, and a simple and scalable latent action model. Genie enables users to act in the generated environments on a frame-by-frame basis despite train"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2402.15391","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2402.15391/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2402.15391","created_at":"2026-07-05T07:48:39.885891+00:00"},{"alias_kind":"arxiv_version","alias_value":"2402.15391v1","created_at":"2026-07-05T07:48:39.885891+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2402.15391","created_at":"2026-07-05T07:48:39.885891+00:00"},{"alias_kind":"pith_short_12","alias_value":"LKVB4U57LAQJ","created_at":"2026-07-05T07:48:39.885891+00:00"},{"alias_kind":"pith_short_16","alias_value":"LKVB4U57LAQJSHMP","created_at":"2026-07-05T07:48:39.885891+00:00"},{"alias_kind":"pith_short_8","alias_value":"LKVB4U57","created_at":"2026-07-05T07:48:39.885891+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":33,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2606.24669","citing_title":"LaGO: Latent Action Guidance for Online Reinforcement Learning","ref_index":3,"is_internal_anchor":false},{"citing_arxiv_id":"2606.21139","citing_title":"PoLAR: Factorizing Extent and Mode in Latent Actions for Robot Policy Learning","ref_index":4,"is_internal_anchor":false},{"citing_arxiv_id":"2606.18828","citing_title":"Space Is Intelligence: Neural Semigroup Superposition for Riemannian Metric Generation","ref_index":2,"is_internal_anchor":false},{"citing_arxiv_id":"2606.18558","citing_title":"MolmoMotion: Forecasting Point Trajectories in 3D with Language Instruction","ref_index":11,"is_internal_anchor":false},{"citing_arxiv_id":"2606.18697","citing_title":"Stealthy World Model Manipulation via Data Poisoning","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2607.02075","citing_title":"HandsOnWorld: Unconstrained Egocentric Video Generation with Camera-Disentangled Hand Control","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2606.13053","citing_title":"EA-WM: Event-Aware World Models with Task-Specification Grounding for Long-Horizon Manipulation","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2606.12783","citing_title":"A Tutorial on World Models and Physical AI","ref_index":7,"is_internal_anchor":false},{"citing_arxiv_id":"2606.12403","citing_title":"World Pilot: Steering Vision-Language-Action Models with World-Action Priors","ref_index":31,"is_internal_anchor":false},{"citing_arxiv_id":"2607.00673","citing_title":"Path Planning in Physically Viable World Models","ref_index":24,"is_internal_anchor":false},{"citing_arxiv_id":"2606.02486","citing_title":"Intercepting the Future: Latent-Space Predictive World Model for Dynamic VLA Manipulation","ref_index":12,"is_internal_anchor":false},{"citing_arxiv_id":"2606.02586","citing_title":"Fewer, Better Frames: A Compute-Normalized Proof of Concept for Coherence-First World-Model Rendering with Model-Guided FSR4 Frame Generation","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2606.30292","citing_title":"DreamForge-World 0.1 Preview: A Low-Compute Real-Time Controllable World Model","ref_index":2,"is_internal_anchor":false},{"citing_arxiv_id":"2606.00133","citing_title":"World Models: A Comprehensive Survey of Architectures, Methodologies, Reasoning Paradigms, and Applications","ref_index":13,"is_internal_anchor":false},{"citing_arxiv_id":"2605.29360","citing_title":"MiraBench: Evaluating Action-Conditioned Reliability in Robotic World Models","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2605.20223","citing_title":"Why Latent Actions Fail, and How to Prevent It","ref_index":6,"is_internal_anchor":false},{"citing_arxiv_id":"2605.20299","citing_title":"Mechanisms of Misgeneralization in Physical Sequence Modeling","ref_index":150,"is_internal_anchor":false},{"citing_arxiv_id":"2406.03520","citing_title":"VideoPhy: Evaluating Physical Commonsense for Video Generation","ref_index":18,"is_internal_anchor":false},{"citing_arxiv_id":"2411.04983","citing_title":"DINO-WM: World Models on Pre-trained Visual Features enable Zero-shot Planning","ref_index":7,"is_internal_anchor":false},{"citing_arxiv_id":"2505.15659","citing_title":"FLARE: Robot Learning with Implicit World Modeling","ref_index":59,"is_internal_anchor":false},{"citing_arxiv_id":"2408.14837","citing_title":"Diffusion Models Are Real-Time Game Engines","ref_index":61,"is_internal_anchor":false},{"citing_arxiv_id":"2601.23286","citing_title":"VideoGPA: Distilling Geometry Priors for 3D-Consistent Video Generation","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2602.02958","citing_title":"Quant VideoGen: Auto-Regressive Long Video Generation via 2-Bit KV-Cache Quantization","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2505.12705","citing_title":"DreamGen: Unlocking Generalization in Robot Learning through Video World Models","ref_index":71,"is_internal_anchor":false},{"citing_arxiv_id":"2602.20231","citing_title":"UniLACT: Depth-Aware RGB Latent Action Learning for Vision-Language-Action Models","ref_index":6,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V","json":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V.json","graph_json":"https://pith.science/api/pith-number/LKVB4U57LAQJSHMPPXTP3IJP7V/graph.json","events_json":"https://pith.science/api/pith-number/LKVB4U57LAQJSHMPPXTP3IJP7V/events.json","paper":"https://pith.science/paper/LKVB4U57"},"agent_actions":{"view_html":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V","download_json":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V.json","view_paper":"https://pith.science/paper/LKVB4U57","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2402.15391&json=true","fetch_graph":"https://pith.science/api/pith-number/LKVB4U57LAQJSHMPPXTP3IJP7V/graph.json","fetch_events":"https://pith.science/api/pith-number/LKVB4U57LAQJSHMPPXTP3IJP7V/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V/action/storage_attestation","attest_author":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V/action/author_attestation","sign_citation":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V/action/citation_signature","submit_replication":"https://pith.science/pith/LKVB4U57LAQJSHMPPXTP3IJP7V/action/replication_record"}},"created_at":"2026-07-05T07:48:39.885891+00:00","updated_at":"2026-07-05T07:48:39.885891+00:00"}