{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:S4RNNHS2J66LO45JQHFDNRTL7V","short_pith_number":"pith:S4RNNHS2","schema_version":"1.0","canonical_sha256":"9722d69e5a4fbcb773a981ca36c66bfd79ef74d71431dba298615260d16f17de","source":{"kind":"arxiv","id":"2409.04429","version":3},"attestation_state":"computed","paper":{"title":"VILA-U: a Unified Foundation Model Integrating Visual Understanding and Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework.","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Dacheng Li, Enze Xie, Haotian Tang, Hongxu Yin, Junyu Chen, Ligeng Zhu, Li Yi, Song Han, Yao Lu, Yecheng Wu, Yunhao Fang, Zhuoyang Zhang","submitted_at":"2024-09-06T17:49:56Z","abstract_excerpt":"VILA-U is a Unified foundation model that integrates Video, Image, Language understanding and generation. Traditional visual language models (VLMs) use separate modules for understanding and generating visual content, which can lead to misalignment and increased complexity. In contrast, VILA-U employs a single autoregressive next-token prediction framework for both tasks, eliminating the need for additional components like diffusion models. This approach not only simplifies the model but also achieves near state-of-the-art performance in visual language understanding and generation. The succes"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2409.04429","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-09-06T17:49:56Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"8f09cf539f61129398f14c71a553e592ce0ea211a778d0bc648039e95e389af0","abstract_canon_sha256":"8602611bef010053a6e1496c2659cf959a0e0989be198015128a4e05c92f1b2b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.625052Z","signature_b64":"gDc0LL0nB9A4hUp0CPfNF9Zbmf3Lv52fHK/EPXzKGNkkJ3txRMvGRtTPtFLyYhrsXENUyRTDkX1mWnT5hr9+CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9722d69e5a4fbcb773a981ca36c66bfd79ef74d71431dba298615260d16f17de","last_reissued_at":"2026-05-17T23:38:49.624534Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.624534Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"VILA-U: a Unified Foundation Model Integrating Visual Understanding and Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework.","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Dacheng Li, Enze Xie, Haotian Tang, Hongxu Yin, Junyu Chen, Ligeng Zhu, Li Yi, Song Han, Yao Lu, Yecheng Wu, Yunhao Fang, Zhuoyang Zhang","submitted_at":"2024-09-06T17:49:56Z","abstract_excerpt":"VILA-U is a Unified foundation model that integrates Video, Image, Language understanding and generation. Traditional visual language models (VLMs) use separate modules for understanding and generating visual content, which can lead to misalignment and increased complexity. In contrast, VILA-U employs a single autoregressive next-token prediction framework for both tasks, eliminating the need for additional components like diffusion models. This approach not only simplifies the model but also achieves near state-of-the-art performance in visual language understanding and generation. The succes"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"VILA-U employs a single autoregressive next-token prediction framework for both visual understanding and generation tasks, eliminating the need for additional components like diffusion models while achieving near state-of-the-art performance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That a unified vision tower can sufficiently align discrete visual tokens with textual inputs during pretraining and that autoregressive generation on a high-quality dataset can reach quality comparable to diffusion models without additional architectural components.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"VILA-U unifies visual understanding and generation inside one autoregressive next-token prediction model, removing separate diffusion components while claiming near state-of-the-art results.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"c2a294a74e21a8a096bc29ebdfc711b1c187b3205535372e6fbe2e7cbc2d0e77"},"source":{"id":"2409.04429","kind":"arxiv","version":3},"verdict":{"id":"f7c57f58-ba4e-4571-9810-cf96b3e530a9","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T00:21:04.442936Z","strongest_claim":"VILA-U employs a single autoregressive next-token prediction framework for both visual understanding and generation tasks, eliminating the need for additional components like diffusion models while achieving near state-of-the-art performance.","one_line_summary":"VILA-U unifies visual understanding and generation inside one autoregressive next-token prediction model, removing separate diffusion components while claiming near state-of-the-art results.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That a unified vision tower can sufficiently align discrete visual tokens with textual inputs during pretraining and that autoregressive generation on a high-quality dataset can reach quality comparable to diffusion models without additional architectural components.","pith_extraction_headline":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework."},"references":{"count":29,"sample":[{"doi":"","year":null,"title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","work_id":"90e2b26a-3d27-4567-86b5-929b582a8034","ref_index":1,"cited_arxiv_id":"2311.12793","is_internal_anchor":true},{"doi":"","year":2023,"title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","work_id":"a7e00be2-37cb-4dde-b5e3-975e76648fac","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2009,"title":"Imagenet: A large-scale hierarchical image database","work_id":"45b681b2-db6a-46bf-803f-886e04ce487a","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Planting a seed of vision in large language model","work_id":"a97ecc74-b2ab-4837-bdc1-0a385272b7e9","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Making the V in VQA matter: Elevating the role of image understanding in Visual Question Answering","work_id":"7762af17-d49a-44a1-812e-f1ea2c0e43bf","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":29,"snapshot_sha256":"8fd6b6e175162fcf3744aaa668f76fcc7a4740414e9d8411057bfa2bdb6fe6e3","internal_anchors":13},"formal_canon":{"evidence_count":3,"snapshot_sha256":"66789827c1913901e6a5c8360049ae59ae814e956e963ec2f15f95fbcb5903c6"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2409.04429","created_at":"2026-05-17T23:38:49.624620+00:00"},{"alias_kind":"arxiv_version","alias_value":"2409.04429v3","created_at":"2026-05-17T23:38:49.624620+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2409.04429","created_at":"2026-05-17T23:38:49.624620+00:00"},{"alias_kind":"pith_short_12","alias_value":"S4RNNHS2J66L","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"S4RNNHS2J66LO45J","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"S4RNNHS2","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":35,"internal_anchor_count":35,"sample":[{"citing_arxiv_id":"2507.23372","citing_title":"UniEmo: Unifying Emotional Understanding and Generation with Learnable Expert Queries","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21487","citing_title":"Uni-Edit: Intelligent Editing Is A General Task For Unified Model Tuning","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2503.14324","citing_title":"DualToken: Towards Unifying Visual Understanding and Generation with Dual Visual Vocabularies","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2505.17726","citing_title":"Slot-MLLM: Object-Centric Visual Tokenization for Multimodal LLM","ref_index":75,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18678","citing_title":"Lance: Unified Multimodal Modeling by Multi-Task Synergy","ref_index":132,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21487","citing_title":"Uni-Edit: Intelligent Editing Is A General Task For Unified Model Tuning","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18115","citing_title":"WinTok: A Win-Win Hybrid Tokenizer via Decomposing Visual Understanding and Generation with Transferable Tokens","ref_index":93,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18678","citing_title":"Lance: Unified Multimodal Modeling by Multi-Task Synergy","ref_index":131,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18714","citing_title":"Semantic Generative Tuning for Unified Multimodal Models","ref_index":75,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16961","citing_title":"Latent Action Control for Reasoning-Guided Unified Image Generation","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2509.21912","citing_title":"Discrete Guidance Matching: Exact Guidance for Discrete Flow Matching","ref_index":81,"is_internal_anchor":true},{"citing_arxiv_id":"2412.14164","citing_title":"MetaMorph: Multimodal Understanding and Generation via Instruction Tuning","ref_index":300,"is_internal_anchor":true},{"citing_arxiv_id":"2505.05472","citing_title":"Mogao: An Omni Foundation Model for Interleaved Multi-Modal Generation","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"2511.22663","citing_title":"AIA: Rethinking Architecture Decoupling Strategy In Unified Multimodal Model","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2512.10941","citing_title":"Mull-Tokens: Modality-Agnostic Latent Thinking","ref_index":64,"is_internal_anchor":true},{"citing_arxiv_id":"2602.06663","citing_title":"PlanViz: Evaluating Planning-Oriented Image Generation and Editing for Computer-Use Tasks","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2503.22020","citing_title":"CoT-VLA: Visual Chain-of-Thought Reasoning for Vision-Language-Action Models","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2404.14396","citing_title":"SEED-X: Multimodal Models with Unified Multi-granularity Comprehension and Generation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2410.13848","citing_title":"Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation","ref_index":85,"is_internal_anchor":true},{"citing_arxiv_id":"2503.10631","citing_title":"HybridVLA: Collaborative Diffusion and Autoregression in a Unified Vision-Language-Action Model","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2503.07265","citing_title":"WISE: A World Knowledge-Informed Semantic Evaluation for Text-to-Image Generation","ref_index":54,"is_internal_anchor":true},{"citing_arxiv_id":"2505.15809","citing_title":"MMaDA: Multimodal Large Diffusion Language Models","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12500","citing_title":"SenseNova-U1: Unifying Multimodal Understanding and Generation with NEO-unify Architecture","ref_index":143,"is_internal_anchor":true},{"citing_arxiv_id":"2506.15564","citing_title":"Show-o2: Improved Native Unified Multimodal Models","ref_index":123,"is_internal_anchor":true},{"citing_arxiv_id":"2505.09568","citing_title":"BLIP3-o: A Family of Fully Open Unified Multimodal Models-Architecture, Training and Dataset","ref_index":36,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":3,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V","json":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V.json","graph_json":"https://pith.science/api/pith-number/S4RNNHS2J66LO45JQHFDNRTL7V/graph.json","events_json":"https://pith.science/api/pith-number/S4RNNHS2J66LO45JQHFDNRTL7V/events.json","paper":"https://pith.science/paper/S4RNNHS2"},"agent_actions":{"view_html":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V","download_json":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V.json","view_paper":"https://pith.science/paper/S4RNNHS2","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2409.04429&json=true","fetch_graph":"https://pith.science/api/pith-number/S4RNNHS2J66LO45JQHFDNRTL7V/graph.json","fetch_events":"https://pith.science/api/pith-number/S4RNNHS2J66LO45JQHFDNRTL7V/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/action/timestamp_anchor","attest_storage":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/action/storage_attestation","attest_author":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/action/author_attestation","sign_citation":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/action/citation_signature","submit_replication":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/action/replication_record"}},"created_at":"2026-05-17T23:38:49.624620+00:00","updated_at":"2026-05-17T23:38:49.624620+00:00"}