{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:S4RNNHS2J66LO45JQHFDNRTL7V","short_pith_number":"pith:S4RNNHS2","canonical_record":{"source":{"id":"2409.04429","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-09-06T17:49:56Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"8f09cf539f61129398f14c71a553e592ce0ea211a778d0bc648039e95e389af0","abstract_canon_sha256":"8602611bef010053a6e1496c2659cf959a0e0989be198015128a4e05c92f1b2b"},"schema_version":"1.0"},"canonical_sha256":"9722d69e5a4fbcb773a981ca36c66bfd79ef74d71431dba298615260d16f17de","source":{"kind":"arxiv","id":"2409.04429","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2409.04429","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2409.04429v3","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2409.04429","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"S4RNNHS2J66L","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"S4RNNHS2J66LO45J","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"S4RNNHS2","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:S4RNNHS2J66LO45JQHFDNRTL7V","target":"record","payload":{"canonical_record":{"source":{"id":"2409.04429","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-09-06T17:49:56Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"8f09cf539f61129398f14c71a553e592ce0ea211a778d0bc648039e95e389af0","abstract_canon_sha256":"8602611bef010053a6e1496c2659cf959a0e0989be198015128a4e05c92f1b2b"},"schema_version":"1.0"},"canonical_sha256":"9722d69e5a4fbcb773a981ca36c66bfd79ef74d71431dba298615260d16f17de","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.625052Z","signature_b64":"gDc0LL0nB9A4hUp0CPfNF9Zbmf3Lv52fHK/EPXzKGNkkJ3txRMvGRtTPtFLyYhrsXENUyRTDkX1mWnT5hr9+CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9722d69e5a4fbcb773a981ca36c66bfd79ef74d71431dba298615260d16f17de","last_reissued_at":"2026-05-17T23:38:49.624534Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.624534Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2409.04429","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"r80eYJzDZQ7k1+A5BC7hlrFx+5Km6YHQBj/U+RlBgP3hjhRnJcfZUr5o7uiePDbENkRjXF6A587zAXfr04MKCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T03:51:18.123986Z"},"content_sha256":"27f1fd342f34c4bfc8827dd93b5eb6915b6d26315c7ef585aefb17c60a063689","schema_version":"1.0","event_id":"sha256:27f1fd342f34c4bfc8827dd93b5eb6915b6d26315c7ef585aefb17c60a063689"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:S4RNNHS2J66LO45JQHFDNRTL7V","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"VILA-U: a Unified Foundation Model Integrating Visual Understanding and Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework.","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Dacheng Li, Enze Xie, Haotian Tang, Hongxu Yin, Junyu Chen, Ligeng Zhu, Li Yi, Song Han, Yao Lu, Yecheng Wu, Yunhao Fang, Zhuoyang Zhang","submitted_at":"2024-09-06T17:49:56Z","abstract_excerpt":"VILA-U is a Unified foundation model that integrates Video, Image, Language understanding and generation. Traditional visual language models (VLMs) use separate modules for understanding and generating visual content, which can lead to misalignment and increased complexity. In contrast, VILA-U employs a single autoregressive next-token prediction framework for both tasks, eliminating the need for additional components like diffusion models. This approach not only simplifies the model but also achieves near state-of-the-art performance in visual language understanding and generation. The succes"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"VILA-U employs a single autoregressive next-token prediction framework for both visual understanding and generation tasks, eliminating the need for additional components like diffusion models while achieving near state-of-the-art performance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That a unified vision tower can sufficiently align discrete visual tokens with textual inputs during pretraining and that autoregressive generation on a high-quality dataset can reach quality comparable to diffusion models without additional architectural components.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"VILA-U unifies visual understanding and generation inside one autoregressive next-token prediction model, removing separate diffusion components while claiming near state-of-the-art results.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"c2a294a74e21a8a096bc29ebdfc711b1c187b3205535372e6fbe2e7cbc2d0e77"},"source":{"id":"2409.04429","kind":"arxiv","version":3},"verdict":{"id":"f7c57f58-ba4e-4571-9810-cf96b3e530a9","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T00:21:04.442936Z","strongest_claim":"VILA-U employs a single autoregressive next-token prediction framework for both visual understanding and generation tasks, eliminating the need for additional components like diffusion models while achieving near state-of-the-art performance.","one_line_summary":"VILA-U unifies visual understanding and generation inside one autoregressive next-token prediction model, removing separate diffusion components while claiming near state-of-the-art results.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That a unified vision tower can sufficiently align discrete visual tokens with textual inputs during pretraining and that autoregressive generation on a high-quality dataset can reach quality comparable to diffusion models without additional architectural components.","pith_extraction_headline":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework."},"references":{"count":29,"sample":[{"doi":"","year":null,"title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","work_id":"90e2b26a-3d27-4567-86b5-929b582a8034","ref_index":1,"cited_arxiv_id":"2311.12793","is_internal_anchor":true},{"doi":"","year":2023,"title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","work_id":"a7e00be2-37cb-4dde-b5e3-975e76648fac","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2009,"title":"Imagenet: A large-scale hierarchical image database","work_id":"45b681b2-db6a-46bf-803f-886e04ce487a","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Planting a seed of vision in large language model","work_id":"a97ecc74-b2ab-4837-bdc1-0a385272b7e9","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Making the V in VQA matter: Elevating the role of image understanding in Visual Question Answering","work_id":"7762af17-d49a-44a1-812e-f1ea2c0e43bf","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":29,"snapshot_sha256":"8fd6b6e175162fcf3744aaa668f76fcc7a4740414e9d8411057bfa2bdb6fe6e3","internal_anchors":13},"formal_canon":{"evidence_count":3,"snapshot_sha256":"66789827c1913901e6a5c8360049ae59ae814e956e963ec2f15f95fbcb5903c6"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"f7c57f58-ba4e-4571-9810-cf96b3e530a9"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"rog/troSSHW/JALDkYgdXgnOqqUoddIV9B7/UApIcPJ14ehtoFYbF2c0NzCa7ihcD+n1DAWXBvNaac2aK4skBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T03:51:18.124981Z"},"content_sha256":"c0928887f1ceb51b407d56d50316caeea20d1ae805bf982ed515ec26747a4ca3","schema_version":"1.0","event_id":"sha256:c0928887f1ceb51b407d56d50316caeea20d1ae805bf982ed515ec26747a4ca3"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/bundle.json","state_url":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/S4RNNHS2J66LO45JQHFDNRTL7V/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T03:51:18Z","links":{"resolver":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V","bundle":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/bundle.json","state":"https://pith.science/pith/S4RNNHS2J66LO45JQHFDNRTL7V/state.json","well_known_bundle":"https://pith.science/.well-known/pith/S4RNNHS2J66LO45JQHFDNRTL7V/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:S4RNNHS2J66LO45JQHFDNRTL7V","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"8602611bef010053a6e1496c2659cf959a0e0989be198015128a4e05c92f1b2b","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-09-06T17:49:56Z","title_canon_sha256":"8f09cf539f61129398f14c71a553e592ce0ea211a778d0bc648039e95e389af0"},"schema_version":"1.0","source":{"id":"2409.04429","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2409.04429","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2409.04429v3","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2409.04429","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"S4RNNHS2J66L","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"S4RNNHS2J66LO45J","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"S4RNNHS2","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c0928887f1ceb51b407d56d50316caeea20d1ae805bf982ed515ec26747a4ca3","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"VILA-U employs a single autoregressive next-token prediction framework for both visual understanding and generation tasks, eliminating the need for additional components like diffusion models while achieving near state-of-the-art performance."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That a unified vision tower can sufficiently align discrete visual tokens with textual inputs during pretraining and that autoregressive generation on a high-quality dataset can reach quality comparable to diffusion models without additional architectural components."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"VILA-U unifies visual understanding and generation inside one autoregressive next-token prediction model, removing separate diffusion components while claiming near state-of-the-art results."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework."}],"snapshot_sha256":"c2a294a74e21a8a096bc29ebdfc711b1c187b3205535372e6fbe2e7cbc2d0e77"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"66789827c1913901e6a5c8360049ae59ae814e956e963ec2f15f95fbcb5903c6"},"paper":{"abstract_excerpt":"VILA-U is a Unified foundation model that integrates Video, Image, Language understanding and generation. Traditional visual language models (VLMs) use separate modules for understanding and generating visual content, which can lead to misalignment and increased complexity. In contrast, VILA-U employs a single autoregressive next-token prediction framework for both tasks, eliminating the need for additional components like diffusion models. This approach not only simplifies the model but also achieves near state-of-the-art performance in visual language understanding and generation. The succes","authors_text":"Dacheng Li, Enze Xie, Haotian Tang, Hongxu Yin, Junyu Chen, Ligeng Zhu, Li Yi, Song Han, Yao Lu, Yecheng Wu, Yunhao Fang, Zhuoyang Zhang","cross_cats":["cs.LG"],"headline":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-09-06T17:49:56Z","title":"VILA-U: a Unified Foundation Model Integrating Visual Understanding and Generation"},"references":{"count":29,"internal_anchors":13,"resolved_work":29,"sample":[{"cited_arxiv_id":"2311.12793","doi":"","is_internal_anchor":true,"ref_index":1,"title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","work_id":"90e2b26a-3d27-4567-86b5-929b582a8034","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","work_id":"a7e00be2-37cb-4dde-b5e3-975e76648fac","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Imagenet: A large-scale hierarchical image database","work_id":"45b681b2-db6a-46bf-803f-886e04ce487a","year":2009},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Planting a seed of vision in large language model","work_id":"a97ecc74-b2ab-4837-bdc1-0a385272b7e9","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Making the V in VQA matter: Elevating the role of image understanding in Visual Question Answering","work_id":"7762af17-d49a-44a1-812e-f1ea2c0e43bf","year":2025}],"snapshot_sha256":"8fd6b6e175162fcf3744aaa668f76fcc7a4740414e9d8411057bfa2bdb6fe6e3"},"source":{"id":"2409.04429","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-16T00:21:04.442936Z","id":"f7c57f58-ba4e-4571-9810-cf96b3e530a9","model_set":{"reader":"grok-4.3"},"one_line_summary":"VILA-U unifies visual understanding and generation inside one autoregressive next-token prediction model, removing separate diffusion components while claiming near state-of-the-art results.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"VILA-U integrates visual understanding and generation using a single autoregressive next-token prediction framework.","strongest_claim":"VILA-U employs a single autoregressive next-token prediction framework for both visual understanding and generation tasks, eliminating the need for additional components like diffusion models while achieving near state-of-the-art performance.","weakest_assumption":"That a unified vision tower can sufficiently align discrete visual tokens with textual inputs during pretraining and that autoregressive generation on a high-quality dataset can reach quality comparable to diffusion models without additional architectural components."}},"verdict_id":"f7c57f58-ba4e-4571-9810-cf96b3e530a9"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:27f1fd342f34c4bfc8827dd93b5eb6915b6d26315c7ef585aefb17c60a063689","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"8602611bef010053a6e1496c2659cf959a0e0989be198015128a4e05c92f1b2b","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-09-06T17:49:56Z","title_canon_sha256":"8f09cf539f61129398f14c71a553e592ce0ea211a778d0bc648039e95e389af0"},"schema_version":"1.0","source":{"id":"2409.04429","kind":"arxiv","version":3}},"canonical_sha256":"9722d69e5a4fbcb773a981ca36c66bfd79ef74d71431dba298615260d16f17de","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"9722d69e5a4fbcb773a981ca36c66bfd79ef74d71431dba298615260d16f17de","first_computed_at":"2026-05-17T23:38:49.624534Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.624534Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"gDc0LL0nB9A4hUp0CPfNF9Zbmf3Lv52fHK/EPXzKGNkkJ3txRMvGRtTPtFLyYhrsXENUyRTDkX1mWnT5hr9+CQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.625052Z","signed_message":"canonical_sha256_bytes"},"source_id":"2409.04429","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:27f1fd342f34c4bfc8827dd93b5eb6915b6d26315c7ef585aefb17c60a063689","sha256:c0928887f1ceb51b407d56d50316caeea20d1ae805bf982ed515ec26747a4ca3"],"state_sha256":"0fc107df7778873a1b64b538add5b2e5795dc46060a09aada4332afade2b704c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"LVJ3Huqz9DugIFL98/aT2nqzy57rXAXtsMvWXpSxggHbn5PDSm9U0T92pGFU7kPE3qpA65gDUES4DvkjVVvoBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T03:51:18.128874Z","bundle_sha256":"53f1d265b8ac2febd1bafc5e6f53b4309baf0bfff4788aea31e734360ff00a96"}}