{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:NOE5TNB5M3JEXV2H2SFPTBCYBU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"37ebe6436300552155727322e1e80dae910b976364d4b35c782a49b466a7c6a3","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-08T17:58:57Z","title_canon_sha256":"2901d2f3db0f233e2de4c877ae6dbb6888068a827bb4c6b701deff4db654f564"},"schema_version":"1.0","source":{"id":"2505.05472","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2505.05472","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2505.05472v2","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.05472","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"NOE5TNB5M3JE","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"NOE5TNB5M3JEXV2H","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"NOE5TNB5","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:01f379633abed6b146a9b6553dd584569a4f21b8f7ccff58619833c472893885","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Mogao not only achieves state-of-the-art performance in multi-modal understanding and text-to-image generation, but also excels in producing high-quality, coherent interleaved outputs. Its emergent capabilities in zero-shot image editing and compositional generation highlight Mogao as a practical omni-modal foundation model."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The key technical improvements (deep-fusion design, dual vision encoders, interleaved rotary position embeddings, and multi-modal classifier-free guidance) successfully combine the strengths of autoregressive text models and diffusion image models for arbitrary interleaved sequences."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Mogao presents a causal unified model with deep fusion, dual encoders, and interleaved position embeddings that achieves strong performance on multi-modal understanding, text-to-image generation, and coherent interleaved outputs including zero-shot editing."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Mogao is a single model that generates arbitrary sequences mixing text and images by fusing autoregressive and diffusion components."}],"snapshot_sha256":"bf1f45e253a85302aa1d5040e96a7df07fd3449de3d7b21a854d1d16a302836a"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"b7903a9725fa117ff04df9c21506486f30477dfc034a2c0d69cced73bf9617af"},"paper":{"abstract_excerpt":"Recent progress in unified models for image understanding and generation has been impressive, yet most approaches remain limited to single-modal generation conditioned on multiple modalities. In this paper, we present Mogao, a unified framework that advances this paradigm by enabling interleaved multi-modal generation through a causal approach. Mogao integrates a set of key technical improvements in architecture design, including a deep-fusion design, dual vision encoders, interleaved rotary position embeddings, and multi-modal classifier-free guidance, which allow it to harness the strengths ","authors_text":"Chao Liao, Jie Wu, Liang Li, Liyang Liu, Weilin Huang, Wenliang Zhao, Xinyu Zhang, Xun Wang, Zhengxiong Luo, Zhi Tian","cross_cats":[],"headline":"Mogao is a single model that generates arbitrary sequences mixing text and images by fusing autoregressive and diffusion components.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-08T17:58:57Z","title":"Mogao: An Omni Foundation Model for Interleaved Multi-Modal Generation"},"references":{"count":99,"internal_anchors":37,"resolved_work":99,"sample":[{"cited_arxiv_id":"2410.07073","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Pixtral 12b.arXiv preprint arXiv:2410.07073, 2024","work_id":"9ad2b071-82d8-4cfa-b994-b9975094b575","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Flamingo: a visual language model for few-shot learning","work_id":"b61d581d-a5f8-4799-a638-d91ddfc06da4","year":2022},{"cited_arxiv_id":"2308.12966","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","year":2023},{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Improving image generation with better captions.Computer Science","work_id":"fb7509a6-ece6-4ea3-b583-1ec884016dc8","year":2023}],"snapshot_sha256":"08c6054537d19ec7e98dafbb8ea80acc44e066f2b817c034c113ca16062ba8d0"},"source":{"id":"2505.05472","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T07:18:40.194337Z","id":"e4dc66b9-5e09-4357-a230-27a0a06abfe0","model_set":{"reader":"grok-4.3"},"one_line_summary":"Mogao presents a causal unified model with deep fusion, dual encoders, and interleaved position embeddings that achieves strong performance on multi-modal understanding, text-to-image generation, and coherent interleaved outputs including zero-shot editing.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Mogao is a single model that generates arbitrary sequences mixing text and images by fusing autoregressive and diffusion components.","strongest_claim":"Mogao not only achieves state-of-the-art performance in multi-modal understanding and text-to-image generation, but also excels in producing high-quality, coherent interleaved outputs. Its emergent capabilities in zero-shot image editing and compositional generation highlight Mogao as a practical omni-modal foundation model.","weakest_assumption":"The key technical improvements (deep-fusion design, dual vision encoders, interleaved rotary position embeddings, and multi-modal classifier-free guidance) successfully combine the strengths of autoregressive text models and diffusion image models for arbitrary interleaved sequences."}},"verdict_id":"e4dc66b9-5e09-4357-a230-27a0a06abfe0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:7f0daefd77a1eba2bd30accb45448d21592fbeaaf3c5b7185d0f42e13956e6ba","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"37ebe6436300552155727322e1e80dae910b976364d4b35c782a49b466a7c6a3","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-08T17:58:57Z","title_canon_sha256":"2901d2f3db0f233e2de4c877ae6dbb6888068a827bb4c6b701deff4db654f564"},"schema_version":"1.0","source":{"id":"2505.05472","kind":"arxiv","version":2}},"canonical_sha256":"6b89d9b43d66d24bd747d48af984580d2c2450ff9b113ef15376397d8b9489be","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"6b89d9b43d66d24bd747d48af984580d2c2450ff9b113ef15376397d8b9489be","first_computed_at":"2026-05-17T23:38:14.723810Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.723810Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wYht+DcF7TL3muhtTq8LiZwhCt3OG0ublijDJ8XtZYV8b8/ZU6bAc8qAD4UA/F0PRojosopJp9+UAVgsebOfAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.724379Z","signed_message":"canonical_sha256_bytes"},"source_id":"2505.05472","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:7f0daefd77a1eba2bd30accb45448d21592fbeaaf3c5b7185d0f42e13956e6ba","sha256:01f379633abed6b146a9b6553dd584569a4f21b8f7ccff58619833c472893885"],"state_sha256":"144f2c4d68e25a87f29a71a02e00c8d34fbda80322b5f3d8b387a434a4604927"}