{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:ID7PWW53VUCANRI32L4VIQM256","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"46600ae81117b6ccc061a48ecab7a9f32c222fa5a904f09d6e372d0982f5c569","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-11T12:45:39Z","title_canon_sha256":"0bc9d5de0efbc6d14eda4e338377e6441a0a00c832fa1e567e5af7edca242882"},"schema_version":"1.0","source":{"id":"2307.05222","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2307.05222","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2307.05222v2","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2307.05222","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"ID7PWW53VUCA","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"ID7PWW53VUCANRI3","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"ID7PWW53","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:9b066a2d3582e522c203bb402db6068975a698da16a7a3a99b94315d6fa06f29","target":"graph","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across a broad range of zero-shot/few-shot tasks including image captioning, visual question answering, video question answering and text-to-image generation, Emu demonstrates superb performance compared to state-of-the-art large multimodal models."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That encoding visual signals into embeddings and training with a unified next-token or next-embedding objective will produce coherent multimodal generation without modality-specific losses or architectures."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Emu is a multimodal foundation model that unifies image and text generation via autoregressive pretraining on interleaved multimodal data, showing strong zero-shot performance on captioning, VQA, and text-to-image tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs."}],"snapshot_sha256":"5072c2a8cf7419115bf0b0457f33763a63dff986b79e3cb73c852a5e32e703b3"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"455f3d72a31ca83ed0371ac902779939b0be8d90400205b164a5df3a1801ccea"},"paper":{"abstract_excerpt":"We present Emu, a Transformer-based multimodal foundation model, which can seamlessly generate images and texts in multimodal context. This omnivore model can take in any single-modality or multimodal data input indiscriminately (e.g., interleaved image, text and video) through a one-model-for-all autoregressive training process. First, visual signals are encoded into embeddings, and together with text tokens form an interleaved input sequence. Emu is then end-to-end trained with a unified objective of classifying the next text token or regressing the next visual embedding in the multimodal se","authors_text":"Fan Zhang, Hongcheng Gao, Jingjing Liu, Qiying Yu, Quan Sun, Tiejun Huang, Xiaosong Zhang, Xinlong Wang, Yueze Wang, Yufeng Cui","cross_cats":[],"headline":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-11T12:45:39Z","title":"Emu: Generative Pretraining in Multimodality"},"references":{"count":22,"internal_anchors":0,"resolved_work":22,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"and contains large-scale image-text pairs data. LAION-COCO (lai, b) is captioned 600M images from LAION-2B with an ensemble of BLIP (Li et al., 2022) and CLIP (Radford et al., 2021) models. Whereas th","work_id":"5714861e-873c-4f7f-8805-0a782e92c494","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Make sure to check the weather forecast before your visit and pack appropriate clothing and gear","work_id":"41abc708-2bee-41a2-87bb-3f33e004ff03","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Make sure to stay on designated trails and keep your distance from any wildlife you encounter","work_id":"2c556f4a-378a-4f7c-9de4-e1b4e017403e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Make sure to check with local authorities before swimming or boating in the lake to ensure it is safe to do so","work_id":"22cae4bf-9c5d-4c68-b95d-0795eb7d7dae","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Make sure to familiarize yourself with the lake's layout and any potential hazards before venturing out on the water","work_id":"eeed9ff6-fa73-4fcf-9e71-e074f699acd3","year":null}],"snapshot_sha256":"ffebb0f1446b0e62db4715f8ca1ae85467d04d8d19f93362818d69444e5803d5"},"source":{"id":"2307.05222","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T20:18:23.073660Z","id":"eb77e0f9-4408-4b15-900f-eb5ebba5a48b","model_set":{"reader":"grok-4.3"},"one_line_summary":"Emu is a multimodal foundation model that unifies image and text generation via autoregressive pretraining on interleaved multimodal data, showing strong zero-shot performance on captioning, VQA, and text-to-image tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs.","strongest_claim":"Across a broad range of zero-shot/few-shot tasks including image captioning, visual question answering, video question answering and text-to-image generation, Emu demonstrates superb performance compared to state-of-the-art large multimodal models.","weakest_assumption":"That encoding visual signals into embeddings and training with a unified next-token or next-embedding objective will produce coherent multimodal generation without modality-specific losses or architectures."}},"verdict_id":"eb77e0f9-4408-4b15-900f-eb5ebba5a48b"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f5d4451a7f5af0420e5b014c4c0999273ce9bfaf562f064e17e02996bb11342d","target":"record","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"46600ae81117b6ccc061a48ecab7a9f32c222fa5a904f09d6e372d0982f5c569","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-11T12:45:39Z","title_canon_sha256":"0bc9d5de0efbc6d14eda4e338377e6441a0a00c832fa1e567e5af7edca242882"},"schema_version":"1.0","source":{"id":"2307.05222","kind":"arxiv","version":2}},"canonical_sha256":"40fefb5bbbad0406c51bd2f954419aefbb8537aa8f92f967a5d8353a016028c6","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"40fefb5bbbad0406c51bd2f954419aefbb8537aa8f92f967a5d8353a016028c6","first_computed_at":"2026-05-17T23:38:46.700545Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:46.700545Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"vIeUDVmv2CUaRTXZbmXI07Y9a3Q5tJvEi8+NCQVVWx3PHznBslRdGFpgXXE6thD1/hqzgAG2w6stsJwtu34RBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:46.701161Z","signed_message":"canonical_sha256_bytes"},"source_id":"2307.05222","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f5d4451a7f5af0420e5b014c4c0999273ce9bfaf562f064e17e02996bb11342d","sha256:9b066a2d3582e522c203bb402db6068975a698da16a7a3a99b94315d6fa06f29"],"state_sha256":"81e3fd73d73609ebbc77011559a292fad775024c952c3e06b97c5f4d8743a89e"}