{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:ID7PWW53VUCANRI32L4VIQM256","short_pith_number":"pith:ID7PWW53","canonical_record":{"source":{"id":"2307.05222","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-11T12:45:39Z","cross_cats_sorted":[],"title_canon_sha256":"0bc9d5de0efbc6d14eda4e338377e6441a0a00c832fa1e567e5af7edca242882","abstract_canon_sha256":"46600ae81117b6ccc061a48ecab7a9f32c222fa5a904f09d6e372d0982f5c569"},"schema_version":"1.0"},"canonical_sha256":"40fefb5bbbad0406c51bd2f954419aefbb8537aa8f92f967a5d8353a016028c6","source":{"kind":"arxiv","id":"2307.05222","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2307.05222","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2307.05222v2","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2307.05222","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"ID7PWW53VUCA","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"ID7PWW53VUCANRI3","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"ID7PWW53","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:ID7PWW53VUCANRI32L4VIQM256","target":"record","payload":{"canonical_record":{"source":{"id":"2307.05222","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-11T12:45:39Z","cross_cats_sorted":[],"title_canon_sha256":"0bc9d5de0efbc6d14eda4e338377e6441a0a00c832fa1e567e5af7edca242882","abstract_canon_sha256":"46600ae81117b6ccc061a48ecab7a9f32c222fa5a904f09d6e372d0982f5c569"},"schema_version":"1.0"},"canonical_sha256":"40fefb5bbbad0406c51bd2f954419aefbb8537aa8f92f967a5d8353a016028c6","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:46.701161Z","signature_b64":"vIeUDVmv2CUaRTXZbmXI07Y9a3Q5tJvEi8+NCQVVWx3PHznBslRdGFpgXXE6thD1/hqzgAG2w6stsJwtu34RBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"40fefb5bbbad0406c51bd2f954419aefbb8537aa8f92f967a5d8353a016028c6","last_reissued_at":"2026-05-17T23:38:46.700545Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:46.700545Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2307.05222","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"+aVwUTx27slzQBPIffCNrQzpcW2X7grGzZeHc04L7LvbJ+GgwtSPsh8N6wZ6VBSIulGkT3QpDEzoT7ABFj/KCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-21T11:17:08.435760Z"},"content_sha256":"f5d4451a7f5af0420e5b014c4c0999273ce9bfaf562f064e17e02996bb11342d","schema_version":"1.0","event_id":"sha256:f5d4451a7f5af0420e5b014c4c0999273ce9bfaf562f064e17e02996bb11342d"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:ID7PWW53VUCANRI32L4VIQM256","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Emu: Generative Pretraining in Multimodality","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Fan Zhang, Hongcheng Gao, Jingjing Liu, Qiying Yu, Quan Sun, Tiejun Huang, Xiaosong Zhang, Xinlong Wang, Yueze Wang, Yufeng Cui","submitted_at":"2023-07-11T12:45:39Z","abstract_excerpt":"We present Emu, a Transformer-based multimodal foundation model, which can seamlessly generate images and texts in multimodal context. This omnivore model can take in any single-modality or multimodal data input indiscriminately (e.g., interleaved image, text and video) through a one-model-for-all autoregressive training process. First, visual signals are encoded into embeddings, and together with text tokens form an interleaved input sequence. Emu is then end-to-end trained with a unified objective of classifying the next text token or regressing the next visual embedding in the multimodal se"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Across a broad range of zero-shot/few-shot tasks including image captioning, visual question answering, video question answering and text-to-image generation, Emu demonstrates superb performance compared to state-of-the-art large multimodal models.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That encoding visual signals into embeddings and training with a unified next-token or next-embedding objective will produce coherent multimodal generation without modality-specific losses or architectures.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Emu is a multimodal foundation model that unifies image and text generation via autoregressive pretraining on interleaved multimodal data, showing strong zero-shot performance on captioning, VQA, and text-to-image tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"5072c2a8cf7419115bf0b0457f33763a63dff986b79e3cb73c852a5e32e703b3"},"source":{"id":"2307.05222","kind":"arxiv","version":2},"verdict":{"id":"eb77e0f9-4408-4b15-900f-eb5ebba5a48b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T20:18:23.073660Z","strongest_claim":"Across a broad range of zero-shot/few-shot tasks including image captioning, visual question answering, video question answering and text-to-image generation, Emu demonstrates superb performance compared to state-of-the-art large multimodal models.","one_line_summary":"Emu is a multimodal foundation model that unifies image and text generation via autoregressive pretraining on interleaved multimodal data, showing strong zero-shot performance on captioning, VQA, and text-to-image tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That encoding visual signals into embeddings and training with a unified next-token or next-embedding objective will produce coherent multimodal generation without modality-specific losses or architectures.","pith_extraction_headline":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs."},"references":{"count":22,"sample":[{"doi":"","year":2022,"title":"and contains large-scale image-text pairs data. LAION-COCO (lai, b) is captioned 600M images from LAION-2B with an ensemble of BLIP (Li et al., 2022) and CLIP (Radford et al., 2021) models. Whereas th","work_id":"5714861e-873c-4f7f-8805-0a782e92c494","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Make sure to check the weather forecast before your visit and pack appropriate clothing and gear","work_id":"41abc708-2bee-41a2-87bb-3f33e004ff03","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Make sure to stay on designated trails and keep your distance from any wildlife you encounter","work_id":"2c556f4a-378a-4f7c-9de4-e1b4e017403e","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Make sure to check with local authorities before swimming or boating in the lake to ensure it is safe to do so","work_id":"22cae4bf-9c5d-4c68-b95d-0795eb7d7dae","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Make sure to familiarize yourself with the lake's layout and any potential hazards before venturing out on the water","work_id":"eeed9ff6-fa73-4fcf-9e71-e074f699acd3","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":22,"snapshot_sha256":"ffebb0f1446b0e62db4715f8ca1ae85467d04d8d19f93362818d69444e5803d5","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"455f3d72a31ca83ed0371ac902779939b0be8d90400205b164a5df3a1801ccea"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"eb77e0f9-4408-4b15-900f-eb5ebba5a48b"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:46Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"k3CEz8FWnbYi3bfyULOFzNt9fQ/D0sTr4fVqKFEJoabFdsFcBMfrQHqZ02DuYts80np1k8kCqnEWhECYh2qhBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-21T11:17:08.436298Z"},"content_sha256":"9b066a2d3582e522c203bb402db6068975a698da16a7a3a99b94315d6fa06f29","schema_version":"1.0","event_id":"sha256:9b066a2d3582e522c203bb402db6068975a698da16a7a3a99b94315d6fa06f29"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/ID7PWW53VUCANRI32L4VIQM256/bundle.json","state_url":"https://pith.science/pith/ID7PWW53VUCANRI32L4VIQM256/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/ID7PWW53VUCANRI32L4VIQM256/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-21T11:17:08Z","links":{"resolver":"https://pith.science/pith/ID7PWW53VUCANRI32L4VIQM256","bundle":"https://pith.science/pith/ID7PWW53VUCANRI32L4VIQM256/bundle.json","state":"https://pith.science/pith/ID7PWW53VUCANRI32L4VIQM256/state.json","well_known_bundle":"https://pith.science/.well-known/pith/ID7PWW53VUCANRI32L4VIQM256/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:ID7PWW53VUCANRI32L4VIQM256","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"46600ae81117b6ccc061a48ecab7a9f32c222fa5a904f09d6e372d0982f5c569","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-11T12:45:39Z","title_canon_sha256":"0bc9d5de0efbc6d14eda4e338377e6441a0a00c832fa1e567e5af7edca242882"},"schema_version":"1.0","source":{"id":"2307.05222","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2307.05222","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"arxiv_version","alias_value":"2307.05222v2","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2307.05222","created_at":"2026-05-17T23:38:46Z"},{"alias_kind":"pith_short_12","alias_value":"ID7PWW53VUCA","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"ID7PWW53VUCANRI3","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"ID7PWW53","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:9b066a2d3582e522c203bb402db6068975a698da16a7a3a99b94315d6fa06f29","target":"graph","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across a broad range of zero-shot/few-shot tasks including image captioning, visual question answering, video question answering and text-to-image generation, Emu demonstrates superb performance compared to state-of-the-art large multimodal models."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That encoding visual signals into embeddings and training with a unified next-token or next-embedding objective will produce coherent multimodal generation without modality-specific losses or architectures."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Emu is a multimodal foundation model that unifies image and text generation via autoregressive pretraining on interleaved multimodal data, showing strong zero-shot performance on captioning, VQA, and text-to-image tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs."}],"snapshot_sha256":"5072c2a8cf7419115bf0b0457f33763a63dff986b79e3cb73c852a5e32e703b3"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"455f3d72a31ca83ed0371ac902779939b0be8d90400205b164a5df3a1801ccea"},"paper":{"abstract_excerpt":"We present Emu, a Transformer-based multimodal foundation model, which can seamlessly generate images and texts in multimodal context. This omnivore model can take in any single-modality or multimodal data input indiscriminately (e.g., interleaved image, text and video) through a one-model-for-all autoregressive training process. First, visual signals are encoded into embeddings, and together with text tokens form an interleaved input sequence. Emu is then end-to-end trained with a unified objective of classifying the next text token or regressing the next visual embedding in the multimodal se","authors_text":"Fan Zhang, Hongcheng Gao, Jingjing Liu, Qiying Yu, Quan Sun, Tiejun Huang, Xiaosong Zhang, Xinlong Wang, Yueze Wang, Yufeng Cui","cross_cats":[],"headline":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-11T12:45:39Z","title":"Emu: Generative Pretraining in Multimodality"},"references":{"count":22,"internal_anchors":0,"resolved_work":22,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"and contains large-scale image-text pairs data. LAION-COCO (lai, b) is captioned 600M images from LAION-2B with an ensemble of BLIP (Li et al., 2022) and CLIP (Radford et al., 2021) models. Whereas th","work_id":"5714861e-873c-4f7f-8805-0a782e92c494","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Make sure to check the weather forecast before your visit and pack appropriate clothing and gear","work_id":"41abc708-2bee-41a2-87bb-3f33e004ff03","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Make sure to stay on designated trails and keep your distance from any wildlife you encounter","work_id":"2c556f4a-378a-4f7c-9de4-e1b4e017403e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Make sure to check with local authorities before swimming or boating in the lake to ensure it is safe to do so","work_id":"22cae4bf-9c5d-4c68-b95d-0795eb7d7dae","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Make sure to familiarize yourself with the lake's layout and any potential hazards before venturing out on the water","work_id":"eeed9ff6-fa73-4fcf-9e71-e074f699acd3","year":null}],"snapshot_sha256":"ffebb0f1446b0e62db4715f8ca1ae85467d04d8d19f93362818d69444e5803d5"},"source":{"id":"2307.05222","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T20:18:23.073660Z","id":"eb77e0f9-4408-4b15-900f-eb5ebba5a48b","model_set":{"reader":"grok-4.3"},"one_line_summary":"Emu is a multimodal foundation model that unifies image and text generation via autoregressive pretraining on interleaved multimodal data, showing strong zero-shot performance on captioning, VQA, and text-to-image tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A single Transformer model generates images and text by autoregressively predicting the next token or visual embedding from interleaved inputs.","strongest_claim":"Across a broad range of zero-shot/few-shot tasks including image captioning, visual question answering, video question answering and text-to-image generation, Emu demonstrates superb performance compared to state-of-the-art large multimodal models.","weakest_assumption":"That encoding visual signals into embeddings and training with a unified next-token or next-embedding objective will produce coherent multimodal generation without modality-specific losses or architectures."}},"verdict_id":"eb77e0f9-4408-4b15-900f-eb5ebba5a48b"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f5d4451a7f5af0420e5b014c4c0999273ce9bfaf562f064e17e02996bb11342d","target":"record","created_at":"2026-05-17T23:38:46Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"46600ae81117b6ccc061a48ecab7a9f32c222fa5a904f09d6e372d0982f5c569","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-11T12:45:39Z","title_canon_sha256":"0bc9d5de0efbc6d14eda4e338377e6441a0a00c832fa1e567e5af7edca242882"},"schema_version":"1.0","source":{"id":"2307.05222","kind":"arxiv","version":2}},"canonical_sha256":"40fefb5bbbad0406c51bd2f954419aefbb8537aa8f92f967a5d8353a016028c6","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"40fefb5bbbad0406c51bd2f954419aefbb8537aa8f92f967a5d8353a016028c6","first_computed_at":"2026-05-17T23:38:46.700545Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:46.700545Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"vIeUDVmv2CUaRTXZbmXI07Y9a3Q5tJvEi8+NCQVVWx3PHznBslRdGFpgXXE6thD1/hqzgAG2w6stsJwtu34RBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:46.701161Z","signed_message":"canonical_sha256_bytes"},"source_id":"2307.05222","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f5d4451a7f5af0420e5b014c4c0999273ce9bfaf562f064e17e02996bb11342d","sha256:9b066a2d3582e522c203bb402db6068975a698da16a7a3a99b94315d6fa06f29"],"state_sha256":"81e3fd73d73609ebbc77011559a292fad775024c952c3e06b97c5f4d8743a89e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"skVPjJ7vvVAhpOKQF4UhrrP+b7qgnFmwEP0vl9h9dpO4QO7P0Uvq7UsafAbnQ0bjD6dP1eFRKQQv/IaWhpudDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-21T11:17:08.438874Z","bundle_sha256":"6add9915a87849188861d9502626b92c91aa99b6fcd87affa5302261db7db27e"}}