{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:4HJNTCXO2WO2MWAA5L4UQHONPO","short_pith_number":"pith:4HJNTCXO","canonical_record":{"source":{"id":"2404.14396","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-22T17:56:09Z","cross_cats_sorted":[],"title_canon_sha256":"aa793a029a6f4e2269cf1bf47ecffec1d1eae171983b0e7fb6412ef03ddb001a","abstract_canon_sha256":"9c00008e7c80a94f887a8c3de96e0de64c122983d342180947e64610740221f4"},"schema_version":"1.0"},"canonical_sha256":"e1d2d98aeed59da65800eaf9481dcd7bb07925b3977e9f4e3f155cb13bd9a37e","source":{"kind":"arxiv","id":"2404.14396","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.14396","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2404.14396v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.14396","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"4HJNTCXO2WO2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"4HJNTCXO2WO2MWAA","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"4HJNTCXO","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:4HJNTCXO2WO2MWAA5L4UQHONPO","target":"record","payload":{"canonical_record":{"source":{"id":"2404.14396","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-22T17:56:09Z","cross_cats_sorted":[],"title_canon_sha256":"aa793a029a6f4e2269cf1bf47ecffec1d1eae171983b0e7fb6412ef03ddb001a","abstract_canon_sha256":"9c00008e7c80a94f887a8c3de96e0de64c122983d342180947e64610740221f4"},"schema_version":"1.0"},"canonical_sha256":"e1d2d98aeed59da65800eaf9481dcd7bb07925b3977e9f4e3f155cb13bd9a37e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.884214Z","signature_b64":"MNXL2IM7lPNGU5CXDlo+IDfVr64qRxorVimP8l1i5MgC/+VYXzCma0LpIbCu7JTDi1Uapn/of8xsecbnAblUDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e1d2d98aeed59da65800eaf9481dcd7bb07925b3977e9f4e3f155cb13bd9a37e","last_reissued_at":"2026-05-17T23:38:49.883568Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.883568Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2404.14396","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"wS5Pu5RR+ka/bYJ3XcJNIZfKgZL2zrge7EFIvyayYVR8zSondGUAneBmk9DHclI9ztd1jRXqKz8DiwfuEb4MAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T14:30:19.939948Z"},"content_sha256":"2ed67411178f8f5b5d764848b7890839acf3cd8df955915bdb6cecc75c5327d7","schema_version":"1.0","event_id":"sha256:2ed67411178f8f5b5d764848b7890839acf3cd8df955915bdb6cecc75c5327d7"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:4HJNTCXO2WO2MWAA5L4UQHONPO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"SEED-X: Multimodal Models with Unified Multi-granularity Comprehension and Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"SEED-X is a single multimodal model that comprehends arbitrary-sized images and generates at multiple levels of detail.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chen Li, Jinguo Zhu, Kun Yi, Lin Song, Sijie Zhao, Xiaohan Ding, Ying Shan, Yixiao Ge, Yuying Ge","submitted_at":"2024-04-22T17:56:09Z","abstract_excerpt":"The rapid evolution of multimodal foundation model has demonstrated significant progresses in vision-language understanding and generation, e.g., our previous work SEED-LLaMA. However, there remains a gap between its capability and the real-world applicability, primarily due to the model's limited capacity to effectively respond to various user instructions and interact with diverse visual data. In this work, we focus on bridging this gap through integrating two enhanced features: (1) comprehending images of arbitrary sizes and ratios, and (2) enabling multi-granularity image generation. We pr"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We present a unified and versatile foundation model, namely, SEED-X, which is able to model multi-granularity visual semantics for comprehension and generation tasks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That integrating arbitrary-size image comprehension and multi-granularity generation will close the gap between current model capabilities and real-world applicability, assuming successful instruction tuning preserves performance without introducing new limitations.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SEED-X is a unified multimodal foundation model that handles multi-granularity visual semantics for both comprehension and generation across arbitrary image sizes and ratios.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"SEED-X is a single multimodal model that comprehends arbitrary-sized images and generates at multiple levels of detail.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"8700383e5079f95aef792fb755d3985aa75af1d085e83f9033d9375482034419"},"source":{"id":"2404.14396","kind":"arxiv","version":2},"verdict":{"id":"c2651623-20f4-422c-8e77-fd11ed33b692","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:45:38.031890Z","strongest_claim":"We present a unified and versatile foundation model, namely, SEED-X, which is able to model multi-granularity visual semantics for comprehension and generation tasks.","one_line_summary":"SEED-X is a unified multimodal foundation model that handles multi-granularity visual semantics for both comprehension and generation across arbitrary image sizes and ratios.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That integrating arbitrary-size image comprehension and multi-granularity generation will close the gap between current model capabilities and real-world applicability, assuming successful instruction tuning preserves performance without introducing new limitations.","pith_extraction_headline":"SEED-X is a single multimodal model that comprehends arbitrary-sized images and generates at multiple levels of detail."},"references":{"count":76,"sample":[{"doi":"","year":2023,"title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","work_id":"0717b0f5-1407-4005-9f21-4e2907f265d7","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models","work_id":"a7e3a737-e007-42bc-be89-c4d34c5ee071","ref_index":2,"cited_arxiv_id":"2304.10592","is_internal_anchor":true},{"doi":"","year":2023,"title":"Visual Instruction Tuning","work_id":"68be622d-a6dc-4a13-82de-e3054a3dc509","ref_index":3,"cited_arxiv_id":"2304.08485","is_internal_anchor":true},{"doi":"","year":2023,"title":"Kosmos-2: Grounding Multimodal Large Language Models to the World","work_id":"46e7f9e9-24c6-49af-b7d5-96159fa6f443","ref_index":4,"cited_arxiv_id":"2306.14824","is_internal_anchor":true},{"doi":"","year":2023,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","ref_index":5,"cited_arxiv_id":"2308.12966","is_internal_anchor":true}],"resolved_work":76,"snapshot_sha256":"ce315cfe1628555e5f7c5854a6e24f3e58657e368f64b7b625e7d03c77fd72a6","internal_anchors":29},"formal_canon":{"evidence_count":1,"snapshot_sha256":"a2e65a65d39cdd3268ebc2434f43f263d61c09f1db0638bc7c6f491b98c61c67"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"c2651623-20f4-422c-8e77-fd11ed33b692"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"J+B3sHgJ9hj8cLyzBjkFUYrWXSUpXZyYzTdiha2nVFGgnf74uB36dpGEXBeuYPxyFT62okY79yqZbmaug9AZBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T14:30:19.941031Z"},"content_sha256":"c0d388f431f5f8d3eaa3fa02669b51d4ad3d319ca440a5182cd87f563aec2a85","schema_version":"1.0","event_id":"sha256:c0d388f431f5f8d3eaa3fa02669b51d4ad3d319ca440a5182cd87f563aec2a85"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/4HJNTCXO2WO2MWAA5L4UQHONPO/bundle.json","state_url":"https://pith.science/pith/4HJNTCXO2WO2MWAA5L4UQHONPO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/4HJNTCXO2WO2MWAA5L4UQHONPO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T14:30:19Z","links":{"resolver":"https://pith.science/pith/4HJNTCXO2WO2MWAA5L4UQHONPO","bundle":"https://pith.science/pith/4HJNTCXO2WO2MWAA5L4UQHONPO/bundle.json","state":"https://pith.science/pith/4HJNTCXO2WO2MWAA5L4UQHONPO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/4HJNTCXO2WO2MWAA5L4UQHONPO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:4HJNTCXO2WO2MWAA5L4UQHONPO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"9c00008e7c80a94f887a8c3de96e0de64c122983d342180947e64610740221f4","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-22T17:56:09Z","title_canon_sha256":"aa793a029a6f4e2269cf1bf47ecffec1d1eae171983b0e7fb6412ef03ddb001a"},"schema_version":"1.0","source":{"id":"2404.14396","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.14396","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2404.14396v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.14396","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"4HJNTCXO2WO2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"4HJNTCXO2WO2MWAA","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"4HJNTCXO","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c0d388f431f5f8d3eaa3fa02669b51d4ad3d319ca440a5182cd87f563aec2a85","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We present a unified and versatile foundation model, namely, SEED-X, which is able to model multi-granularity visual semantics for comprehension and generation tasks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That integrating arbitrary-size image comprehension and multi-granularity generation will close the gap between current model capabilities and real-world applicability, assuming successful instruction tuning preserves performance without introducing new limitations."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SEED-X is a unified multimodal foundation model that handles multi-granularity visual semantics for both comprehension and generation across arbitrary image sizes and ratios."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"SEED-X is a single multimodal model that comprehends arbitrary-sized images and generates at multiple levels of detail."}],"snapshot_sha256":"8700383e5079f95aef792fb755d3985aa75af1d085e83f9033d9375482034419"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"a2e65a65d39cdd3268ebc2434f43f263d61c09f1db0638bc7c6f491b98c61c67"},"paper":{"abstract_excerpt":"The rapid evolution of multimodal foundation model has demonstrated significant progresses in vision-language understanding and generation, e.g., our previous work SEED-LLaMA. However, there remains a gap between its capability and the real-world applicability, primarily due to the model's limited capacity to effectively respond to various user instructions and interact with diverse visual data. In this work, we focus on bridging this gap through integrating two enhanced features: (1) comprehending images of arbitrary sizes and ratios, and (2) enabling multi-granularity image generation. We pr","authors_text":"Chen Li, Jinguo Zhu, Kun Yi, Lin Song, Sijie Zhao, Xiaohan Ding, Ying Shan, Yixiao Ge, Yuying Ge","cross_cats":[],"headline":"SEED-X is a single multimodal model that comprehends arbitrary-sized images and generates at multiple levels of detail.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-22T17:56:09Z","title":"SEED-X: Multimodal Models with Unified Multi-granularity Comprehension and Generation"},"references":{"count":76,"internal_anchors":29,"resolved_work":76,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","work_id":"0717b0f5-1407-4005-9f21-4e2907f265d7","year":2023},{"cited_arxiv_id":"2304.10592","doi":"","is_internal_anchor":true,"ref_index":2,"title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models","work_id":"a7e3a737-e007-42bc-be89-c4d34c5ee071","year":2023},{"cited_arxiv_id":"2304.08485","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Visual Instruction Tuning","work_id":"68be622d-a6dc-4a13-82de-e3054a3dc509","year":2023},{"cited_arxiv_id":"2306.14824","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Kosmos-2: Grounding Multimodal Large Language Models to the World","work_id":"46e7f9e9-24c6-49af-b7d5-96159fa6f443","year":2023},{"cited_arxiv_id":"2308.12966","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","year":2023}],"snapshot_sha256":"ce315cfe1628555e5f7c5854a6e24f3e58657e368f64b7b625e7d03c77fd72a6"},"source":{"id":"2404.14396","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T22:45:38.031890Z","id":"c2651623-20f4-422c-8e77-fd11ed33b692","model_set":{"reader":"grok-4.3"},"one_line_summary":"SEED-X is a unified multimodal foundation model that handles multi-granularity visual semantics for both comprehension and generation across arbitrary image sizes and ratios.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"SEED-X is a single multimodal model that comprehends arbitrary-sized images and generates at multiple levels of detail.","strongest_claim":"We present a unified and versatile foundation model, namely, SEED-X, which is able to model multi-granularity visual semantics for comprehension and generation tasks.","weakest_assumption":"That integrating arbitrary-size image comprehension and multi-granularity generation will close the gap between current model capabilities and real-world applicability, assuming successful instruction tuning preserves performance without introducing new limitations."}},"verdict_id":"c2651623-20f4-422c-8e77-fd11ed33b692"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2ed67411178f8f5b5d764848b7890839acf3cd8df955915bdb6cecc75c5327d7","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"9c00008e7c80a94f887a8c3de96e0de64c122983d342180947e64610740221f4","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-04-22T17:56:09Z","title_canon_sha256":"aa793a029a6f4e2269cf1bf47ecffec1d1eae171983b0e7fb6412ef03ddb001a"},"schema_version":"1.0","source":{"id":"2404.14396","kind":"arxiv","version":2}},"canonical_sha256":"e1d2d98aeed59da65800eaf9481dcd7bb07925b3977e9f4e3f155cb13bd9a37e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e1d2d98aeed59da65800eaf9481dcd7bb07925b3977e9f4e3f155cb13bd9a37e","first_computed_at":"2026-05-17T23:38:49.883568Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.883568Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"MNXL2IM7lPNGU5CXDlo+IDfVr64qRxorVimP8l1i5MgC/+VYXzCma0LpIbCu7JTDi1Uapn/of8xsecbnAblUDw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.884214Z","signed_message":"canonical_sha256_bytes"},"source_id":"2404.14396","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2ed67411178f8f5b5d764848b7890839acf3cd8df955915bdb6cecc75c5327d7","sha256:c0d388f431f5f8d3eaa3fa02669b51d4ad3d319ca440a5182cd87f563aec2a85"],"state_sha256":"b563398e056eb4c9ac046c934776d269698f8a2d92afe19fa53fb5348841ca86"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YcBkq8vEvwof7vduHDLfTSzqUZDEtmHH1AMeMHLxmNVHu82Q7NG/DxmRq/neYxVFzayRr07gsAZFzdLbvD+4Cw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T14:30:19.945590Z","bundle_sha256":"6f7364f4bb3db545b159320c28bd9467a69c84a9fca6c47b81e1f1098efe91d5"}}