{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:YAOHT3YAN4BJN2T6TGUDENXI6V","short_pith_number":"pith:YAOHT3YA","canonical_record":{"source":{"id":"2401.16420","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T18:59:02Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"3b2deff91597c496b7dbbec7f1d2f0eaef3a13ef574dfa65915194c7ee757aa0","abstract_canon_sha256":"8429ed639989a4121da3104fc4fd2393bc12545d4777baa9397279c0f2651057"},"schema_version":"1.0"},"canonical_sha256":"c01c79ef006f0296ea7e99a83236e8f5749d05e79699e4e0c8abe238914c4934","source":{"kind":"arxiv","id":"2401.16420","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.16420","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2401.16420v1","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.16420","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"YAOHT3YAN4BJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"YAOHT3YAN4BJN2T6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"YAOHT3YA","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:YAOHT3YAN4BJN2T6TGUDENXI6V","target":"record","payload":{"canonical_record":{"source":{"id":"2401.16420","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T18:59:02Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"3b2deff91597c496b7dbbec7f1d2f0eaef3a13ef574dfa65915194c7ee757aa0","abstract_canon_sha256":"8429ed639989a4121da3104fc4fd2393bc12545d4777baa9397279c0f2651057"},"schema_version":"1.0"},"canonical_sha256":"c01c79ef006f0296ea7e99a83236e8f5749d05e79699e4e0c8abe238914c4934","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.981923Z","signature_b64":"eB8QVqcGx0AnSAhToN6lvwiDRLV3MpPn7NMCWq1sBxB1bLntVUtqnbaRekO2IyLThmYEvPOeVme9iEnFx6kzBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c01c79ef006f0296ea7e99a83236e8f5749d05e79699e4e0c8abe238914c4934","last_reissued_at":"2026-05-17T23:38:14.981310Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.981310Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2401.16420","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"LAhJOSvh4uaaYhWFcz8l7qe4JLrkJ2Th0n8EL6HyyX5C+z/NrSIfkPNYeYFjT6cCfBYTGz4VmJJbQHfxEtw0CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-19T07:49:52.194864Z"},"content_sha256":"40e79aab0d016a66ad08485015851a55b8c001de7f1a037bcf1cebd66eec04ed","schema_version":"1.0","event_id":"sha256:40e79aab0d016a66ad08485015851a55b8c001de7f1a037bcf1cebd66eec04ed"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:YAOHT3YAN4BJN2T6TGUDENXI6V","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Bin Wang, Conghui He, Dahua Lin, Hang Yan, Haodong Duan, Jiaqi Wang, Jingwen Li, Kai Chen, Linke Ouyang, Maosong Cao, Pan Zhang, Songyang Zhang, Wei Li, Wenwei Zhang, Xiaoyi Dong, Xilin Wei, Xingcheng Zhang, Xinyue Zhang, Yang Gao, Yining Li, Yuhang Cao, Yuhang Zang, Yu Qiao","submitted_at":"2024-01-29T18:59:02Z","abstract_excerpt":"We introduce InternLM-XComposer2, a cutting-edge vision-language model excelling in free-form text-image composition and comprehension. This model goes beyond conventional vision-language understanding, adeptly crafting interleaved text-image content from diverse inputs like outlines, detailed textual specifications, and reference images, enabling highly customizable content creation. InternLM-XComposer2 proposes a Partial LoRA (PLoRA) approach that applies additional LoRA parameters exclusively to image tokens to preserve the integrity of pre-trained language knowledge, striking a balance bet"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"InternLM-XComposer2 ... not only significantly outperforms existing multimodal models but also matches or even surpasses GPT-4V and Gemini Pro in certain assessments.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That applying additional LoRA parameters exclusively to image tokens preserves the integrity of pre-trained language knowledge while enabling precise vision understanding and high-quality text composition.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"InternLM-XComposer2 introduces Partial LoRA on InternLM2-7B to enable high-quality free-form text-image composition while matching or exceeding GPT-4V on select vision-language benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"20db5fdaa366d3ea375d5eaf133b4d1d5c8a8ec235fee8cb1c3e40b55fbc1945"},"source":{"id":"2401.16420","kind":"arxiv","version":1},"verdict":{"id":"c9f99aae-cc4e-4226-924d-718eb050c6ca","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T05:24:56.204955Z","strongest_claim":"InternLM-XComposer2 ... not only significantly outperforms existing multimodal models but also matches or even surpasses GPT-4V and Gemini Pro in certain assessments.","one_line_summary":"InternLM-XComposer2 introduces Partial LoRA on InternLM2-7B to enable high-quality free-form text-image composition while matching or exceeding GPT-4V on select vision-language benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That applying additional LoRA parameters exclusively to image tokens preserves the integrity of pre-trained language knowledge while enabling precise vision understanding and high-quality text composition.","pith_extraction_headline":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens."},"references":{"count":105,"sample":[{"doi":"","year":null,"title":"Nocaps: Novel object captioning at scale","work_id":"d3565525-ac21-4347-be08-bd76632f4d65","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Flamingo: a visual language model for few-shot learning,","work_id":"15887c25-c51f-4381-9fe0-7afe4a3002b7","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1905,"title":"arXiv preprint arXiv:1905.13319 , year=","work_id":"4539c966-2fd4-4238-88a9-60be171a99da","ref_index":3,"cited_arxiv_id":"1905.13319","is_internal_anchor":true},{"doi":"","year":2015,"title":"Lawrence Zitnick, and Devi Parikh","work_id":"1b370a2a-fb0c-43c7-87c5-2ee5a7b8d1f5","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Openflamingo: An open- source framework for training large autoregressive vision- language models","work_id":"9e776b2e-2e57-4719-a352-b4c2ded802d2","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":105,"snapshot_sha256":"23ba002c02aba91f1658a8d8f13389cd310e0b6ddb6d7ed620ce9e4ef2eb49cb","internal_anchors":16},"formal_canon":{"evidence_count":2,"snapshot_sha256":"6aa19b876cef739591a07a1ed75005208dd0001ce3cc60be404d95ae2df23f47"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"c9f99aae-cc4e-4226-924d-718eb050c6ca"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"alP8HO5xAAh8EiIqDNxsGUluYrUgKobkkja0wMVsHISWi7wfGl15Qf3TzgVgUlswfcDsi9yweUCQD84dlAXnDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-19T07:49:52.195921Z"},"content_sha256":"aeb4b9b933eb79448ec7650b4d463d345ef887387d50eff9a6cab62d0679da48","schema_version":"1.0","event_id":"sha256:aeb4b9b933eb79448ec7650b4d463d345ef887387d50eff9a6cab62d0679da48"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/bundle.json","state_url":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-19T07:49:52Z","links":{"resolver":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V","bundle":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/bundle.json","state":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/state.json","well_known_bundle":"https://pith.science/.well-known/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:YAOHT3YAN4BJN2T6TGUDENXI6V","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"8429ed639989a4121da3104fc4fd2393bc12545d4777baa9397279c0f2651057","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T18:59:02Z","title_canon_sha256":"3b2deff91597c496b7dbbec7f1d2f0eaef3a13ef574dfa65915194c7ee757aa0"},"schema_version":"1.0","source":{"id":"2401.16420","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2401.16420","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2401.16420v1","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.16420","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"YAOHT3YAN4BJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"YAOHT3YAN4BJN2T6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"YAOHT3YA","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:aeb4b9b933eb79448ec7650b4d463d345ef887387d50eff9a6cab62d0679da48","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"InternLM-XComposer2 ... not only significantly outperforms existing multimodal models but also matches or even surpasses GPT-4V and Gemini Pro in certain assessments."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That applying additional LoRA parameters exclusively to image tokens preserves the integrity of pre-trained language knowledge while enabling precise vision understanding and high-quality text composition."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"InternLM-XComposer2 introduces Partial LoRA on InternLM2-7B to enable high-quality free-form text-image composition while matching or exceeding GPT-4V on select vision-language benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens."}],"snapshot_sha256":"20db5fdaa366d3ea375d5eaf133b4d1d5c8a8ec235fee8cb1c3e40b55fbc1945"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"6aa19b876cef739591a07a1ed75005208dd0001ce3cc60be404d95ae2df23f47"},"paper":{"abstract_excerpt":"We introduce InternLM-XComposer2, a cutting-edge vision-language model excelling in free-form text-image composition and comprehension. This model goes beyond conventional vision-language understanding, adeptly crafting interleaved text-image content from diverse inputs like outlines, detailed textual specifications, and reference images, enabling highly customizable content creation. InternLM-XComposer2 proposes a Partial LoRA (PLoRA) approach that applies additional LoRA parameters exclusively to image tokens to preserve the integrity of pre-trained language knowledge, striking a balance bet","authors_text":"Bin Wang, Conghui He, Dahua Lin, Hang Yan, Haodong Duan, Jiaqi Wang, Jingwen Li, Kai Chen, Linke Ouyang, Maosong Cao, Pan Zhang, Songyang Zhang, Wei Li, Wenwei Zhang, Xiaoyi Dong, Xilin Wei, Xingcheng Zhang, Xinyue Zhang, Yang Gao, Yining Li, Yuhang Cao, Yuhang Zang, Yu Qiao","cross_cats":["cs.CL"],"headline":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T18:59:02Z","title":"InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model"},"references":{"count":105,"internal_anchors":16,"resolved_work":105,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Nocaps: Novel object captioning at scale","work_id":"d3565525-ac21-4347-be08-bd76632f4d65","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Flamingo: a visual language model for few-shot learning,","work_id":"15887c25-c51f-4381-9fe0-7afe4a3002b7","year":null},{"cited_arxiv_id":"1905.13319","doi":"","is_internal_anchor":true,"ref_index":3,"title":"arXiv preprint arXiv:1905.13319 , year=","work_id":"4539c966-2fd4-4238-88a9-60be171a99da","year":1905},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Lawrence Zitnick, and Devi Parikh","work_id":"1b370a2a-fb0c-43c7-87c5-2ee5a7b8d1f5","year":2015},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Openflamingo: An open- source framework for training large autoregressive vision- language models","work_id":"9e776b2e-2e57-4719-a352-b4c2ded802d2","year":2023}],"snapshot_sha256":"23ba002c02aba91f1658a8d8f13389cd310e0b6ddb6d7ed620ce9e4ef2eb49cb"},"source":{"id":"2401.16420","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T05:24:56.204955Z","id":"c9f99aae-cc4e-4226-924d-718eb050c6ca","model_set":{"reader":"grok-4.3"},"one_line_summary":"InternLM-XComposer2 introduces Partial LoRA on InternLM2-7B to enable high-quality free-form text-image composition while matching or exceeding GPT-4V on select vision-language benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens.","strongest_claim":"InternLM-XComposer2 ... not only significantly outperforms existing multimodal models but also matches or even surpasses GPT-4V and Gemini Pro in certain assessments.","weakest_assumption":"That applying additional LoRA parameters exclusively to image tokens preserves the integrity of pre-trained language knowledge while enabling precise vision understanding and high-quality text composition."}},"verdict_id":"c9f99aae-cc4e-4226-924d-718eb050c6ca"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:40e79aab0d016a66ad08485015851a55b8c001de7f1a037bcf1cebd66eec04ed","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"8429ed639989a4121da3104fc4fd2393bc12545d4777baa9397279c0f2651057","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T18:59:02Z","title_canon_sha256":"3b2deff91597c496b7dbbec7f1d2f0eaef3a13ef574dfa65915194c7ee757aa0"},"schema_version":"1.0","source":{"id":"2401.16420","kind":"arxiv","version":1}},"canonical_sha256":"c01c79ef006f0296ea7e99a83236e8f5749d05e79699e4e0c8abe238914c4934","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c01c79ef006f0296ea7e99a83236e8f5749d05e79699e4e0c8abe238914c4934","first_computed_at":"2026-05-17T23:38:14.981310Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.981310Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"eB8QVqcGx0AnSAhToN6lvwiDRLV3MpPn7NMCWq1sBxB1bLntVUtqnbaRekO2IyLThmYEvPOeVme9iEnFx6kzBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.981923Z","signed_message":"canonical_sha256_bytes"},"source_id":"2401.16420","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:40e79aab0d016a66ad08485015851a55b8c001de7f1a037bcf1cebd66eec04ed","sha256:aeb4b9b933eb79448ec7650b4d463d345ef887387d50eff9a6cab62d0679da48"],"state_sha256":"4669f59d638e895a218a9a9a513b93b9882ebc0c69f05e5815b42f3b975c17d5"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xRC+evi7nh8CVzFY3dVe0pjo+KOdAuhZPiGQpZOOdl7l4V7RPwDUXw1/ktnLrbBTc0CqrsfHsMwB48C8qtvUCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-19T07:49:52.199815Z","bundle_sha256":"bc8067756ff4ed4933294338e55427194cd5657f62cf4d2e3b739e4313e9cb3a"}}