{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:YAOHT3YAN4BJN2T6TGUDENXI6V","short_pith_number":"pith:YAOHT3YA","schema_version":"1.0","canonical_sha256":"c01c79ef006f0296ea7e99a83236e8f5749d05e79699e4e0c8abe238914c4934","source":{"kind":"arxiv","id":"2401.16420","version":1},"attestation_state":"computed","paper":{"title":"InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Bin Wang, Conghui He, Dahua Lin, Hang Yan, Haodong Duan, Jiaqi Wang, Jingwen Li, Kai Chen, Linke Ouyang, Maosong Cao, Pan Zhang, Songyang Zhang, Wei Li, Wenwei Zhang, Xiaoyi Dong, Xilin Wei, Xingcheng Zhang, Xinyue Zhang, Yang Gao, Yining Li, Yuhang Cao, Yuhang Zang, Yu Qiao","submitted_at":"2024-01-29T18:59:02Z","abstract_excerpt":"We introduce InternLM-XComposer2, a cutting-edge vision-language model excelling in free-form text-image composition and comprehension. This model goes beyond conventional vision-language understanding, adeptly crafting interleaved text-image content from diverse inputs like outlines, detailed textual specifications, and reference images, enabling highly customizable content creation. InternLM-XComposer2 proposes a Partial LoRA (PLoRA) approach that applies additional LoRA parameters exclusively to image tokens to preserve the integrity of pre-trained language knowledge, striking a balance bet"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2401.16420","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-01-29T18:59:02Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"3b2deff91597c496b7dbbec7f1d2f0eaef3a13ef574dfa65915194c7ee757aa0","abstract_canon_sha256":"8429ed639989a4121da3104fc4fd2393bc12545d4777baa9397279c0f2651057"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.981923Z","signature_b64":"eB8QVqcGx0AnSAhToN6lvwiDRLV3MpPn7NMCWq1sBxB1bLntVUtqnbaRekO2IyLThmYEvPOeVme9iEnFx6kzBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c01c79ef006f0296ea7e99a83236e8f5749d05e79699e4e0c8abe238914c4934","last_reissued_at":"2026-05-17T23:38:14.981310Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.981310Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Bin Wang, Conghui He, Dahua Lin, Hang Yan, Haodong Duan, Jiaqi Wang, Jingwen Li, Kai Chen, Linke Ouyang, Maosong Cao, Pan Zhang, Songyang Zhang, Wei Li, Wenwei Zhang, Xiaoyi Dong, Xilin Wei, Xingcheng Zhang, Xinyue Zhang, Yang Gao, Yining Li, Yuhang Cao, Yuhang Zang, Yu Qiao","submitted_at":"2024-01-29T18:59:02Z","abstract_excerpt":"We introduce InternLM-XComposer2, a cutting-edge vision-language model excelling in free-form text-image composition and comprehension. This model goes beyond conventional vision-language understanding, adeptly crafting interleaved text-image content from diverse inputs like outlines, detailed textual specifications, and reference images, enabling highly customizable content creation. InternLM-XComposer2 proposes a Partial LoRA (PLoRA) approach that applies additional LoRA parameters exclusively to image tokens to preserve the integrity of pre-trained language knowledge, striking a balance bet"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"InternLM-XComposer2 ... not only significantly outperforms existing multimodal models but also matches or even surpasses GPT-4V and Gemini Pro in certain assessments.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That applying additional LoRA parameters exclusively to image tokens preserves the integrity of pre-trained language knowledge while enabling precise vision understanding and high-quality text composition.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"InternLM-XComposer2 introduces Partial LoRA on InternLM2-7B to enable high-quality free-form text-image composition while matching or exceeding GPT-4V on select vision-language benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"20db5fdaa366d3ea375d5eaf133b4d1d5c8a8ec235fee8cb1c3e40b55fbc1945"},"source":{"id":"2401.16420","kind":"arxiv","version":1},"verdict":{"id":"c9f99aae-cc4e-4226-924d-718eb050c6ca","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T05:24:56.204955Z","strongest_claim":"InternLM-XComposer2 ... not only significantly outperforms existing multimodal models but also matches or even surpasses GPT-4V and Gemini Pro in certain assessments.","one_line_summary":"InternLM-XComposer2 introduces Partial LoRA on InternLM2-7B to enable high-quality free-form text-image composition while matching or exceeding GPT-4V on select vision-language benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That applying additional LoRA parameters exclusively to image tokens preserves the integrity of pre-trained language knowledge while enabling precise vision understanding and high-quality text composition.","pith_extraction_headline":"InternLM-XComposer2 generates custom interleaved text-image content by applying LoRA parameters only to image tokens."},"references":{"count":105,"sample":[{"doi":"","year":null,"title":"Nocaps: Novel object captioning at scale","work_id":"d3565525-ac21-4347-be08-bd76632f4d65","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Flamingo: a visual language model for few-shot learning,","work_id":"15887c25-c51f-4381-9fe0-7afe4a3002b7","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1905,"title":"arXiv preprint arXiv:1905.13319 , year=","work_id":"4539c966-2fd4-4238-88a9-60be171a99da","ref_index":3,"cited_arxiv_id":"1905.13319","is_internal_anchor":true},{"doi":"","year":2015,"title":"Lawrence Zitnick, and Devi Parikh","work_id":"1b370a2a-fb0c-43c7-87c5-2ee5a7b8d1f5","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Openflamingo: An open- source framework for training large autoregressive vision- language models","work_id":"9e776b2e-2e57-4719-a352-b4c2ded802d2","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":105,"snapshot_sha256":"23ba002c02aba91f1658a8d8f13389cd310e0b6ddb6d7ed620ce9e4ef2eb49cb","internal_anchors":16},"formal_canon":{"evidence_count":2,"snapshot_sha256":"6aa19b876cef739591a07a1ed75005208dd0001ce3cc60be404d95ae2df23f47"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2401.16420","created_at":"2026-05-17T23:38:14.981419+00:00"},{"alias_kind":"arxiv_version","alias_value":"2401.16420v1","created_at":"2026-05-17T23:38:14.981419+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2401.16420","created_at":"2026-05-17T23:38:14.981419+00:00"},{"alias_kind":"pith_short_12","alias_value":"YAOHT3YAN4BJ","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"YAOHT3YAN4BJN2T6","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"YAOHT3YA","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":21,"internal_anchor_count":21,"sample":[{"citing_arxiv_id":"2402.03766","citing_title":"MobileVLM V2: Faster and Stronger Baseline for Vision Language Model","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2510.21122","citing_title":"NoisyGRPO: Incentivizing Multimodal CoT Reasoning via Noise Injection and Bayesian Estimation","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2407.03320","citing_title":"InternLM-XComposer-2.5: A Versatile Large Vision Language Model Supporting Long-Contextual Input and Output","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2403.18814","citing_title":"Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2403.14624","citing_title":"MathVerse: Does Your Multi-modal LLM Truly See the Diagrams in Visual Math Problems?","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2407.01284","citing_title":"We-Math: Does Your Large Multimodal Model Achieve Human-like Mathematical Reasoning?","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2410.17434","citing_title":"LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2501.04001","citing_title":"Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2602.00181","citing_title":"CamReasoner: Reinforcing Camera Movement Understanding via Structured Spatial Reasoning","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2404.12390","citing_title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2311.16502","citing_title":"MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13775","citing_title":"RoboEvolve: Co-Evolving Planner-Simulator for Robotic Manipulation with Limited Data","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2404.16821","citing_title":"How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2403.20330","citing_title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2307.06281","citing_title":"MMBench: Is Your Multi-modal Model an All-around Player?","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2503.01743","citing_title":"Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2403.05525","citing_title":"DeepSeek-VL: Towards Real-World Vision-Language Understanding","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10971","citing_title":"MMR-AD: A Large-Scale Multimodal Dataset for Benchmarking General Anomaly Detection with Multimodal Large Language Models","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2406.07476","citing_title":"VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2604.04838","citing_title":"Less Detail, Better Answers: Degradation-Driven Prompting for VQA","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14605","citing_title":"Towards Design Compositing","ref_index":12,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V","json":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V.json","graph_json":"https://pith.science/api/pith-number/YAOHT3YAN4BJN2T6TGUDENXI6V/graph.json","events_json":"https://pith.science/api/pith-number/YAOHT3YAN4BJN2T6TGUDENXI6V/events.json","paper":"https://pith.science/paper/YAOHT3YA"},"agent_actions":{"view_html":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V","download_json":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V.json","view_paper":"https://pith.science/paper/YAOHT3YA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2401.16420&json=true","fetch_graph":"https://pith.science/api/pith-number/YAOHT3YAN4BJN2T6TGUDENXI6V/graph.json","fetch_events":"https://pith.science/api/pith-number/YAOHT3YAN4BJN2T6TGUDENXI6V/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/action/storage_attestation","attest_author":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/action/author_attestation","sign_citation":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/action/citation_signature","submit_replication":"https://pith.science/pith/YAOHT3YAN4BJN2T6TGUDENXI6V/action/replication_record"}},"created_at":"2026-05-17T23:38:14.981419+00:00","updated_at":"2026-05-17T23:38:14.981419+00:00"}