{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:OGXGRTKCWZTF6E46FCQUPHGSCH","short_pith_number":"pith:OGXGRTKC","schema_version":"1.0","canonical_sha256":"71ae68cd42b6665f139e28a1479cd211e56ebd782f3efa8b38a36b487ce38b84","source":{"kind":"arxiv","id":"2309.15112","version":5},"attestation_state":"computed","paper":{"title":"InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"InternLM-XComposer generates articles with automatically inserted context-appropriate images while achieving state-of-the-art results on vision-language benchmarks.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bin Wang, Chao Xu, Conghui He, Dahua Lin, Hang Yan, Haodong Duan, Jiaqi Wang, Jingwen Li, Kai Chen, Linke Ouyang, Pan Zhang, Shuangrui Ding, Songyang Zhang, Wei Li, Wenwei Zhang, Xiaoyi Dong, Xingcheng Zhang, Xinyue Zhang, Yuhang Cao, Yu Qiao, Zhiyuan Zhao","submitted_at":"2023-09-26T17:58:20Z","abstract_excerpt":"We propose InternLM-XComposer, a vision-language large model that enables advanced image-text comprehension and composition. The innovative nature of our model is highlighted by three appealing properties: 1) Interleaved Text-Image Composition: InternLM-XComposer can effortlessly generate coherent and contextual articles that seamlessly integrate images, providing a more engaging and immersive reading experience. Simply provide a writing instruction, and our system will generate the corresponding manuscript. It can intelligently identify the areas in the text where images would enhance the con"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2309.15112","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-09-26T17:58:20Z","cross_cats_sorted":[],"title_canon_sha256":"eda8b0b16f0169266a3c74c46c739ab855ef48ca4e4863a8abf7e26df1a59fc5","abstract_canon_sha256":"6fd5a608d0dbd3dbb100cb2814ac5a6c4638bdffe3cc60835e63055b9deb6d06"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.920481Z","signature_b64":"9UpcmwReVn46KEA8Ir9e908hp4kSvVYwXiYwLlqmXlnQsGBEDrm95fBEfz4y7ri2gQUJk3bFwNCfRrvDXBrNCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"71ae68cd42b6665f139e28a1479cd211e56ebd782f3efa8b38a36b487ce38b84","last_reissued_at":"2026-05-17T23:38:13.920003Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.920003Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"InternLM-XComposer generates articles with automatically inserted context-appropriate images while achieving state-of-the-art results on vision-language benchmarks.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bin Wang, Chao Xu, Conghui He, Dahua Lin, Hang Yan, Haodong Duan, Jiaqi Wang, Jingwen Li, Kai Chen, Linke Ouyang, Pan Zhang, Shuangrui Ding, Songyang Zhang, Wei Li, Wenwei Zhang, Xiaoyi Dong, Xingcheng Zhang, Xinyue Zhang, Yuhang Cao, Yu Qiao, Zhiyuan Zhao","submitted_at":"2023-09-26T17:58:20Z","abstract_excerpt":"We propose InternLM-XComposer, a vision-language large model that enables advanced image-text comprehension and composition. The innovative nature of our model is highlighted by three appealing properties: 1) Interleaved Text-Image Composition: InternLM-XComposer can effortlessly generate coherent and contextual articles that seamlessly integrate images, providing a more engaging and immersive reading experience. Simply provide a writing instruction, and our system will generate the corresponding manuscript. It can intelligently identify the areas in the text where images would enhance the con"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our model consistently achieves state-of-the-art results across various mainstream benchmarks for vision-language foundational models, including MME Benchmark, MMBench, MMBench-CN, Seed-Bench, CCBench, QBench and Tiny LVLM.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the custom human-plus-GPT-4V evaluation procedure for text-image composition reliably measures quality and that the training data strategies produce genuine comprehension rather than benchmark overfitting.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"InternLM-XComposer generates articles with seamlessly integrated images and achieves state-of-the-art results on vision-language benchmarks including MME, MMBench, and Seed-Bench.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"InternLM-XComposer generates articles with automatically inserted context-appropriate images while achieving state-of-the-art results on vision-language benchmarks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"f8ed9f971dae1f26519a8fee7dac15b79d8cdf98b72ee587993f77d63660f803"},"source":{"id":"2309.15112","kind":"arxiv","version":5},"verdict":{"id":"2d6ca698-bfcf-41ca-9294-b8eb124358d4","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T13:42:12.353642Z","strongest_claim":"Our model consistently achieves state-of-the-art results across various mainstream benchmarks for vision-language foundational models, including MME Benchmark, MMBench, MMBench-CN, Seed-Bench, CCBench, QBench and Tiny LVLM.","one_line_summary":"InternLM-XComposer generates articles with seamlessly integrated images and achieves state-of-the-art results on vision-language benchmarks including MME, MMBench, and Seed-Bench.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the custom human-plus-GPT-4V evaluation procedure for text-image composition reliably measures quality and that the training data strategies produce genuine comprehension rather than benchmark overfitting.","pith_extraction_headline":"InternLM-XComposer generates articles with automatically inserted context-appropriate images while achieving state-of-the-art results on vision-language benchmarks."},"references":{"count":119,"sample":[{"doi":"","year":null,"title":"Flamingo: a visual language model for few-shot learning,","work_id":"15887c25-c51f-4381-9fe0-7afe4a3002b7","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2015,"title":"Lawrence Zitnick, and Devi Parikh","work_id":"1b370a2a-fb0c-43c7-87c5-2ee5a7b8d1f5","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Openflamingo: An open- source framework for training large autoregressive vision- language models","work_id":"e7a6d057-b3ee-4171-89e8-73ba48d4a29d","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Qwen-vl: A frontier large vision-language model with versatile abilities","work_id":"6bae39c3-05dd-4dc6-a806-3ff8fe32dfea","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Baichuan 2: Open large-scale language models","work_id":"532f87f6-8b4c-47b9-9df3-fb6dfda2b80a","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":119,"snapshot_sha256":"a5e1491bde591293eea0e6dacbb6f3db36fc34a8260aba792a03cfc5b381db40","internal_anchors":10},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2309.15112","created_at":"2026-05-17T23:38:13.920078+00:00"},{"alias_kind":"arxiv_version","alias_value":"2309.15112v5","created_at":"2026-05-17T23:38:13.920078+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2309.15112","created_at":"2026-05-17T23:38:13.920078+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":17,"sample":[{"citing_arxiv_id":"2501.01957","citing_title":"VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2407.03320","citing_title":"InternLM-XComposer-2.5: A Versatile Large Vision Language Model Supporting Long-Contextual Input and Output","ref_index":174,"is_internal_anchor":true},{"citing_arxiv_id":"2401.10935","citing_title":"SeeClick: Harnessing GUI Grounding for Advanced Visual GUI Agents","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2403.18814","citing_title":"Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2401.16420","citing_title":"InternLM-XComposer2: Mastering Free-form Text-Image Composition and Comprehension in Vision-Language Large Model","ref_index":96,"is_internal_anchor":true},{"citing_arxiv_id":"2511.21471","citing_title":"SpatialBench: Benchmarking Multimodal Large Language Models for Spatial Cognition","ref_index":70,"is_internal_anchor":true},{"citing_arxiv_id":"2501.04001","citing_title":"Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos","ref_index":112,"is_internal_anchor":true},{"citing_arxiv_id":"2408.13257","citing_title":"MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans?","ref_index":72,"is_internal_anchor":true},{"citing_arxiv_id":"2401.15947","citing_title":"MoE-LLaVA: Mixture of Experts for Large Vision-Language Models","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2404.14396","citing_title":"SEED-X: Multimodal Models with Unified Multi-granularity Comprehension and Generation","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2312.14238","citing_title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","ref_index":178,"is_internal_anchor":true},{"citing_arxiv_id":"2311.12793","citing_title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2403.20330","citing_title":"Are We on the Right Way for Evaluating Large Vision-Language Models?","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2307.06281","citing_title":"MMBench: Is Your Multi-modal Model an All-around Player?","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2503.01743","citing_title":"Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10973","citing_title":"CFMS: A Coarse-to-Fine Multimodal Synthesis Framework for Enhanced Tabular Reasoning","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13565","citing_title":"UHR-BAT: Budget-Aware Token Compression Vision-Language model for Ultra-High-Resolution Remote Sensing","ref_index":23,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH","json":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH.json","graph_json":"https://pith.science/api/pith-number/OGXGRTKCWZTF6E46FCQUPHGSCH/graph.json","events_json":"https://pith.science/api/pith-number/OGXGRTKCWZTF6E46FCQUPHGSCH/events.json","paper":"https://pith.science/paper/OGXGRTKC"},"agent_actions":{"view_html":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH","download_json":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH.json","view_paper":"https://pith.science/paper/OGXGRTKC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2309.15112&json=true","fetch_graph":"https://pith.science/api/pith-number/OGXGRTKCWZTF6E46FCQUPHGSCH/graph.json","fetch_events":"https://pith.science/api/pith-number/OGXGRTKCWZTF6E46FCQUPHGSCH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH/action/storage_attestation","attest_author":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH/action/author_attestation","sign_citation":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH/action/citation_signature","submit_replication":"https://pith.science/pith/OGXGRTKCWZTF6E46FCQUPHGSCH/action/replication_record"}},"created_at":"2026-05-17T23:38:13.920078+00:00","updated_at":"2026-05-17T23:38:13.920078+00:00"}