{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:AT6IIIBKMGQXJKXB23WLN6VKNG","short_pith_number":"pith:AT6IIIBK","schema_version":"1.0","canonical_sha256":"04fc84202a61a174aae1d6ecb6faaa69b77c7291d92966f229d694ee8862069a","source":{"kind":"arxiv","id":"2304.14178","version":3},"attestation_state":"computed","paper":{"title":"mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Anwen Hu, Chenliang Li, Fei Huang, Guohai Xu, Haiyang Xu, Hehong Chen, Jiabo Ye, Jingren Zhou, Ji Zhang, Junfeng Tian, Junyang Wang, Ming Yan, Pengcheng Shi, Qinghao Ye, Qi Qian, Yaya Shi, Yiyang Zhou, Yuanhong Xu","submitted_at":"2023-04-27T13:27:01Z","abstract_excerpt":"Large language models (LLMs) have demonstrated impressive zero-shot abilities on a variety of open-ended tasks, while recent research has also explored the use of LLMs for multi-modal generation. In this study, we introduce mPLUG-Owl, a novel training paradigm that equips LLMs with multi-modal abilities through modularized learning of foundation LLM, a visual knowledge module, and a visual abstractor module. This approach can support multiple modalities and facilitate diverse unimodal and multimodal abilities through modality collaboration. The training paradigm of mPLUG-Owl involves a two-sta"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2304.14178","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2023-04-27T13:27:01Z","cross_cats_sorted":["cs.CV","cs.LG"],"title_canon_sha256":"3c5d275ad162ad38d5bf8cf676a8b0cfe0b40304c324518e72226a64d6c1fbc9","abstract_canon_sha256":"4d68b9992c87ab7c6a1e1216581044c9ca2c24d1b0c9f55e555a111729d8318e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:34:59.992467Z","signature_b64":"HFnq1SM8xoGf9WwmoakxyIN90UdO1+7Z4IOniqgY+yM74SjQTGPTS9ciRPvXeO4zsPlKVwirk+mGG7lqvChjCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"04fc84202a61a174aae1d6ecb6faaa69b77c7291d92966f229d694ee8862069a","last_reissued_at":"2026-05-18T02:34:59.992030Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:34:59.992030Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Anwen Hu, Chenliang Li, Fei Huang, Guohai Xu, Haiyang Xu, Hehong Chen, Jiabo Ye, Jingren Zhou, Ji Zhang, Junfeng Tian, Junyang Wang, Ming Yan, Pengcheng Shi, Qinghao Ye, Qi Qian, Yaya Shi, Yiyang Zhou, Yuanhong Xu","submitted_at":"2023-04-27T13:27:01Z","abstract_excerpt":"Large language models (LLMs) have demonstrated impressive zero-shot abilities on a variety of open-ended tasks, while recent research has also explored the use of LLMs for multi-modal generation. In this study, we introduce mPLUG-Owl, a novel training paradigm that equips LLMs with multi-modal abilities through modularized learning of foundation LLM, a visual knowledge module, and a visual abstractor module. This approach can support multiple modalities and facilitate diverse unimodal and multimodal abilities through modality collaboration. The training paradigm of mPLUG-Owl involves a two-sta"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2304.14178","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2304.14178","created_at":"2026-05-18T02:34:59.992097+00:00"},{"alias_kind":"arxiv_version","alias_value":"2304.14178v3","created_at":"2026-05-18T02:34:59.992097+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2304.14178","created_at":"2026-05-18T02:34:59.992097+00:00"},{"alias_kind":"pith_short_12","alias_value":"AT6IIIBKMGQX","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"AT6IIIBKMGQXJKXB","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"AT6IIIBK","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":72,"internal_anchor_count":51,"sample":[{"citing_arxiv_id":"2403.10559","citing_title":"Generative Models and Connected and Automated Vehicles: A Survey in Exploring the Intersection of Transportation and AI","ref_index":102,"is_internal_anchor":true},{"citing_arxiv_id":"2410.14702","citing_title":"Polymath: A Challenging Multi-modal Mathematical Reasoning Benchmark","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2411.16771","citing_title":"VidHal: Benchmarking Temporal Hallucinations in Vision LLMs","ref_index":61,"is_internal_anchor":true},{"citing_arxiv_id":"2411.18111","citing_title":"When Large Vision-Language Models Meet Person Re-Identification","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2412.00131","citing_title":"Open-Sora Plan: Open-Source Large Video Generation Model","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2412.17574","citing_title":"HumanVBench: Probing Human-Centric Video Understanding in MLLMs with Automatically Synthesized Benchmarks","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2501.05067","citing_title":"LLaVA-Octopus: Unlocking Instruction-Driven Adaptive Projector Fusion for Video Understanding","ref_index":80,"is_internal_anchor":true},{"citing_arxiv_id":"2504.07148","citing_title":"Q-Agent: Quality-Driven Chain-of-Thought Image Restoration Agent through Robust Multimodal Large Language Model","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2509.15435","citing_title":"ORCA: An Agentic Reasoning Framework for Hallucination and Adversarial Robustness in Vision-Language Models","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2603.25120","citing_title":"DFLOP: A Data-driven Framework for Multimodal LLM Training Pipeline Optimization","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16409","citing_title":"Multilingual OCR-Aware Fine-Tuning and Prompt-Guided Chain-of-Thought Reasoning for Multimodal Large Language Models","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2406.03520","citing_title":"VideoPhy: Evaluating Physical Commonsense for Video Generation","ref_index":114,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19950","citing_title":"AffectVerse: Emotional World Models for Multimodal Affective Computing","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2408.04840","citing_title":"mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models","ref_index":261,"is_internal_anchor":true},{"citing_arxiv_id":"2307.06435","citing_title":"A Comprehensive Overview of Large Language Models","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2505.21472","citing_title":"Mitigating Hallucination in Large Vision-Language Models via Adaptive Attention Calibration","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2506.09522","citing_title":"Revisit What You See: Revealing Visual Semantics in Vision Tokens to Guide LVLM Decoding","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2507.09861","citing_title":"A Survey on MLLM-based Visually Rich Document Understanding: Methods, Challenges, and Emerging Trends","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2402.03766","citing_title":"MobileVLM V2: Faster and Stronger Baseline for Vision Language Model","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2509.15435","citing_title":"ORCA: An Agentic Reasoning Framework for Hallucination and Adversarial Robustness in Vision-Language Models","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2311.04257","citing_title":"mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration","ref_index":70,"is_internal_anchor":true},{"citing_arxiv_id":"2310.00754","citing_title":"Analyzing and Mitigating Object Hallucination in Large Vision-Language Models","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2311.17005","citing_title":"MVBench: A Comprehensive Multi-modal Video Understanding Benchmark","ref_index":94,"is_internal_anchor":true},{"citing_arxiv_id":"2311.12871","citing_title":"An Embodied Generalist Agent in 3D World","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2402.11411","citing_title":"Aligning Modalities in Vision Large Language Models via Preference Fine-tuning","ref_index":179,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG","json":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG.json","graph_json":"https://pith.science/api/pith-number/AT6IIIBKMGQXJKXB23WLN6VKNG/graph.json","events_json":"https://pith.science/api/pith-number/AT6IIIBKMGQXJKXB23WLN6VKNG/events.json","paper":"https://pith.science/paper/AT6IIIBK"},"agent_actions":{"view_html":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG","download_json":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG.json","view_paper":"https://pith.science/paper/AT6IIIBK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2304.14178&json=true","fetch_graph":"https://pith.science/api/pith-number/AT6IIIBKMGQXJKXB23WLN6VKNG/graph.json","fetch_events":"https://pith.science/api/pith-number/AT6IIIBKMGQXJKXB23WLN6VKNG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG/action/storage_attestation","attest_author":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG/action/author_attestation","sign_citation":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG/action/citation_signature","submit_replication":"https://pith.science/pith/AT6IIIBKMGQXJKXB23WLN6VKNG/action/replication_record"}},"created_at":"2026-05-18T02:34:59.992097+00:00","updated_at":"2026-05-18T02:34:59.992097+00:00"}