{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:CTVKNTPZG7G2XIRXBJVDSVT54A","short_pith_number":"pith:CTVKNTPZ","schema_version":"1.0","canonical_sha256":"14eaa6cdf937cdaba2370a6a39567de015ee54eca0c505143d4d420dfa34f0e5","source":{"kind":"arxiv","id":"2311.03079","version":2},"attestation_state":"computed","paper":{"title":"CogVLM: Visual Expert for Pretrained Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A trainable visual expert module inserted into the attention and FFN layers of a frozen language model enables deep vision-language fusion.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bin Xu, Jiazheng Xu, Jie Tang, Ji Qi, Juanzi Li, Junhui Ji, Lei Zhao, Ming Ding, Qingsong Lv, Weihan Wang, Wenmeng Yu, Wenyi Hong, Xixuan Song, Yan Wang, Yuxiao Dong, Zhuoyi Yang","submitted_at":"2023-11-06T13:04:39Z","abstract_excerpt":"We introduce CogVLM, a powerful open-source visual language foundation model. Different from the popular shallow alignment method which maps image features into the input space of language model, CogVLM bridges the gap between the frozen pretrained language model and image encoder by a trainable visual expert module in the attention and FFN layers. As a result, CogVLM enables deep fusion of vision language features without sacrificing any performance on NLP tasks. CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks, including NoCaps, Flicker30k captioning, Ref"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2311.03079","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-11-06T13:04:39Z","cross_cats_sorted":[],"title_canon_sha256":"679fe85268225460d07d2179c1c3c8b521429885cfbc6b874c9f34e37b4130b4","abstract_canon_sha256":"9ed531cb4a2ee62bd4512e8535ec68ef02bf4f67385e61fe8e221a00b5f126b6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:51.022184Z","signature_b64":"o7kYcGp4gF9qm10ak5nbbkjYAW8mHvL04/UPsHqJjJMXWzjRh/f7K85fw64wIfNKEDfAOlitaYwnVVY/8ezlDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"14eaa6cdf937cdaba2370a6a39567de015ee54eca0c505143d4d420dfa34f0e5","last_reissued_at":"2026-05-17T23:38:51.021764Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:51.021764Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"CogVLM: Visual Expert for Pretrained Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A trainable visual expert module inserted into the attention and FFN layers of a frozen language model enables deep vision-language fusion.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bin Xu, Jiazheng Xu, Jie Tang, Ji Qi, Juanzi Li, Junhui Ji, Lei Zhao, Ming Ding, Qingsong Lv, Weihan Wang, Wenmeng Yu, Wenyi Hong, Xixuan Song, Yan Wang, Yuxiao Dong, Zhuoyi Yang","submitted_at":"2023-11-06T13:04:39Z","abstract_excerpt":"We introduce CogVLM, a powerful open-source visual language foundation model. Different from the popular shallow alignment method which maps image features into the input space of language model, CogVLM bridges the gap between the frozen pretrained language model and image encoder by a trainable visual expert module in the attention and FFN layers. As a result, CogVLM enables deep fusion of vision language features without sacrificing any performance on NLP tasks. CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks, including NoCaps, Flicker30k captioning, Ref"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks... surpassing or matching PaLI-X 55B.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The visual expert module can be inserted into the attention and FFN layers of any frozen pretrained language model without requiring changes to the original architecture or loss functions.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"CogVLM adds a trainable visual expert inside frozen language model layers for deep vision-language fusion and reports state-of-the-art results on ten cross-modal benchmarks while preserving NLP performance.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A trainable visual expert module inserted into the attention and FFN layers of a frozen language model enables deep vision-language fusion.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"be90eb575bcab3443ad5b5eeab07951b05affae30c186172073f5f508a425f23"},"source":{"id":"2311.03079","kind":"arxiv","version":2},"verdict":{"id":"abe2f049-9015-4958-aba1-fb6f3eaacd7b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T15:41:06.257046Z","strongest_claim":"CogVLM-17B achieves state-of-the-art performance on 10 classic cross-modal benchmarks... surpassing or matching PaLI-X 55B.","one_line_summary":"CogVLM adds a trainable visual expert inside frozen language model layers for deep vision-language fusion and reports state-of-the-art results on ten cross-modal benchmarks while preserving NLP performance.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The visual expert module can be inserted into the attention and FFN layers of any frozen pretrained language model without requiring changes to the original architecture or loss functions.","pith_extraction_headline":"A trainable visual expert module inserted into the attention and FFN layers of a frozen language model enables deep vision-language fusion."},"references":{"count":33,"sample":[{"doi":"","year":null,"title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models","work_id":"87bfa84a-e663-4165-806f-93ef439d88d0","ref_index":1,"cited_arxiv_id":"2308.01390","is_internal_anchor":true},{"doi":"","year":null,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","ref_index":2,"cited_arxiv_id":"2308.12966","is_internal_anchor":true},{"doi":"","year":1989,"title":"Murel: Multimodal relational reasoning for visual ques- tion answering","work_id":"49d68897-f597-43d7-b9ab-c6810ce5a8f3","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic","work_id":"44525076-312a-4259-b79c-134cd7eeb297","ref_index":4,"cited_arxiv_id":"2306.15195","is_internal_anchor":true},{"doi":"","year":null,"title":"Universal captioner: Long-tail vision-and-language model training through content-style separation.arXiv preprint arXiv:2111.12727,","work_id":"8147134b-8245-480c-a294-d5382f4aa9aa","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":33,"snapshot_sha256":"930bafb70a094ebdbb2dfd3b02d5967ccde3eb04c48ea277d8612c0af9534ebb","internal_anchors":17},"formal_canon":{"evidence_count":2,"snapshot_sha256":"b7d08736e454ae758a45db8623526339624dff559713aac366aeccd11d30943f"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2311.03079","created_at":"2026-05-17T23:38:51.021832+00:00"},{"alias_kind":"arxiv_version","alias_value":"2311.03079v2","created_at":"2026-05-17T23:38:51.021832+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2311.03079","created_at":"2026-05-17T23:38:51.021832+00:00"},{"alias_kind":"pith_short_12","alias_value":"CTVKNTPZG7G2","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"CTVKNTPZG7G2XIRX","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"CTVKNTPZ","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":45,"internal_anchor_count":45,"sample":[{"citing_arxiv_id":"2405.19088","citing_title":"Cracking the Code of Juxtaposition: Can AI Models Understand the Humorous Contradictions","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2410.04509","citing_title":"ErrorRadar: Benchmarking Complex Mathematical Reasoning of Multimodal Large Language Models Via Error Detection","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"2412.00727","citing_title":"Perturb and Recover: Fine-tuning for Effective Backdoor Removal from CLIP","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2412.20718","citing_title":"MM-MoralBench: A MultiModal Moral Evaluation Benchmark for Large Vision-Language Models","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2412.17574","citing_title":"HumanVBench: Probing Human-Centric Video Understanding in MLLMs with Automatically Synthesized Benchmarks","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2501.02955","citing_title":"MotionBench: Benchmarking and Improving Fine-grained Video Motion Understanding for Vision Language Models","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2503.23733","citing_title":"AdaMMS: Model Merging for Heterogeneous Multimodal Large Language Models with Unsupervised Coefficient Optimization","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2503.23137","citing_title":"When 'YES' Meets 'BUT': Can Large Models Comprehend Contradictory Humor Through Comparative Reasoning?","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2505.11809","citing_title":"From Street View to Visual Network: Mapping the Visibility of Urban Landmarks with Vision-Language Models","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17341","citing_title":"Single-Sample Black-Box Membership Inference Attack against Vision-Language Models via Cross-modal Semantic Alignment","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2408.04840","citing_title":"mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models","ref_index":250,"is_internal_anchor":true},{"citing_arxiv_id":"2406.08035","citing_title":"LVBench: An Extreme Long Video Understanding Benchmark","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2504.14239","citing_title":"InfiGUI-R1: Advancing Multimodal GUI Agents from Reactive Actors to Deliberative Reasoners","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2510.21122","citing_title":"NoisyGRPO: Incentivizing Multimodal CoT Reasoning via Noise Injection and Bayesian Estimation","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2411.04996","citing_title":"Mixture-of-Transformers: A Sparse and Scalable Architecture for Multi-Modal Foundation Models","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2501.00321","citing_title":"OCRBench v2: An Improved Benchmark for Evaluating Large Multimodal Models on Visual Text Localization and Reasoning","ref_index":144,"is_internal_anchor":true},{"citing_arxiv_id":"2401.10935","citing_title":"SeeClick: Harnessing GUI Grounding for Advanced Visual GUI Agents","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2403.18814","citing_title":"Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2408.16500","citing_title":"CogVLM2: Visual Language Models for Image and Video Understanding","ref_index":78,"is_internal_anchor":true},{"citing_arxiv_id":"2410.10594","citing_title":"VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality Documents","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2410.17434","citing_title":"LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2412.21059","citing_title":"VisionReward: Fine-Grained Multi-Dimensional Human Preference Learning for Image and Video Generation","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2411.10442","citing_title":"Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization","ref_index":100,"is_internal_anchor":true},{"citing_arxiv_id":"2403.09611","citing_title":"MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training","ref_index":114,"is_internal_anchor":true},{"citing_arxiv_id":"2306.13549","citing_title":"A Survey on Multimodal Large Language Models","ref_index":77,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A","json":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A.json","graph_json":"https://pith.science/api/pith-number/CTVKNTPZG7G2XIRXBJVDSVT54A/graph.json","events_json":"https://pith.science/api/pith-number/CTVKNTPZG7G2XIRXBJVDSVT54A/events.json","paper":"https://pith.science/paper/CTVKNTPZ"},"agent_actions":{"view_html":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A","download_json":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A.json","view_paper":"https://pith.science/paper/CTVKNTPZ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2311.03079&json=true","fetch_graph":"https://pith.science/api/pith-number/CTVKNTPZG7G2XIRXBJVDSVT54A/graph.json","fetch_events":"https://pith.science/api/pith-number/CTVKNTPZG7G2XIRXBJVDSVT54A/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A/action/storage_attestation","attest_author":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A/action/author_attestation","sign_citation":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A/action/citation_signature","submit_replication":"https://pith.science/pith/CTVKNTPZG7G2XIRXBJVDSVT54A/action/replication_record"}},"created_at":"2026-05-17T23:38:51.021832+00:00","updated_at":"2026-05-17T23:38:51.021832+00:00"}