{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:2KOQ3SMADRV4XAYY3DLSLG76FH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"923a9976303f3c648273dba6d0d92803fad89135dc3e2e95942bff3913bb9ceb","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-03-14T17:51:32Z","title_canon_sha256":"98612f0506b0805073aeaaeaf93f8af49f3f2ccba777087e6dd48a1edd8d0f0a"},"schema_version":"1.0","source":{"id":"2403.09611","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2403.09611","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2403.09611v4","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2403.09611","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"2KOQ3SMADRV4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"2KOQ3SMADRV4XAYY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"2KOQ3SMA","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:e12211cc311c9f374d3cc2b39f895396367de4f73499d4c0a0f6a5febe80131a","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"For large-scale multimodal pre-training, a careful mix of image-caption, interleaved image-text, and text-only data is crucial for achieving state-of-the-art few-shot results across multiple benchmarks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the ablations performed are comprehensive enough to isolate the true importance of data composition and image encoder choices without confounding effects from untested interactions or hyperparameter choices."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MM1 models achieve state-of-the-art few-shot multimodal results by pre-training on a careful mix of image-caption, interleaved, and text-only data with optimized image encoders."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A careful mix of image-caption, interleaved image-text, and text-only data during pre-training is crucial for state-of-the-art few-shot results in multimodal large language models."}],"snapshot_sha256":"bcd878f5465a8e225a182212643fabbd3819a335f32611ac0995bb0b3e6616af"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"4bbefda8724716d1fdebfb7f51abf7fef21ba16a801f3cb605ec55ff1bf66c1a"},"paper":{"abstract_excerpt":"In this work, we discuss building performant Multimodal Large Language Models (MLLMs). In particular, we study the importance of various architecture components and data choices. Through careful and comprehensive ablations of the image encoder, the vision language connector, and various pre-training data choices, we identified several crucial design lessons. For example, we demonstrate that for large-scale multimodal pre-training using a careful mix of image-caption, interleaved image-text, and text-only data is crucial for achieving state-of-the-art (SOTA) few-shot results across multiple ben","authors_text":"Alexander Toshev, Ankur Jain, Anton Belyi, Aonan Zhang, Bowen Zhang, Brandon McKinzie, Chong Wang, Dhruti Shah, Doug Kang, Floris Weers, Futang Peng, Guoli Yin, Haotian Zhang, Hongyu H\\`e, Jean-Philippe Fauconnier, Jianyu Wang, Karanjeet Singh, Mark Lee, Max Schwarzer, Nan Du, Peter Grasch, Philipp Dufter, Ruoming Pang, Sam Dodge, Sam Wiseman, Tao Lei, Tom Gunter, Xiang Kong, Xianzhi Du, Yinfei Yang, Zhe Gan, Zirui Wang","cross_cats":["cs.CL","cs.LG"],"headline":"A careful mix of image-caption, interleaved image-text, and text-only data during pre-training is crucial for state-of-the-art few-shot results in multimodal large language models.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-03-14T17:51:32Z","title":"MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training"},"references":{"count":137,"internal_anchors":47,"resolved_work":137,"sample":[{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"In: ICCV (2019)","work_id":"7c1b8382-b9e1-44a0-a966-63773acaec5c","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Alayrac, J.B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., Ring, R., Rutherford, E., Cabi, S., Han, T., Gong, Z., Samangooei, S., Monteiro,","work_id":"bce27169-4fab-4916-967e-1f87eeac9fdb","year":2022},{"cited_arxiv_id":"2308.01390","doi":"","is_internal_anchor":true,"ref_index":4,"title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models","work_id":"87bfa84a-e663-4165-806f-93ef439d88d0","year":2023},{"cited_arxiv_id":"2308.12966","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","year":2023}],"snapshot_sha256":"7ef4e9ade704b04fc25f5181b2f44cfb9f6bf1df6c26572b44364f03eed55d5a"},"source":{"id":"2403.09611","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-16T04:01:36.939836Z","id":"e8a6560f-96b3-48fc-89ba-e052b67b750a","model_set":{"reader":"grok-4.3"},"one_line_summary":"MM1 models achieve state-of-the-art few-shot multimodal results by pre-training on a careful mix of image-caption, interleaved, and text-only data with optimized image encoders.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A careful mix of image-caption, interleaved image-text, and text-only data during pre-training is crucial for state-of-the-art few-shot results in multimodal large language models.","strongest_claim":"For large-scale multimodal pre-training, a careful mix of image-caption, interleaved image-text, and text-only data is crucial for achieving state-of-the-art few-shot results across multiple benchmarks.","weakest_assumption":"That the ablations performed are comprehensive enough to isolate the true importance of data composition and image encoder choices without confounding effects from untested interactions or hyperparameter choices."}},"verdict_id":"e8a6560f-96b3-48fc-89ba-e052b67b750a"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:03f348901bd42b350bb9fb8af319033bbf5218704ac05025b89f5e6cd07f24d2","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"923a9976303f3c648273dba6d0d92803fad89135dc3e2e95942bff3913bb9ceb","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-03-14T17:51:32Z","title_canon_sha256":"98612f0506b0805073aeaaeaf93f8af49f3f2ccba777087e6dd48a1edd8d0f0a"},"schema_version":"1.0","source":{"id":"2403.09611","kind":"arxiv","version":4}},"canonical_sha256":"d29d0dc9801c6bcb8318d8d7259bfe29e407418722613cd139c0b9faa3e3b0fc","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d29d0dc9801c6bcb8318d8d7259bfe29e407418722613cd139c0b9faa3e3b0fc","first_computed_at":"2026-05-17T23:38:49.147551Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.147551Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"L+c/1XOwsz1+wEq1yU5lcDksSh9P63OzsYGSDTUmiqWanjwyTdzMXeINbqp/ye3XR/jEy/nzb71Um2EHYgP8AA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.148244Z","signed_message":"canonical_sha256_bytes"},"source_id":"2403.09611","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:03f348901bd42b350bb9fb8af319033bbf5218704ac05025b89f5e6cd07f24d2","sha256:e12211cc311c9f374d3cc2b39f895396367de4f73499d4c0a0f6a5febe80131a"],"state_sha256":"e9734a8b378ee086f5a1ff6667fd5ab41cb95982213550eacc73a8bb38a962a4"}