{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:M56RT635O3UJWLV6ECFPXRJDOT","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"d423f6009012c6e415551ba5b524f51d92dd05608cf7355693107cba48281c06","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-11-13T18:59:47Z","title_canon_sha256":"264902f5b7ca56be994ab61c7b18762656d7555d64a3e668d98375fb3664e00b"},"schema_version":"1.0","source":{"id":"2311.07575","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2311.07575","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"arxiv_version","alias_value":"2311.07575v1","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2311.07575","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"pith_short_12","alias_value":"M56RT635O3UJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"M56RT635O3UJWLV6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"M56RT635","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:bcb029024d7d81ce4c34968e55ce38fb610b14322c2a19197634f30e209d7144","target":"graph","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Based on our proposed joint mixing, SPHINX exhibits superior multi-modal understanding capabilities on a wide range of applications."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that directly integrating weights from LLMs trained on real-world and synthetic data will efficiently incorporate diverse semantics with favorable robustness without introducing conflicts or degrading performance."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SPHINX improves multi-modal LLMs through joint mixing of weights, tasks, and visual embeddings from varied sources to achieve stronger alignment and multi-purpose capabilities."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Mixing weights from real-world and synthetic LLMs with varied tasks and visual embeddings produces a single versatile multi-modal model."}],"snapshot_sha256":"afe52fe8762e8dc4f201c84a3da32db1823ce490ae268e58a691dba7f7026e0e"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"59ad857b0f1fae70083af3cc60fdce53ce3fa971177ccf32280f6a86c72f43b9"},"paper":{"abstract_excerpt":"We present SPHINX, a versatile multi-modal large language model (MLLM) with a joint mixing of model weights, tuning tasks, and visual embeddings. First, for stronger vision-language alignment, we unfreeze the large language model (LLM) during pre-training, and introduce a weight mix strategy between LLMs trained by real-world and synthetic data. By directly integrating the weights from two domains, the mixed LLM can efficiently incorporate diverse semantics with favorable robustness. Then, to enable multi-purpose capabilities, we mix a variety of tasks for joint visual instruction tuning, and ","authors_text":"Chen Lin, Chris Liu, Han Qiu, Han Xiao, Hongsheng Li, Jiaming Han, Keqin Chen, Longtian Qiu, Peng Gao, Renrui Zhang, Siyuan Huang, Wenqi Shao, Xuming He, Yichi Zhang, Yu Qiao, Ziyi Lin","cross_cats":["cs.AI","cs.CL","cs.LG"],"headline":"Mixing weights from real-world and synthetic LLMs with varied tasks and visual embeddings produces a single versatile multi-modal model.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-11-13T18:59:47Z","title":"SPHINX: The Joint Mixing of Weights, Tasks, and Visual Embeddings for Multi-modal Large Language Models"},"references":{"count":45,"internal_anchors":22,"resolved_work":45,"sample":[{"cited_arxiv_id":"2308.12966","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al","work_id":"c78cbfc8-8ead-4365-9b3c-098dabd131d4","year":1901},{"cited_arxiv_id":"2310.09478","doi":"","is_internal_anchor":true,"ref_index":3,"title":"MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning","work_id":"fb62cd1b-3991-40be-a987-3cfa5772b5b5","year":2023},{"cited_arxiv_id":"2305.06500","doi":"","is_internal_anchor":true,"ref_index":4,"title":"InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning","work_id":"f3aac728-ded0-4e55-aa9e-4a1635d4313d","year":null},{"cited_arxiv_id":"1810.04805","doi":"","is_internal_anchor":true,"ref_index":5,"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","year":null}],"snapshot_sha256":"0d11f8d387a29d6782b03abd127b414dbeca1775174c43280f8d00cb952cefba"},"source":{"id":"2311.07575","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T02:58:20.956133Z","id":"82f64f73-d45f-4971-b152-2ea38f5c8154","model_set":{"reader":"grok-4.3"},"one_line_summary":"SPHINX improves multi-modal LLMs through joint mixing of weights, tasks, and visual embeddings from varied sources to achieve stronger alignment and multi-purpose capabilities.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Mixing weights from real-world and synthetic LLMs with varied tasks and visual embeddings produces a single versatile multi-modal model.","strongest_claim":"Based on our proposed joint mixing, SPHINX exhibits superior multi-modal understanding capabilities on a wide range of applications.","weakest_assumption":"The assumption that directly integrating weights from LLMs trained on real-world and synthetic data will efficiently incorporate diverse semantics with favorable robustness without introducing conflicts or degrading performance."}},"verdict_id":"82f64f73-d45f-4971-b152-2ea38f5c8154"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f75e7f578b2a9351655df38d399cb4160f7f5b41f073d2c92f04365e3b2dfa07","target":"record","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"d423f6009012c6e415551ba5b524f51d92dd05608cf7355693107cba48281c06","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-11-13T18:59:47Z","title_canon_sha256":"264902f5b7ca56be994ab61c7b18762656d7555d64a3e668d98375fb3664e00b"},"schema_version":"1.0","source":{"id":"2311.07575","kind":"arxiv","version":1}},"canonical_sha256":"677d19fb7d76e89b2ebe208afbc52374fbff19730c04592807ecbb5291149738","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"677d19fb7d76e89b2ebe208afbc52374fbff19730c04592807ecbb5291149738","first_computed_at":"2026-05-17T23:38:15.321821Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:15.321821Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"U9oBUu6Ptn0MC7adC8w2DZcykhGrTgX2KleI76/ltcDwi5gPohB4tGlXAmC4pl7utYIrajEIPtVvs6vwOOE8Bg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:15.322497Z","signed_message":"canonical_sha256_bytes"},"source_id":"2311.07575","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f75e7f578b2a9351655df38d399cb4160f7f5b41f073d2c92f04365e3b2dfa07","sha256:bcb029024d7d81ce4c34968e55ce38fb610b14322c2a19197634f30e209d7144"],"state_sha256":"85720bf88e7caac5e8a972d9ce1b31cd213b2b9c6c92699ef64da875a343200e"}