{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:KD37OJPHLG7ECMVMI4CUQI777Z","short_pith_number":"pith:KD37OJPH","canonical_record":{"source":{"id":"2605.12960","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T03:50:54Z","cross_cats_sorted":[],"title_canon_sha256":"1373ba8a1da4cf3771021660241b7d85437ab61e60bb136f615a8630a94fcbb4","abstract_canon_sha256":"7fcbce155f5a653bc4d9cb8f097abf3f8237354f11cd20731ddaa486a6d9045c"},"schema_version":"1.0"},"canonical_sha256":"50f7f725e759be4132ac47054823fffe45fae4028a59fa662ff78cdd107f01a5","source":{"kind":"arxiv","id":"2605.12960","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12960","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12960v1","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12960","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"pith_short_12","alias_value":"KD37OJPHLG7E","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"KD37OJPHLG7ECMVM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"KD37OJPH","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:KD37OJPHLG7ECMVMI4CUQI777Z","target":"record","payload":{"canonical_record":{"source":{"id":"2605.12960","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T03:50:54Z","cross_cats_sorted":[],"title_canon_sha256":"1373ba8a1da4cf3771021660241b7d85437ab61e60bb136f615a8630a94fcbb4","abstract_canon_sha256":"7fcbce155f5a653bc4d9cb8f097abf3f8237354f11cd20731ddaa486a6d9045c"},"schema_version":"1.0"},"canonical_sha256":"50f7f725e759be4132ac47054823fffe45fae4028a59fa662ff78cdd107f01a5","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:09.204848Z","signature_b64":"0m9f38x5ck4Q9hdWOvZc/FqzimMcUyy5BIUXIGbkEd5SphPXbqSxXs9zfXiy5ZuJVODIdxSclgqX13+rX7XaDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"50f7f725e759be4132ac47054823fffe45fae4028a59fa662ff78cdd107f01a5","last_reissued_at":"2026-05-18T03:09:09.204085Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:09.204085Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.12960","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:09Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YiGl5NOB1ED5rhq0I7s1lIHEBhsAcc65EQekXzXKGTS5FR1nkDR9JU8EFQ9U/5wdkamD/GTFkyDlBLPgwJlVBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T16:54:25.525955Z"},"content_sha256":"4422bc2064729a1223176478b96e5050deef504fc6714ac34ce4bab8090d9286","schema_version":"1.0","event_id":"sha256:4422bc2064729a1223176478b96e5050deef504fc6714ac34ce4bab8090d9286"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:KD37OJPHLG7ECMVMI4CUQI777Z","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"DiM\\textsuperscript{3}: Bridging Multilingual and Multimodal Models via Direction- and Magnitude-Aware Merging","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Selective merging of direction- and magnitude-aware residual updates injects multilingual capability into multimodal models without training.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Daling Wang, Ercong Nie, Hinrich Sch\\\"utze, Mengjie Zhao, Mingyang Wang, Shi Feng, Xiaocui Yang, Yongkang Liu, Zijing Wang","submitted_at":"2026-05-13T03:50:54Z","abstract_excerpt":"Towards more general and human-like intelligence, large language models should seamlessly integrate both multilingual and multimodal capabilities; however, extending an existing multimodal model to many languages typically requires expensive multilingual multimodal data construction and repeated end-to-end retraining. We study a training-free alternative: injecting multilingual capability into an existing multimodal model by composing residual updates in the shared language model backbone. The key challenge is that multilingual and multimodal updates are heterogeneous, reflecting different fun"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments on multilingual benchmarks in both text-only and vision-language settings, covering 57 languages across LLaVA- and Qwen-based backbones, show that DiM3 consistently outperforms existing merging baselines, substantially improves multilingual performance over the original multimodal model, and remains competitive with dedicated multilingual multimodal fine-tuning while largely retaining general multimodal ability.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that multilingual and multimodal residual updates are heterogeneous in a way that can be selectively composed per parameter dimension using direction and magnitude awareness without unintended interference in the shared backbone.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"DiM3 merges multilingual and multimodal model updates in a direction- and magnitude-aware way to enhance multilingual performance in vision-language models while preserving original multimodal abilities.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Selective merging of direction- and magnitude-aware residual updates injects multilingual capability into multimodal models without training.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"cbdb4a268a855c88063361c87ba78ed90c65cea8d1a1eff33b5223b4ca5d577b"},"source":{"id":"2605.12960","kind":"arxiv","version":1},"verdict":{"id":"0c13a54d-4db7-4fa8-88fa-b743072e50ed","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T20:20:37.605222Z","strongest_claim":"Experiments on multilingual benchmarks in both text-only and vision-language settings, covering 57 languages across LLaVA- and Qwen-based backbones, show that DiM3 consistently outperforms existing merging baselines, substantially improves multilingual performance over the original multimodal model, and remains competitive with dedicated multilingual multimodal fine-tuning while largely retaining general multimodal ability.","one_line_summary":"DiM3 merges multilingual and multimodal model updates in a direction- and magnitude-aware way to enhance multilingual performance in vision-language models while preserving original multimodal abilities.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that multilingual and multimodal residual updates are heterogeneous in a way that can be selectively composed per parameter dimension using direction and magnitude awareness without unintended interference in the shared backbone.","pith_extraction_headline":"Selective merging of direction- and magnitude-aware residual updates injects multilingual capability into multimodal models without training."},"references":{"count":71,"sample":[{"doi":"","year":2025,"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","ref_index":1,"cited_arxiv_id":"2601.03267","is_internal_anchor":true},{"doi":"","year":2023,"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","ref_index":2,"cited_arxiv_id":"2312.11805","is_internal_anchor":true},{"doi":"","year":2025,"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","ref_index":3,"cited_arxiv_id":"2508.18265","is_internal_anchor":true},{"doi":"","year":2025,"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","ref_index":4,"cited_arxiv_id":"2505.09388","is_internal_anchor":true},{"doi":"","year":2022,"title":"xgqa: Cross-lingual visual question answering","work_id":"93a68a65-d177-45cb-9504-73968ce065d6","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":71,"snapshot_sha256":"8cf55dd53e880d80fcb9d8010606f3391f71c8e3b6bcbb91f4ed89f6acfc1866","internal_anchors":10},"formal_canon":{"evidence_count":2,"snapshot_sha256":"66d321191886a75e81fbc8cce96f6159357a43a4c9b375152ade61254dcde285"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"0c13a54d-4db7-4fa8-88fa-b743072e50ed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:09Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"f3iT23Mv1DqlmOzmyx7w2JpJ+8045h/ckirdsBI19BGJONVqbOXt+X3m58TfA3U1isWKKzhskWdiaX+csO4uAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T16:54:25.526954Z"},"content_sha256":"4bd7fcc40ca26de66fd483288f4aea2225ac65a5d6d0ca3bf5f4c25de910091e","schema_version":"1.0","event_id":"sha256:4bd7fcc40ca26de66fd483288f4aea2225ac65a5d6d0ca3bf5f4c25de910091e"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/KD37OJPHLG7ECMVMI4CUQI777Z/bundle.json","state_url":"https://pith.science/pith/KD37OJPHLG7ECMVMI4CUQI777Z/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/KD37OJPHLG7ECMVMI4CUQI777Z/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T16:54:25Z","links":{"resolver":"https://pith.science/pith/KD37OJPHLG7ECMVMI4CUQI777Z","bundle":"https://pith.science/pith/KD37OJPHLG7ECMVMI4CUQI777Z/bundle.json","state":"https://pith.science/pith/KD37OJPHLG7ECMVMI4CUQI777Z/state.json","well_known_bundle":"https://pith.science/.well-known/pith/KD37OJPHLG7ECMVMI4CUQI777Z/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:KD37OJPHLG7ECMVMI4CUQI777Z","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7fcbce155f5a653bc4d9cb8f097abf3f8237354f11cd20731ddaa486a6d9045c","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T03:50:54Z","title_canon_sha256":"1373ba8a1da4cf3771021660241b7d85437ab61e60bb136f615a8630a94fcbb4"},"schema_version":"1.0","source":{"id":"2605.12960","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12960","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12960v1","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12960","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"pith_short_12","alias_value":"KD37OJPHLG7E","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"KD37OJPHLG7ECMVM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"KD37OJPH","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:4bd7fcc40ca26de66fd483288f4aea2225ac65a5d6d0ca3bf5f4c25de910091e","target":"graph","created_at":"2026-05-18T03:09:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments on multilingual benchmarks in both text-only and vision-language settings, covering 57 languages across LLaVA- and Qwen-based backbones, show that DiM3 consistently outperforms existing merging baselines, substantially improves multilingual performance over the original multimodal model, and remains competitive with dedicated multilingual multimodal fine-tuning while largely retaining general multimodal ability."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that multilingual and multimodal residual updates are heterogeneous in a way that can be selectively composed per parameter dimension using direction and magnitude awareness without unintended interference in the shared backbone."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"DiM3 merges multilingual and multimodal model updates in a direction- and magnitude-aware way to enhance multilingual performance in vision-language models while preserving original multimodal abilities."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Selective merging of direction- and magnitude-aware residual updates injects multilingual capability into multimodal models without training."}],"snapshot_sha256":"cbdb4a268a855c88063361c87ba78ed90c65cea8d1a1eff33b5223b4ca5d577b"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"66d321191886a75e81fbc8cce96f6159357a43a4c9b375152ade61254dcde285"},"paper":{"abstract_excerpt":"Towards more general and human-like intelligence, large language models should seamlessly integrate both multilingual and multimodal capabilities; however, extending an existing multimodal model to many languages typically requires expensive multilingual multimodal data construction and repeated end-to-end retraining. We study a training-free alternative: injecting multilingual capability into an existing multimodal model by composing residual updates in the shared language model backbone. The key challenge is that multilingual and multimodal updates are heterogeneous, reflecting different fun","authors_text":"Daling Wang, Ercong Nie, Hinrich Sch\\\"utze, Mengjie Zhao, Mingyang Wang, Shi Feng, Xiaocui Yang, Yongkang Liu, Zijing Wang","cross_cats":[],"headline":"Selective merging of direction- and magnitude-aware residual updates injects multilingual capability into multimodal models without training.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T03:50:54Z","title":"DiM\\textsuperscript{3}: Bridging Multilingual and Multimodal Models via Direction- and Magnitude-Aware Merging"},"references":{"count":71,"internal_anchors":10,"resolved_work":71,"sample":[{"cited_arxiv_id":"2601.03267","doi":"","is_internal_anchor":true,"ref_index":1,"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","year":2025},{"cited_arxiv_id":"2312.11805","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","year":2023},{"cited_arxiv_id":"2508.18265","doi":"","is_internal_anchor":true,"ref_index":3,"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","year":2025},{"cited_arxiv_id":"2505.09388","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"xgqa: Cross-lingual visual question answering","work_id":"93a68a65-d177-45cb-9504-73968ce065d6","year":2022}],"snapshot_sha256":"8cf55dd53e880d80fcb9d8010606f3391f71c8e3b6bcbb91f4ed89f6acfc1866"},"source":{"id":"2605.12960","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T20:20:37.605222Z","id":"0c13a54d-4db7-4fa8-88fa-b743072e50ed","model_set":{"reader":"grok-4.3"},"one_line_summary":"DiM3 merges multilingual and multimodal model updates in a direction- and magnitude-aware way to enhance multilingual performance in vision-language models while preserving original multimodal abilities.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Selective merging of direction- and magnitude-aware residual updates injects multilingual capability into multimodal models without training.","strongest_claim":"Experiments on multilingual benchmarks in both text-only and vision-language settings, covering 57 languages across LLaVA- and Qwen-based backbones, show that DiM3 consistently outperforms existing merging baselines, substantially improves multilingual performance over the original multimodal model, and remains competitive with dedicated multilingual multimodal fine-tuning while largely retaining general multimodal ability.","weakest_assumption":"The assumption that multilingual and multimodal residual updates are heterogeneous in a way that can be selectively composed per parameter dimension using direction and magnitude awareness without unintended interference in the shared backbone."}},"verdict_id":"0c13a54d-4db7-4fa8-88fa-b743072e50ed"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:4422bc2064729a1223176478b96e5050deef504fc6714ac34ce4bab8090d9286","target":"record","created_at":"2026-05-18T03:09:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7fcbce155f5a653bc4d9cb8f097abf3f8237354f11cd20731ddaa486a6d9045c","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T03:50:54Z","title_canon_sha256":"1373ba8a1da4cf3771021660241b7d85437ab61e60bb136f615a8630a94fcbb4"},"schema_version":"1.0","source":{"id":"2605.12960","kind":"arxiv","version":1}},"canonical_sha256":"50f7f725e759be4132ac47054823fffe45fae4028a59fa662ff78cdd107f01a5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"50f7f725e759be4132ac47054823fffe45fae4028a59fa662ff78cdd107f01a5","first_computed_at":"2026-05-18T03:09:09.204085Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:09.204085Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"0m9f38x5ck4Q9hdWOvZc/FqzimMcUyy5BIUXIGbkEd5SphPXbqSxXs9zfXiy5ZuJVODIdxSclgqX13+rX7XaDQ==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:09.204848Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.12960","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:4422bc2064729a1223176478b96e5050deef504fc6714ac34ce4bab8090d9286","sha256:4bd7fcc40ca26de66fd483288f4aea2225ac65a5d6d0ca3bf5f4c25de910091e"],"state_sha256":"3281651f0b8adf943c3d8e9332ee3264dca55aa295c1c25af0018ac73bffd8f9"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"d+kZKW5HQa2jjr0kfgN3fHsqE159ZgdJ8o1b/yIcYLeG4xoStPGo5d34S8xnXtqM6mCKn0IDk0s20vefmBLGDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T16:54:25.531525Z","bundle_sha256":"014209f9c3168c1a9bf5b6ee68d22143aac8baf2e5fca45591a3868f7bf8d569"}}