{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:WZ34WRPFD24Y2BKDMFRBPVSYWP","short_pith_number":"pith:WZ34WRPF","canonical_record":{"source":{"id":"2412.16855","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-12-22T04:40:24Z","cross_cats_sorted":["cs.IR"],"title_canon_sha256":"ea6af16b54e7eb7912e00c33bd2c62ddb3a35dd38ab597b5206a3f4fbb5d0b62","abstract_canon_sha256":"748847fa281d19d5e0a56770ec05a9bba758fa4e7ad976dc2000ca0e11f1851a"},"schema_version":"1.0"},"canonical_sha256":"b677cb45e51eb98d0543616217d658b3dd0c9b77e6a47833ac9b256373655b97","source":{"kind":"arxiv","id":"2412.16855","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.16855","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2412.16855v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.16855","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"WZ34WRPFD24Y","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WZ34WRPFD24Y2BKD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WZ34WRPF","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:WZ34WRPFD24Y2BKDMFRBPVSYWP","target":"record","payload":{"canonical_record":{"source":{"id":"2412.16855","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-12-22T04:40:24Z","cross_cats_sorted":["cs.IR"],"title_canon_sha256":"ea6af16b54e7eb7912e00c33bd2c62ddb3a35dd38ab597b5206a3f4fbb5d0b62","abstract_canon_sha256":"748847fa281d19d5e0a56770ec05a9bba758fa4e7ad976dc2000ca0e11f1851a"},"schema_version":"1.0"},"canonical_sha256":"b677cb45e51eb98d0543616217d658b3dd0c9b77e6a47833ac9b256373655b97","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.248036Z","signature_b64":"2NlEVssAlVJgCmLNge9ovOAAbZn+SW9l80xSurnVwoWqDjWzJ6dgCDxxOXmhNx6uB8zui0fG+42y2cp3LPT7Bg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b677cb45e51eb98d0543616217d658b3dd0c9b77e6a47833ac9b256373655b97","last_reissued_at":"2026-05-17T23:38:53.247396Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.247396Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2412.16855","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Rl1IDTujkEe7NwYQDYcVzlvTASVIZFg1iPmJQQoqft63GfHVCqlDcGCWMYRKBXObue1MBdO5AnGBEuS7pOSnDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T20:03:24.311578Z"},"content_sha256":"2c14cf3cba210e92260418059876731a84c5c950072cfc5fcd60f75d8d9f12de","schema_version":"1.0","event_id":"sha256:2c14cf3cba210e92260418059876731a84c5c950072cfc5fcd60f75d8d9f12de"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:WZ34WRPFD24Y2BKDMFRBPVSYWP","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"GME: Improving Universal Multimodal Retrieval by Multimodal LLMs","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Training an MLLM on synthetically balanced fused text-image data produces a single dense retriever that leads on universal multimodal search tasks.","cross_cats":["cs.IR"],"primary_cat":"cs.CL","authors_text":"Dingkun Long, Meishan Zhang, Mingxin Li, Min Zhang, Pengjun Xie, Wenjie Li, Wen Xie, Xin Zhang, Yanzhao Zhang, Ziqi Dai","submitted_at":"2024-12-22T04:40:24Z","abstract_excerpt":"Universal Multimodal Retrieval (UMR) aims to enable search across various modalities using a unified model, where queries and candidates can consist of pure text, images, or a combination of both. Previous work has attempted to adopt multimodal large language models (MLLMs) to realize UMR using only text data. However, our preliminary experiments demonstrate that more diverse multimodal training data can further unlock the potential of MLLMs. Despite its effectiveness, the existing multimodal training data is highly imbalanced in terms of modality, which motivates us to develop a training data"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experimental results show that our method achieves state-of-the-art performance among existing UMR methods.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the synthetic fused-modal training dataset is of high quality and sufficiently diverse to unlock the full potential of MLLMs for universal multimodal retrieval without introducing biases or artifacts.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GME achieves state-of-the-art results in universal multimodal retrieval by training on a balanced synthetic multimodal dataset.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Training an MLLM on synthetically balanced fused text-image data produces a single dense retriever that leads on universal multimodal search tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7391d1dcc42778fcf2b5a7552aaa16498aa1df4333a5ae5f014102366d3a3f30"},"source":{"id":"2412.16855","kind":"arxiv","version":2},"verdict":{"id":"70f17f27-5ab2-405f-861c-74df3ec6f8c6","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T06:31:17.683583Z","strongest_claim":"Experimental results show that our method achieves state-of-the-art performance among existing UMR methods.","one_line_summary":"GME achieves state-of-the-art results in universal multimodal retrieval by training on a balanced synthetic multimodal dataset.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the synthetic fused-modal training dataset is of high quality and sufficiently diverse to unlock the full potential of MLLMs for universal multimodal retrieval without introducing biases or artifacts.","pith_extraction_headline":"Training an MLLM on synthetically balanced fused text-image data produces a single dense retriever that leads on universal multimodal search tasks."},"references":{"count":86,"sample":[{"doi":"","year":2020,"title":"Overview of touch ´e 2020: Argument retrieval - extended abstract","work_id":"6f0e6b93-d715-48fb-a68b-eea2dd6f658f","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2016,"title":"A full-text learning to rank dataset for medical information retrieval","work_id":"1ffb6db3-12fa-4a54-a937-d43eb8fe19ae","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Sub- biah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakan- tan, Pranav Shyam, Girish Sastry, Amanda Askell, Sand- hini Agarwal, Ariel Herbert-V oss, ","work_id":"625c70f2-e4ac-4dfb-89e1-1b062618e14d","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Webqa: Multihop and multimodal QA","work_id":"62462bb5-05db-482f-844b-c2ab8a75925d","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2016,"title":"Training Deep Nets with Sublinear Memory Cost","work_id":"f2c5c287-a500-40e4-a136-e7e3172db1d7","ref_index":5,"cited_arxiv_id":"1604.06174","is_internal_anchor":true}],"resolved_work":86,"snapshot_sha256":"545c259cae8b81c6a2a04e5d7160b1de7ea401847645d347a3a47e11a27c1f5f","internal_anchors":9},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e91c7ab870fe8bc20fedae355cc788a12da726078e63b71a75da8ece9325dc3e"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"70f17f27-5ab2-405f-861c-74df3ec6f8c6"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YyIt4XmU6Qr8fPAR47VAXGTxGetdDH2wg4yQiHsYnS6g2YfWxN8yWCH5yAA9UetQOYzDoGH6r4tHuI2kN6BODg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T20:03:24.312588Z"},"content_sha256":"9683c27c5bfab13d1c30384d59ddaf9142b7c2a185a12f629f67002c5e7fa947","schema_version":"1.0","event_id":"sha256:9683c27c5bfab13d1c30384d59ddaf9142b7c2a185a12f629f67002c5e7fa947"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WZ34WRPFD24Y2BKDMFRBPVSYWP/bundle.json","state_url":"https://pith.science/pith/WZ34WRPFD24Y2BKDMFRBPVSYWP/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WZ34WRPFD24Y2BKDMFRBPVSYWP/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T20:03:24Z","links":{"resolver":"https://pith.science/pith/WZ34WRPFD24Y2BKDMFRBPVSYWP","bundle":"https://pith.science/pith/WZ34WRPFD24Y2BKDMFRBPVSYWP/bundle.json","state":"https://pith.science/pith/WZ34WRPFD24Y2BKDMFRBPVSYWP/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WZ34WRPFD24Y2BKDMFRBPVSYWP/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:WZ34WRPFD24Y2BKDMFRBPVSYWP","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"748847fa281d19d5e0a56770ec05a9bba758fa4e7ad976dc2000ca0e11f1851a","cross_cats_sorted":["cs.IR"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-12-22T04:40:24Z","title_canon_sha256":"ea6af16b54e7eb7912e00c33bd2c62ddb3a35dd38ab597b5206a3f4fbb5d0b62"},"schema_version":"1.0","source":{"id":"2412.16855","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2412.16855","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2412.16855v2","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.16855","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"WZ34WRPFD24Y","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WZ34WRPFD24Y2BKD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WZ34WRPF","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:9683c27c5bfab13d1c30384d59ddaf9142b7c2a185a12f629f67002c5e7fa947","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results show that our method achieves state-of-the-art performance among existing UMR methods."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the synthetic fused-modal training dataset is of high quality and sufficiently diverse to unlock the full potential of MLLMs for universal multimodal retrieval without introducing biases or artifacts."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GME achieves state-of-the-art results in universal multimodal retrieval by training on a balanced synthetic multimodal dataset."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Training an MLLM on synthetically balanced fused text-image data produces a single dense retriever that leads on universal multimodal search tasks."}],"snapshot_sha256":"7391d1dcc42778fcf2b5a7552aaa16498aa1df4333a5ae5f014102366d3a3f30"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e91c7ab870fe8bc20fedae355cc788a12da726078e63b71a75da8ece9325dc3e"},"paper":{"abstract_excerpt":"Universal Multimodal Retrieval (UMR) aims to enable search across various modalities using a unified model, where queries and candidates can consist of pure text, images, or a combination of both. Previous work has attempted to adopt multimodal large language models (MLLMs) to realize UMR using only text data. However, our preliminary experiments demonstrate that more diverse multimodal training data can further unlock the potential of MLLMs. Despite its effectiveness, the existing multimodal training data is highly imbalanced in terms of modality, which motivates us to develop a training data","authors_text":"Dingkun Long, Meishan Zhang, Mingxin Li, Min Zhang, Pengjun Xie, Wenjie Li, Wen Xie, Xin Zhang, Yanzhao Zhang, Ziqi Dai","cross_cats":["cs.IR"],"headline":"Training an MLLM on synthetically balanced fused text-image data produces a single dense retriever that leads on universal multimodal search tasks.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-12-22T04:40:24Z","title":"GME: Improving Universal Multimodal Retrieval by Multimodal LLMs"},"references":{"count":86,"internal_anchors":9,"resolved_work":86,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Overview of touch ´e 2020: Argument retrieval - extended abstract","work_id":"6f0e6b93-d715-48fb-a68b-eea2dd6f658f","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"A full-text learning to rank dataset for medical information retrieval","work_id":"1ffb6db3-12fa-4a54-a937-d43eb8fe19ae","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Sub- biah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakan- tan, Pranav Shyam, Girish Sastry, Amanda Askell, Sand- hini Agarwal, Ariel Herbert-V oss, ","work_id":"625c70f2-e4ac-4dfb-89e1-1b062618e14d","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Webqa: Multihop and multimodal QA","work_id":"62462bb5-05db-482f-844b-c2ab8a75925d","year":2022},{"cited_arxiv_id":"1604.06174","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Training Deep Nets with Sublinear Memory Cost","work_id":"f2c5c287-a500-40e4-a136-e7e3172db1d7","year":2016}],"snapshot_sha256":"545c259cae8b81c6a2a04e5d7160b1de7ea401847645d347a3a47e11a27c1f5f"},"source":{"id":"2412.16855","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T06:31:17.683583Z","id":"70f17f27-5ab2-405f-861c-74df3ec6f8c6","model_set":{"reader":"grok-4.3"},"one_line_summary":"GME achieves state-of-the-art results in universal multimodal retrieval by training on a balanced synthetic multimodal dataset.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Training an MLLM on synthetically balanced fused text-image data produces a single dense retriever that leads on universal multimodal search tasks.","strongest_claim":"Experimental results show that our method achieves state-of-the-art performance among existing UMR methods.","weakest_assumption":"That the synthetic fused-modal training dataset is of high quality and sufficiently diverse to unlock the full potential of MLLMs for universal multimodal retrieval without introducing biases or artifacts."}},"verdict_id":"70f17f27-5ab2-405f-861c-74df3ec6f8c6"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2c14cf3cba210e92260418059876731a84c5c950072cfc5fcd60f75d8d9f12de","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"748847fa281d19d5e0a56770ec05a9bba758fa4e7ad976dc2000ca0e11f1851a","cross_cats_sorted":["cs.IR"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-12-22T04:40:24Z","title_canon_sha256":"ea6af16b54e7eb7912e00c33bd2c62ddb3a35dd38ab597b5206a3f4fbb5d0b62"},"schema_version":"1.0","source":{"id":"2412.16855","kind":"arxiv","version":2}},"canonical_sha256":"b677cb45e51eb98d0543616217d658b3dd0c9b77e6a47833ac9b256373655b97","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b677cb45e51eb98d0543616217d658b3dd0c9b77e6a47833ac9b256373655b97","first_computed_at":"2026-05-17T23:38:53.247396Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.247396Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"2NlEVssAlVJgCmLNge9ovOAAbZn+SW9l80xSurnVwoWqDjWzJ6dgCDxxOXmhNx6uB8zui0fG+42y2cp3LPT7Bg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.248036Z","signed_message":"canonical_sha256_bytes"},"source_id":"2412.16855","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2c14cf3cba210e92260418059876731a84c5c950072cfc5fcd60f75d8d9f12de","sha256:9683c27c5bfab13d1c30384d59ddaf9142b7c2a185a12f629f67002c5e7fa947"],"state_sha256":"0e4921923e8cd6c125de71e046178d7e57a010b61af5f9f59f0a48d0bb5251bc"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"l/+KzNBMs9r6glMHAHHvWC4K8HV0ZVBAc9aeStZTdEhfmDGcWPDhAIphM0C2vcnfrwB8JjlHeCl9ov5WphC+AA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T20:03:24.316823Z","bundle_sha256":"bdb16100aa168321d208889b54d698964af6b4b47c5c5e0122c2a6aabd352f44"}}