{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:2PLSOTKMJC5BHO7J2HNOK3U4QI","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"168ff8447d6fe9e085404367c568610dd7762a51ab443173cd64288d00412d73","cross_cats_sorted":["cs.AI","cs.CV"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-03T10:03:02Z","title_canon_sha256":"52448e062150832b7734239c8c9f7801656057475077fe50bdfba514d65e5be9"},"schema_version":"1.0","source":{"id":"2605.12517","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12517","created_at":"2026-05-18T03:10:02Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12517v1","created_at":"2026-05-18T03:10:02Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12517","created_at":"2026-05-18T03:10:02Z"},{"alias_kind":"pith_short_12","alias_value":"2PLSOTKMJC5B","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"2PLSOTKMJC5BHO7J","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"2PLSOTKM","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:0976e09cd182ea033c29d5e4589ed8ad6eeaf42c6cac4e3cbf2e0dcf5954c69b","target":"graph","created_at":"2026-05-18T03:10:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We propose the Latent Imagination Module (LIM), a lightweight cross-attention module that predicts imagined latent embeddings from textual input and feeds them into a frozen VLM backbone without pixel-level image synthesis. Across text-only benchmarks, unseen tasks, and missing-image scenarios, LIM improves accuracy and reduces calibration error."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That cross-attention predictions of latent visual embeddings from text will be sufficiently accurate and compatible to substitute for real visual input inside a frozen VLM without introducing new systematic errors."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A new Latent Imagination Module uses cross-attention to predict latent visual embeddings from text, improving accuracy and calibration of vision-language models on text-only inputs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A lightweight cross-attention module can predict missing visual embeddings from text to restore accuracy and calibration in vision-language models."}],"snapshot_sha256":"8a81ba8799bb41b55c3ea970e75c1546b2905bcaab05e4ed5ecaa8b6e86b6805"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Vision-language models (VLMs) are often deployed on text-only inputs, although they are trained with images. We find that removing the vision modality causes large drops in accuracy and severe miscalibration, and the model does not behave like its original language backbone under text-only prompting. This failure is not explained only by missing semantic information. Even when text descriptions preserve key content, confidence becomes unreliable, while adding a visual signal through generated images partially restores accuracy and calibration. We propose the Latent Imagination Module (LIM), a ","authors_text":"Chaeyun Jang, Juho Lee (Kim Jaechul Graduate School of AI, Jungwon Choi, KAIST), Mingyeong Kim","cross_cats":["cs.AI","cs.CV"],"headline":"A lightweight cross-attention module can predict missing visual embeddings from text to restore accuracy and calibration in vision-language models.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-03T10:03:02Z","title":"Bridging the Missing-Modality Gap: Improving Text-Only Calibration of Vision Language Models"},"references":{"count":13,"internal_anchors":6,"resolved_work":13,"sample":[{"cited_arxiv_id":"1803.05457","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Mm-align: Learning optimal transport- based alignment dynamics for fast and accurate inference on missing modality sequences","work_id":"4bf0f520-a850-406e-ac6f-48cf4893e10d","year":null},{"cited_arxiv_id":"1610.02136","doi":"","is_internal_anchor":true,"ref_index":3,"title":"A Baseline for Detecting Misclassified and Out-of-Distribution Examples in Neural Networks","work_id":"010332fc-8aad-4ab2-a1de-1ddd0a3c4b7f","year":null},{"cited_arxiv_id":"2009.03300","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","year":2009},{"cited_arxiv_id":"2410.21276","doi":"","is_internal_anchor":true,"ref_index":5,"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","year":null}],"snapshot_sha256":"822a2d5d218a133e140878f31eb1721d31b53ffeb4e05d8af714fbd9d1369fb9"},"source":{"id":"2605.12517","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T21:39:26.364228Z","id":"72a622ec-b73a-436a-b6b1-3c537e6dc18c","model_set":{"reader":"grok-4.3"},"one_line_summary":"A new Latent Imagination Module uses cross-attention to predict latent visual embeddings from text, improving accuracy and calibration of vision-language models on text-only inputs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A lightweight cross-attention module can predict missing visual embeddings from text to restore accuracy and calibration in vision-language models.","strongest_claim":"We propose the Latent Imagination Module (LIM), a lightweight cross-attention module that predicts imagined latent embeddings from textual input and feeds them into a frozen VLM backbone without pixel-level image synthesis. Across text-only benchmarks, unseen tasks, and missing-image scenarios, LIM improves accuracy and reduces calibration error.","weakest_assumption":"That cross-attention predictions of latent visual embeddings from text will be sufficiently accurate and compatible to substitute for real visual input inside a frozen VLM without introducing new systematic errors."}},"verdict_id":"72a622ec-b73a-436a-b6b1-3c537e6dc18c"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:de764254b27058d44601943438adae89ee194c1351f15551a2bc5fce6ce8aa05","target":"record","created_at":"2026-05-18T03:10:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"168ff8447d6fe9e085404367c568610dd7762a51ab443173cd64288d00412d73","cross_cats_sorted":["cs.AI","cs.CV"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-04-03T10:03:02Z","title_canon_sha256":"52448e062150832b7734239c8c9f7801656057475077fe50bdfba514d65e5be9"},"schema_version":"1.0","source":{"id":"2605.12517","kind":"arxiv","version":1}},"canonical_sha256":"d3d7274d4c48ba13bbe9d1dae56e9c82223af0ca0765f4b2621990fc5d4383e3","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d3d7274d4c48ba13bbe9d1dae56e9c82223af0ca0765f4b2621990fc5d4383e3","first_computed_at":"2026-05-18T03:10:02.899600Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:10:02.899600Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"LnZ371jlmlkt3j24vOkWrTeor52YE0TC2xJp3FenCyW9tcar8OUl8v8K7Kt2VYwh0gPNglgTNzbNHqGqYlgIDg==","signature_status":"signed_v1","signed_at":"2026-05-18T03:10:02.900173Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.12517","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:de764254b27058d44601943438adae89ee194c1351f15551a2bc5fce6ce8aa05","sha256:0976e09cd182ea033c29d5e4589ed8ad6eeaf42c6cac4e3cbf2e0dcf5954c69b"],"state_sha256":"6ed89e0899b8645ba66ce36d51606603bcc847dbabe259642fd333340342f2ee"}