{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:S5L2URC5ZEDCPHWNOA6RLD3LUA","short_pith_number":"pith:S5L2URC5","schema_version":"1.0","canonical_sha256":"9757aa445dc906279ecd703d158f6ba01526152019d022966772f137a68c3dbd","source":{"kind":"arxiv","id":"2406.13621","version":2},"attestation_state":"computed","paper":{"title":"LaMI: Augmenting Large Language Models via Late Multi-Image Fusion","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Guy Yariv, Idan Schwartz, Sagie Benaim, Yossi Adi","submitted_at":"2024-06-19T15:17:10Z","abstract_excerpt":"Commonsense reasoning often requires both textual and visual knowledge, yet Large Language Models (LLMs) trained solely on text lack visual grounding (e.g., \"what color is an emperor penguin's belly?\"). Visual Language Models (VLMs) perform better on visually grounded tasks but face two limitations: (i) often reduced performance on text-only commonsense reasoning compared to text-trained LLMs, and (ii) adapting newly released LLMs to vision input typically requires costly multimodal training. An alternative augments LLMs with test-time visual signals, improving visual commonsense without harmi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2406.13621","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-19T15:17:10Z","cross_cats_sorted":["cs.CV","cs.LG"],"title_canon_sha256":"3323804a7f6c9c26d4273267201917dfc865b4ec8dcdb409912136f4cfa59e39","abstract_canon_sha256":"04930fdf8e8782c55db1e2c1aa8993fb1767e9b84afe016ade8dc829e439e4bb"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T15:04:06.551346Z","signature_b64":"wLwdPeJW/jXG4KB4FbVVI4qIdbLa1Q+Y02YfqclEaeRzTJXS8vH5gKqo2pYgsxcmP7B3AsBKabIjG6MejXTLCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9757aa445dc906279ecd703d158f6ba01526152019d022966772f137a68c3dbd","last_reissued_at":"2026-05-18T15:04:06.548944Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T15:04:06.548944Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LaMI: Augmenting Large Language Models via Late Multi-Image Fusion","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.LG"],"primary_cat":"cs.CL","authors_text":"Guy Yariv, Idan Schwartz, Sagie Benaim, Yossi Adi","submitted_at":"2024-06-19T15:17:10Z","abstract_excerpt":"Commonsense reasoning often requires both textual and visual knowledge, yet Large Language Models (LLMs) trained solely on text lack visual grounding (e.g., \"what color is an emperor penguin's belly?\"). Visual Language Models (VLMs) perform better on visually grounded tasks but face two limitations: (i) often reduced performance on text-only commonsense reasoning compared to text-trained LLMs, and (ii) adapting newly released LLMs to vision input typically requires costly multimodal training. An alternative augments LLMs with test-time visual signals, improving visual commonsense without harmi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2406.13621","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2406.13621","created_at":"2026-05-18T15:04:06.549074+00:00"},{"alias_kind":"arxiv_version","alias_value":"2406.13621v2","created_at":"2026-05-18T15:04:06.549074+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.13621","created_at":"2026-05-18T15:04:06.549074+00:00"},{"alias_kind":"pith_short_12","alias_value":"S5L2URC5ZEDC","created_at":"2026-05-18T15:04:06.549074+00:00"},{"alias_kind":"pith_short_16","alias_value":"S5L2URC5ZEDCPHWN","created_at":"2026-05-18T15:04:06.549074+00:00"},{"alias_kind":"pith_short_8","alias_value":"S5L2URC5","created_at":"2026-05-18T15:04:06.549074+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA","json":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA.json","graph_json":"https://pith.science/api/pith-number/S5L2URC5ZEDCPHWNOA6RLD3LUA/graph.json","events_json":"https://pith.science/api/pith-number/S5L2URC5ZEDCPHWNOA6RLD3LUA/events.json","paper":"https://pith.science/paper/S5L2URC5"},"agent_actions":{"view_html":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA","download_json":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA.json","view_paper":"https://pith.science/paper/S5L2URC5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2406.13621&json=true","fetch_graph":"https://pith.science/api/pith-number/S5L2URC5ZEDCPHWNOA6RLD3LUA/graph.json","fetch_events":"https://pith.science/api/pith-number/S5L2URC5ZEDCPHWNOA6RLD3LUA/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA/action/timestamp_anchor","attest_storage":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA/action/storage_attestation","attest_author":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA/action/author_attestation","sign_citation":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA/action/citation_signature","submit_replication":"https://pith.science/pith/S5L2URC5ZEDCPHWNOA6RLD3LUA/action/replication_record"}},"created_at":"2026-05-18T15:04:06.549074+00:00","updated_at":"2026-05-18T15:04:06.549074+00:00"}