{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:H2ZNH2MVF5EULZMDPIEV6I7DEZ","short_pith_number":"pith:H2ZNH2MV","canonical_record":{"source":{"id":"2604.20665","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-22T15:15:32Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"f3cf5d5fc844f5b1f4922f230e263fa382523ebddaa753ffad22d1717572d702","abstract_canon_sha256":"67b23d5c0fdb88816e23c3f95679819c77e6286bde38d88f32b15801dd2cdd11"},"schema_version":"1.0"},"canonical_sha256":"3eb2d3e9952f4945e5837a095f23e32677716bfe0b25edca762fe6cf458376fa","source":{"kind":"arxiv","id":"2604.20665","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.20665","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"arxiv_version","alias_value":"2604.20665v2","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.20665","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"pith_short_12","alias_value":"H2ZNH2MVF5EU","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"pith_short_16","alias_value":"H2ZNH2MVF5EULZMD","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"pith_short_8","alias_value":"H2ZNH2MV","created_at":"2026-05-22T01:04:02Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:H2ZNH2MVF5EULZMDPIEV6I7DEZ","target":"record","payload":{"canonical_record":{"source":{"id":"2604.20665","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-22T15:15:32Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"f3cf5d5fc844f5b1f4922f230e263fa382523ebddaa753ffad22d1717572d702","abstract_canon_sha256":"67b23d5c0fdb88816e23c3f95679819c77e6286bde38d88f32b15801dd2cdd11"},"schema_version":"1.0"},"canonical_sha256":"3eb2d3e9952f4945e5837a095f23e32677716bfe0b25edca762fe6cf458376fa","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T01:04:02.885901Z","signature_b64":"y1DT9CGbo/fSlaeK1/6nu3l6azLOPYHSA9Qc4uEuVHcN+IpX719cn1TsdiHjq0gdArrdiAXboDTpbvbRXeOUDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3eb2d3e9952f4945e5837a095f23e32677716bfe0b25edca762fe6cf458376fa","last_reissued_at":"2026-05-22T01:04:02.885254Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T01:04:02.885254Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2604.20665","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T01:04:02Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"6HjKV0LDfGBCI2fHITo45/SHS1e84CUq+IPSav+oo3Rn2mj/BdjwUDH65bLLKX+ESRh24AVuH+OSNQBoWzD6CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-22T13:57:56.846912Z"},"content_sha256":"3b8ca99fb1e350817817d31733374766fcc51621ee9ca0704bc2ca62cf8105ef","schema_version":"1.0","event_id":"sha256:3b8ca99fb1e350817817d31733374766fcc51621ee9ca0704bc2ca62cf8105ef"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:H2ZNH2MVF5EULZMDPIEV6I7DEZ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"The Expense of Seeing: Attaining Trustworthy Multimodal Reasoning Within the Monolithic Paradigm","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"Vision-language models bypass visual input using language priors, with the penalty increasing as language models scale.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Dikshant Kukreja, Karan Goyal","submitted_at":"2026-04-22T15:15:32Z","abstract_excerpt":"The rapid proliferation of Vision-Language Models (VLMs) is often framed as enabling unified multimodal knowledge discovery but rests on an under-examined assumption: that current VLMs faithfully synthesise multimodal data. We argue they often do not, and this gap reflects a trustworthiness problem in the dominant Vision Encoder-Projector-LLM paradigm. Rather than extracting grounded knowledge from visual inputs, state-of-the-art models frequently exhibit functional blindness, i.e., exploiting strong language priors to bypass severe visual representation bottlenecks. In this work, we challenge"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"state-of-the-art models frequently exhibit functional blindness, i.e., exploiting strong language priors to bypass severe visual representation bottlenecks... hypothesising that as the underlying language engines scale to unprecedented reasoning capabilities, the mathematical penalty of the visual knowledge bottleneck paradoxically increases.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the Modality Translation Protocol can isolate architectural incapacity from dataset biases without introducing its own translation artifacts or new priors, and that the proposed metrics validly quantify the visual bottleneck.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Vision-language models exhibit functional blindness by exploiting language priors over visual representations; the Modality Translation Protocol and metrics like Toll, Curse, and Fallacy of Seeing reveal this, supporting a Divergence Law where larger language models increase the visual penalty.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Vision-language models bypass visual input using language priors, with the penalty increasing as language models scale.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"88a8e11ffc369c7a2a835878dd3e66cb4adcd180242e39fcda04ea6751cae907"},"source":{"id":"2604.20665","kind":"arxiv","version":2},"verdict":{"id":"635a7ee5-ae08-43b5-a840-b4517fe25b99","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-09T23:52:26.097063Z","strongest_claim":"state-of-the-art models frequently exhibit functional blindness, i.e., exploiting strong language priors to bypass severe visual representation bottlenecks... hypothesising that as the underlying language engines scale to unprecedented reasoning capabilities, the mathematical penalty of the visual knowledge bottleneck paradoxically increases.","one_line_summary":"Vision-language models exhibit functional blindness by exploiting language priors over visual representations; the Modality Translation Protocol and metrics like Toll, Curse, and Fallacy of Seeing reveal this, supporting a Divergence Law where larger language models increase the visual penalty.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the Modality Translation Protocol can isolate architectural incapacity from dataset biases without introducing its own translation artifacts or new priors, and that the proposed metrics validly quantify the visual bottleneck.","pith_extraction_headline":"Vision-language models bypass visual input using language priors, with the penalty increasing as language models scale."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.20665/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-21T14:35:07.940463Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-20T01:40:52.127381Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"8d265ef19452c01c618c4a91bb2d9985b4d0b11c21ee7af2f68a722e30b10580"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"635a7ee5-ae08-43b5-a840-b4517fe25b99"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T01:04:02Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"LkOS+CwE7IGhBx0ExYYnKvJys41VoPd9BYctTkki3q6BsiI7c1Lfpx8iPmVWXzWaPbBWXn1egXMGIsIzVL+YAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-22T13:57:56.847537Z"},"content_sha256":"4fb77e00776200cf1886859c64fa0c54abcb6625a2d0d1d243d241835d057664","schema_version":"1.0","event_id":"sha256:4fb77e00776200cf1886859c64fa0c54abcb6625a2d0d1d243d241835d057664"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/H2ZNH2MVF5EULZMDPIEV6I7DEZ/bundle.json","state_url":"https://pith.science/pith/H2ZNH2MVF5EULZMDPIEV6I7DEZ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/H2ZNH2MVF5EULZMDPIEV6I7DEZ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-22T13:57:56Z","links":{"resolver":"https://pith.science/pith/H2ZNH2MVF5EULZMDPIEV6I7DEZ","bundle":"https://pith.science/pith/H2ZNH2MVF5EULZMDPIEV6I7DEZ/bundle.json","state":"https://pith.science/pith/H2ZNH2MVF5EULZMDPIEV6I7DEZ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/H2ZNH2MVF5EULZMDPIEV6I7DEZ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:H2ZNH2MVF5EULZMDPIEV6I7DEZ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"67b23d5c0fdb88816e23c3f95679819c77e6286bde38d88f32b15801dd2cdd11","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-22T15:15:32Z","title_canon_sha256":"f3cf5d5fc844f5b1f4922f230e263fa382523ebddaa753ffad22d1717572d702"},"schema_version":"1.0","source":{"id":"2604.20665","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.20665","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"arxiv_version","alias_value":"2604.20665v2","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.20665","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"pith_short_12","alias_value":"H2ZNH2MVF5EU","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"pith_short_16","alias_value":"H2ZNH2MVF5EULZMD","created_at":"2026-05-22T01:04:02Z"},{"alias_kind":"pith_short_8","alias_value":"H2ZNH2MV","created_at":"2026-05-22T01:04:02Z"}],"graph_snapshots":[{"event_id":"sha256:4fb77e00776200cf1886859c64fa0c54abcb6625a2d0d1d243d241835d057664","target":"graph","created_at":"2026-05-22T01:04:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"state-of-the-art models frequently exhibit functional blindness, i.e., exploiting strong language priors to bypass severe visual representation bottlenecks... hypothesising that as the underlying language engines scale to unprecedented reasoning capabilities, the mathematical penalty of the visual knowledge bottleneck paradoxically increases."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the Modality Translation Protocol can isolate architectural incapacity from dataset biases without introducing its own translation artifacts or new priors, and that the proposed metrics validly quantify the visual bottleneck."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Vision-language models exhibit functional blindness by exploiting language priors over visual representations; the Modality Translation Protocol and metrics like Toll, Curse, and Fallacy of Seeing reveal this, supporting a Divergence Law where larger language models increase the visual penalty."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Vision-language models bypass visual input using language priors, with the penalty increasing as language models scale."}],"snapshot_sha256":"88a8e11ffc369c7a2a835878dd3e66cb4adcd180242e39fcda04ea6751cae907"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-21T14:35:07.940463Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-20T01:40:52.127381Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2604.20665/integrity.json","findings":[],"snapshot_sha256":"8d265ef19452c01c618c4a91bb2d9985b4d0b11c21ee7af2f68a722e30b10580","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"The rapid proliferation of Vision-Language Models (VLMs) is often framed as enabling unified multimodal knowledge discovery but rests on an under-examined assumption: that current VLMs faithfully synthesise multimodal data. We argue they often do not, and this gap reflects a trustworthiness problem in the dominant Vision Encoder-Projector-LLM paradigm. Rather than extracting grounded knowledge from visual inputs, state-of-the-art models frequently exhibit functional blindness, i.e., exploiting strong language priors to bypass severe visual representation bottlenecks. In this work, we challenge","authors_text":"Dikshant Kukreja, Karan Goyal","cross_cats":["cs.AI"],"headline":"Vision-language models bypass visual input using language priors, with the penalty increasing as language models scale.","license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-22T15:15:32Z","title":"The Expense of Seeing: Attaining Trustworthy Multimodal Reasoning Within the Monolithic Paradigm"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.20665","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-09T23:52:26.097063Z","id":"635a7ee5-ae08-43b5-a840-b4517fe25b99","model_set":{"reader":"grok-4.3"},"one_line_summary":"Vision-language models exhibit functional blindness by exploiting language priors over visual representations; the Modality Translation Protocol and metrics like Toll, Curse, and Fallacy of Seeing reveal this, supporting a Divergence Law where larger language models increase the visual penalty.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Vision-language models bypass visual input using language priors, with the penalty increasing as language models scale.","strongest_claim":"state-of-the-art models frequently exhibit functional blindness, i.e., exploiting strong language priors to bypass severe visual representation bottlenecks... hypothesising that as the underlying language engines scale to unprecedented reasoning capabilities, the mathematical penalty of the visual knowledge bottleneck paradoxically increases.","weakest_assumption":"That the Modality Translation Protocol can isolate architectural incapacity from dataset biases without introducing its own translation artifacts or new priors, and that the proposed metrics validly quantify the visual bottleneck."}},"verdict_id":"635a7ee5-ae08-43b5-a840-b4517fe25b99"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3b8ca99fb1e350817817d31733374766fcc51621ee9ca0704bc2ca62cf8105ef","target":"record","created_at":"2026-05-22T01:04:02Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"67b23d5c0fdb88816e23c3f95679819c77e6286bde38d88f32b15801dd2cdd11","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-22T15:15:32Z","title_canon_sha256":"f3cf5d5fc844f5b1f4922f230e263fa382523ebddaa753ffad22d1717572d702"},"schema_version":"1.0","source":{"id":"2604.20665","kind":"arxiv","version":2}},"canonical_sha256":"3eb2d3e9952f4945e5837a095f23e32677716bfe0b25edca762fe6cf458376fa","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3eb2d3e9952f4945e5837a095f23e32677716bfe0b25edca762fe6cf458376fa","first_computed_at":"2026-05-22T01:04:02.885254Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-22T01:04:02.885254Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"y1DT9CGbo/fSlaeK1/6nu3l6azLOPYHSA9Qc4uEuVHcN+IpX719cn1TsdiHjq0gdArrdiAXboDTpbvbRXeOUDQ==","signature_status":"signed_v1","signed_at":"2026-05-22T01:04:02.885901Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.20665","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3b8ca99fb1e350817817d31733374766fcc51621ee9ca0704bc2ca62cf8105ef","sha256:4fb77e00776200cf1886859c64fa0c54abcb6625a2d0d1d243d241835d057664"],"state_sha256":"0908364742741df8fee1af13d9d35104abe89e41b458c0a44fcef033387235b2"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SHDyo4MQ/6TPrBXxXwiGF8T5+eXMTXWv8QYN7Ehg9LqCp8xnhPkYG5yvvj0gIYmJAjsGwb1k4FSGtSsBJpg+CQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-22T13:57:56.851274Z","bundle_sha256":"d23909db96b5628e3ae7ee0bbdda369a22e837ab2baaf0fda787b08d89a8f493"}}