{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:BVUHCKJKMLOWLHSWXCDKPEPWOF","short_pith_number":"pith:BVUHCKJK","schema_version":"1.0","canonical_sha256":"0d6871292a62dd659e56b886a791f67147551cf245d913b258ebb8f24da4237a","source":{"kind":"arxiv","id":"2606.10400","version":1},"attestation_state":"computed","paper":{"title":"Do Vision-Language Models See or Guess? Measuring and Reducing Textual-Prior Reliance with a Phrasing-Controlled Benchmark","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.CL","authors_text":"Paras Chopra, Pratham Singla, Shivank Garg, Vihan Singh","submitted_at":"2026-06-09T04:18:38Z","abstract_excerpt":"Vision-language models (VLMs) are increasingly deployed where answers must follow from what is in the image, yet they often answer from textual priors, the question's phrasing together with memorized world knowledge, rather than from the image itself, which inflates benchmark scores and yields confident but ungrounded answers. Existing benchmarks rarely isolate this behavior, since each image is usually paired with a single fixed question. To measure the reliance, we build a 540-image benchmark across six reasoning categories and generate four question variants over the same images, so that ph"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.10400","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-09T04:18:38Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"3aa8c1de11bf745ea60f95a137aec08bce14281c2027bbfc3107ffc9de29dbf8","abstract_canon_sha256":"8d59536c35be6a651ea028daf7a55b64209cf374fe7741c722275c6f7bfebcbc"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-10T01:10:16.067083Z","signature_b64":"eunoM4zERdPwQRRKuUBbm01aSgVjpS9LwDC7HazWXz/YhPriphaS1Ceqq3QPDkaE7g9Tqk8pAY5eI5lVwckdBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0d6871292a62dd659e56b886a791f67147551cf245d913b258ebb8f24da4237a","last_reissued_at":"2026-06-10T01:10:16.066546Z","signature_status":"signed_v1","first_computed_at":"2026-06-10T01:10:16.066546Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Do Vision-Language Models See or Guess? Measuring and Reducing Textual-Prior Reliance with a Phrasing-Controlled Benchmark","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.CL","authors_text":"Paras Chopra, Pratham Singla, Shivank Garg, Vihan Singh","submitted_at":"2026-06-09T04:18:38Z","abstract_excerpt":"Vision-language models (VLMs) are increasingly deployed where answers must follow from what is in the image, yet they often answer from textual priors, the question's phrasing together with memorized world knowledge, rather than from the image itself, which inflates benchmark scores and yields confident but ungrounded answers. Existing benchmarks rarely isolate this behavior, since each image is usually paired with a single fixed question. To measure the reliance, we build a 540-image benchmark across six reasoning categories and generate four question variants over the same images, so that ph"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.10400","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.10400/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.10400","created_at":"2026-06-10T01:10:16.066598+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.10400v1","created_at":"2026-06-10T01:10:16.066598+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.10400","created_at":"2026-06-10T01:10:16.066598+00:00"},{"alias_kind":"pith_short_12","alias_value":"BVUHCKJKMLOW","created_at":"2026-06-10T01:10:16.066598+00:00"},{"alias_kind":"pith_short_16","alias_value":"BVUHCKJKMLOWLHSW","created_at":"2026-06-10T01:10:16.066598+00:00"},{"alias_kind":"pith_short_8","alias_value":"BVUHCKJK","created_at":"2026-06-10T01:10:16.066598+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF","json":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF.json","graph_json":"https://pith.science/api/pith-number/BVUHCKJKMLOWLHSWXCDKPEPWOF/graph.json","events_json":"https://pith.science/api/pith-number/BVUHCKJKMLOWLHSWXCDKPEPWOF/events.json","paper":"https://pith.science/paper/BVUHCKJK"},"agent_actions":{"view_html":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF","download_json":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF.json","view_paper":"https://pith.science/paper/BVUHCKJK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.10400&json=true","fetch_graph":"https://pith.science/api/pith-number/BVUHCKJKMLOWLHSWXCDKPEPWOF/graph.json","fetch_events":"https://pith.science/api/pith-number/BVUHCKJKMLOWLHSWXCDKPEPWOF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF/action/storage_attestation","attest_author":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF/action/author_attestation","sign_citation":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF/action/citation_signature","submit_replication":"https://pith.science/pith/BVUHCKJKMLOWLHSWXCDKPEPWOF/action/replication_record"}},"created_at":"2026-06-10T01:10:16.066598+00:00","updated_at":"2026-06-10T01:10:16.066598+00:00"}