{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:2CC62ETSO5IOGBPTRWZ4EQRIIQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"75c5e52bb5f0a30f0f41eb1817f924d839d8a4359caf019c8aa79d5814b85e4d","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-10-23T04:49:09Z","title_canon_sha256":"209a053a5e9fb6f85c51f383c22f0271269e5bf11b4b94c005535fb95ebeeca0"},"schema_version":"1.0","source":{"id":"2310.14566","kind":"arxiv","version":5}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2310.14566","created_at":"2026-05-17T23:38:45Z"},{"alias_kind":"arxiv_version","alias_value":"2310.14566v5","created_at":"2026-05-17T23:38:45Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.14566","created_at":"2026-05-17T23:38:45Z"},{"alias_kind":"pith_short_12","alias_value":"2CC62ETSO5IO","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"2CC62ETSO5IOGBPT","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"2CC62ETS","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:0007a4025961790111c2c45d4a07724726692f4304a4270dc3584c663526dbc4","target":"graph","created_at":"2026-05-17T23:38:45Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"In our evaluation on HallusionBench, we benchmarked 15 different models, highlighting a 31.42% question-pair accuracy achieved by the state-of-the-art GPT-4V. Notably, all other evaluated models achieve accuracy below 16%."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that human-expert-crafted questions with the novel control-group structure accurately isolate and measure entangled language hallucination and visual illusion without introducing confounding biases or subjective interpretations in scoring."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"HallusionBench shows GPT-4V reaches only 31.42% accuracy on paired questions testing language hallucination and visual illusion in LVLMs, with other models below 16%."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"HallusionBench shows even GPT-4V reaches only 31.42 percent accuracy on paired questions that expose language hallucination and visual illusion in vision-language models."}],"snapshot_sha256":"455c88252001f05f636a22d2716650fa8f0fadc62c9c6be0eed86db7df2cff9c"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"3fa34980d59bf013942a12d97745133ef222683d99a4935b8c7c25f09303061e"},"paper":{"abstract_excerpt":"We introduce HallusionBench, a comprehensive benchmark designed for the evaluation of image-context reasoning. This benchmark presents significant challenges to advanced large visual-language models (LVLMs), such as GPT-4V(Vision), Gemini Pro Vision, Claude 3, and LLaVA-1.5, by emphasizing nuanced understanding and interpretation of visual data. The benchmark comprises 346 images paired with 1129 questions, all meticulously crafted by human experts. We introduce a novel structure for these visual questions designed to establish control groups. This structure enables us to conduct a quantitativ","authors_text":"Dinesh Manocha, Furong Huang, Fuxiao Liu, Lichang Chen, Ruiqi Xian, Tianrui Guan, Tianyi Zhou, Xiaoyu Liu, Xijun Wang, Xiyang Wu, Yaser Yacoob, Zongxia Li","cross_cats":["cs.CL"],"headline":"HallusionBench shows even GPT-4V reaches only 31.42 percent accuracy on paired questions that expose language hallucination and visual illusion in vision-language models.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-10-23T04:49:09Z","title":"HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models"},"references":{"count":63,"internal_anchors":22,"resolved_work":63,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Gpt-4v(ision) system card. 2023. 6, 7","work_id":"88a556c3-5f22-4f71-b403-084ceddb10a2","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"nocaps: novel object captioning at scale","work_id":"cf1f8da2-1488-4dd0-8a72-2412fb7c436d","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Flamingo: a visual language model for few-shot learning","work_id":"31e3af5c-9fec-43d9-b533-5bb70172dd15","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Vqa: Visual question answering","work_id":"3db513bc-ec97-47d1-bc83-6eb38b02a2d9","year":2015},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Anas Awadalla, Irena Gao, Joshua Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bit- ton, Samir Gadre, Jenia Jitsev, Simon Kornblith, Pang Wei Koh, Gabriel Ilharco, Mitchell","work_id":"585147a1-ddd5-4f10-93ea-dae26c9319b1","year":2023}],"snapshot_sha256":"bbbaaf3be139cf261d8684897cdc7b95496f55bbf49eaea24af242076ee8cc1c"},"source":{"id":"2310.14566","kind":"arxiv","version":5},"verdict":{"created_at":"2026-05-17T01:17:20.225173Z","id":"320843f4-2d58-4ce0-9b75-93564f84ba77","model_set":{"reader":"grok-4.3"},"one_line_summary":"HallusionBench shows GPT-4V reaches only 31.42% accuracy on paired questions testing language hallucination and visual illusion in LVLMs, with other models below 16%.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"HallusionBench shows even GPT-4V reaches only 31.42 percent accuracy on paired questions that expose language hallucination and visual illusion in vision-language models.","strongest_claim":"In our evaluation on HallusionBench, we benchmarked 15 different models, highlighting a 31.42% question-pair accuracy achieved by the state-of-the-art GPT-4V. Notably, all other evaluated models achieve accuracy below 16%.","weakest_assumption":"The assumption that human-expert-crafted questions with the novel control-group structure accurately isolate and measure entangled language hallucination and visual illusion without introducing confounding biases or subjective interpretations in scoring."}},"verdict_id":"320843f4-2d58-4ce0-9b75-93564f84ba77"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2353d24843c3ad9a1fcae0752de971cb190e55cef8900629bf009b617e18ab28","target":"record","created_at":"2026-05-17T23:38:45Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"75c5e52bb5f0a30f0f41eb1817f924d839d8a4359caf019c8aa79d5814b85e4d","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-10-23T04:49:09Z","title_canon_sha256":"209a053a5e9fb6f85c51f383c22f0271269e5bf11b4b94c005535fb95ebeeca0"},"schema_version":"1.0","source":{"id":"2310.14566","kind":"arxiv","version":5}},"canonical_sha256":"d085ed12727750e305f38db3c2422844117bbc33a4caee2b605a4ef71814d040","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d085ed12727750e305f38db3c2422844117bbc33a4caee2b605a4ef71814d040","first_computed_at":"2026-05-17T23:38:45.995639Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:45.995639Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Mux6NtEBTf8U7p10xPc2WABdNekgEqwuiK4iYc/9stWFjMGU8YiM4NAZIlMvk6d9gUG4viHje2mu5I7aIWEDBA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:45.996231Z","signed_message":"canonical_sha256_bytes"},"source_id":"2310.14566","source_kind":"arxiv","source_version":5}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2353d24843c3ad9a1fcae0752de971cb190e55cef8900629bf009b617e18ab28","sha256:0007a4025961790111c2c45d4a07724726692f4304a4270dc3584c663526dbc4"],"state_sha256":"8e28e8b082312eaad323f926faa86cc4a5d415b0e39b8146caf4ec8db74a73ce"}