{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:MSRSOI4JDXJVP72R3PSD7M62RZ","merge_version":"pith-open-graph-merge-v1","event_count":3,"valid_event_count":3,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3cc48e668c912245f6e7b02d6a95d4a1fc4f0fac725d40509a68ba3578c63e0b","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T09:37:55Z","title_canon_sha256":"2afcd946870287c87c7ae834d6abf29afef5adb0660475cc7d3e2cc7e4ade6ff"},"schema_version":"1.0","source":{"id":"2605.14621","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14621","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14621v1","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14621","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"pith_short_12","alias_value":"MSRSOI4JDXJV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MSRSOI4JDXJVP72R","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MSRSOI4J","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:bae8aa537eaeba893b6db551d17fedac8a385f1db1dc56c0f34f4cb9ba6dd08a","target":"graph","created_at":"2026-05-17T23:39:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments on POPE, CHAIR, and AMBER with Qwen2.5-VL and LLaVA-v1.5 show that SIRA consistently reduces hallucinations while preserving descriptive coverage and incurring lower overhead than two-pass contrastive decoding."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That masking attention to image-token positions in later transformer layers produces a clean language-prior-dominated reference that preserves prompt interpretation and decoding history without introducing new artifacts."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SIRA mitigates hallucinations in LVLMs by internally contrasting full visual access against a masked late-layer branch that retains shared context but lacks fine-grained visual evidence."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Masking attention to image tokens after a shared prefix in vision-language transformers reduces hallucinations by contrasting against an internal language-prior reference."}],"snapshot_sha256":"c8c34c1b1e93e1a23db3d81f202a934adcefd1ab28f03e29efa997b86771b204"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Large vision-language models (LVLMs) often hallucinate when language priors dominate weak or ambiguous visual evidence. Existing contrastive decoding methods mitigate this problem by comparing predictions from the original image with those from externally perturbed visual inputs, but such references can introduce off-manifold artifacts and require costly extra forward passes. We propose SIRA, a training-free internal contrastive decoding framework that constructs a counterfactual reference inside the same LVLM by exploiting the staged information flow of multimodal transformers. Instead of rem","authors_text":"Junzhe Chen, Lijie Wen, Qiang Ju, Tian Qin, Tianshu Zhang, Yuqing Shi","cross_cats":["cs.AI","cs.CL"],"headline":"Masking attention to image tokens after a shared prefix in vision-language transformers reduces hallucinations by contrasting against an internal language-prior reference.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T09:37:55Z","title":"Do We Really Need External Tools to Mitigate Hallucinations? SIRA: Shared-Prefix Internal Reconstruction of Attribution"},"references":{"count":47,"internal_anchors":3,"resolved_work":47,"sample":[{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Mitigating object hallucinations in large vision-language models with assembly of global and local attention","work_id":"09689576-281e-4bc8-af91-dcc13e390290","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Qwen2.5-vl technical report","work_id":"19dbc933-eff7-47da-894e-0cb819f498b9","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Mask what matters: Mitigating object hallucinations in multimodal large language models with object-aligned visual contrastive decoding","work_id":"6af08026-cd8a-4644-940a-f7f976cf00a9","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Ict: Image-object cross-level trusted intervention for mitigating object hallucination in large vision-language models","work_id":"2ab3b73d-dfcf-4f2b-8a50-035585545415","year":2025}],"snapshot_sha256":"2971c54ac69f2c4cc365b2b10c6f37616d107bbfd2bf69c2cbb518f9c91b9c71"},"source":{"id":"2605.14621","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T05:52:51.707966Z","id":"2a7c2c0c-919a-49d5-a869-6f776729af40","model_set":{"reader":"grok-4.3"},"one_line_summary":"SIRA mitigates hallucinations in LVLMs by internally contrasting full visual access against a masked late-layer branch that retains shared context but lacks fine-grained visual evidence.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Masking attention to image tokens after a shared prefix in vision-language transformers reduces hallucinations by contrasting against an internal language-prior reference.","strongest_claim":"Experiments on POPE, CHAIR, and AMBER with Qwen2.5-VL and LLaVA-v1.5 show that SIRA consistently reduces hallucinations while preserving descriptive coverage and incurring lower overhead than two-pass contrastive decoding.","weakest_assumption":"That masking attention to image-token positions in later transformer layers produces a clean language-prior-dominated reference that preserves prompt interpretation and decoding history without introducing new artifacts."}},"verdict_id":"2a7c2c0c-919a-49d5-a869-6f776729af40"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9d16d3b5d5ed6d3fb822ac8d7bb720e4c49816975f57e23742a1ed4a64b43042","target":"record","created_at":"2026-05-17T23:39:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3cc48e668c912245f6e7b02d6a95d4a1fc4f0fac725d40509a68ba3578c63e0b","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T09:37:55Z","title_canon_sha256":"2afcd946870287c87c7ae834d6abf29afef5adb0660475cc7d3e2cc7e4ade6ff"},"schema_version":"1.0","source":{"id":"2605.14621","kind":"arxiv","version":1}},"canonical_sha256":"64a32723891dd357ff51dbe43fb3da8e43a95a41c714eb62c29d62a74e64b649","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"64a32723891dd357ff51dbe43fb3da8e43a95a41c714eb62c29d62a74e64b649","first_computed_at":"2026-05-17T23:39:04.064870Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:04.064870Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"iDSNBnJDkyEfZ/3R0CHpG8dFawMBZ/YVb7V4fwy2tvIi7brCjiswKYYmgLpeQfYqN31fgbslbgLFUbSOetuxAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:04.065521Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14621","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9d16d3b5d5ed6d3fb822ac8d7bb720e4c49816975f57e23742a1ed4a64b43042","sha256:bae8aa537eaeba893b6db551d17fedac8a385f1db1dc56c0f34f4cb9ba6dd08a","sha256:06a98e7c741d189dec5340c0ca26d17bb4474235224f90fd12d77a5b5f6e5c4f"],"state_sha256":"0dc61a68bdd9d0c09f00458b5b181afedba09bcdfc92c9f19222d19489a9a8be"}