{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:Q5LCUAXK6I3XX7TV4XPBBFSOYN","short_pith_number":"pith:Q5LCUAXK","schema_version":"1.0","canonical_sha256":"87562a02eaf2377bfe75e5de10964ec342454a85ce49d4ddfa4c20b6969766ba","source":{"kind":"arxiv","id":"2501.01926","version":3},"attestation_state":"computed","paper":{"title":"Cross-Modal Attention Calibration for LVLM Hallucination Mitigation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Guanbin Li, Jiacheng Zhang, Jiaming Li, Lin Ma, Zequn Jie","submitted_at":"2025-01-03T17:56:28Z","abstract_excerpt":"Large vision-language models (LVLMs) have shown remarkable capabilities in visual-language understanding. Despite their success, LVLMs still suffer from generating hallucinations in complex generation tasks, leading to inconsistencies between visual inputs and generated content. To address this issue, some approaches have introduced inference-time interventions, such as contrastive decoding, to reduce overreliance on language priors. However, these approaches overlook hallucinations stemming from position bias and spurious inter-modality correlations. In this paper, we propose a Cross-Modal At"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2501.01926","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-01-03T17:56:28Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"2c0060ba411dd04cb31d416e03f2ab712ea9cb5d59da10d086053312be271c9c","abstract_canon_sha256":"d1dd5ab7c408a6840f19dafa74f2740c6ff0486131904e5d18097982c9328034"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-01T01:02:14.218282Z","signature_b64":"IBjIFzJZRpN4QvVdndQEC0qyK4hhFH35W97XcaUhvZ6gvhbs/ZEexRXT2xW2smYzOH/yiY47I2u0fAy9cT6rDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"87562a02eaf2377bfe75e5de10964ec342454a85ce49d4ddfa4c20b6969766ba","last_reissued_at":"2026-06-01T01:02:14.216824Z","signature_status":"signed_v1","first_computed_at":"2026-06-01T01:02:14.216824Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Cross-Modal Attention Calibration for LVLM Hallucination Mitigation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Guanbin Li, Jiacheng Zhang, Jiaming Li, Lin Ma, Zequn Jie","submitted_at":"2025-01-03T17:56:28Z","abstract_excerpt":"Large vision-language models (LVLMs) have shown remarkable capabilities in visual-language understanding. Despite their success, LVLMs still suffer from generating hallucinations in complex generation tasks, leading to inconsistencies between visual inputs and generated content. To address this issue, some approaches have introduced inference-time interventions, such as contrastive decoding, to reduce overreliance on language priors. However, these approaches overlook hallucinations stemming from position bias and spurious inter-modality correlations. In this paper, we propose a Cross-Modal At"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2501.01926","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2501.01926/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2501.01926","created_at":"2026-06-01T01:02:14.216999+00:00"},{"alias_kind":"arxiv_version","alias_value":"2501.01926v3","created_at":"2026-06-01T01:02:14.216999+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2501.01926","created_at":"2026-06-01T01:02:14.216999+00:00"},{"alias_kind":"pith_short_12","alias_value":"Q5LCUAXK6I3X","created_at":"2026-06-01T01:02:14.216999+00:00"},{"alias_kind":"pith_short_16","alias_value":"Q5LCUAXK6I3XX7TV","created_at":"2026-06-01T01:02:14.216999+00:00"},{"alias_kind":"pith_short_8","alias_value":"Q5LCUAXK","created_at":"2026-06-01T01:02:14.216999+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":5,"internal_anchor_count":5,"sample":[{"citing_arxiv_id":"2605.10676","citing_title":"Not Blind but Silenced: Rebalancing Vision and Language via Adversarial Counter-Commonsense Equilibrium","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04874","citing_title":"Uncertainty-Aware Exploratory Direct Preference Optimization for Multimodal Large Language Models","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2404.18930","citing_title":"Hallucination of Multimodal Large Language Models: A Survey","ref_index":104,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21027","citing_title":"HypEHR: Hyperbolic Modeling of Electronic Health Records for Efficient Question Answering","ref_index":242,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04641","citing_title":"CAST: Mitigating Object Hallucination in Large Vision-Language Models via Caption-Guided Visual Attention Steering","ref_index":67,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN","json":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN.json","graph_json":"https://pith.science/api/pith-number/Q5LCUAXK6I3XX7TV4XPBBFSOYN/graph.json","events_json":"https://pith.science/api/pith-number/Q5LCUAXK6I3XX7TV4XPBBFSOYN/events.json","paper":"https://pith.science/paper/Q5LCUAXK"},"agent_actions":{"view_html":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN","download_json":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN.json","view_paper":"https://pith.science/paper/Q5LCUAXK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2501.01926&json=true","fetch_graph":"https://pith.science/api/pith-number/Q5LCUAXK6I3XX7TV4XPBBFSOYN/graph.json","fetch_events":"https://pith.science/api/pith-number/Q5LCUAXK6I3XX7TV4XPBBFSOYN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN/action/storage_attestation","attest_author":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN/action/author_attestation","sign_citation":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN/action/citation_signature","submit_replication":"https://pith.science/pith/Q5LCUAXK6I3XX7TV4XPBBFSOYN/action/replication_record"}},"created_at":"2026-06-01T01:02:14.216999+00:00","updated_at":"2026-06-01T01:02:14.216999+00:00"}