{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:WN5NGGKGWZCNYQJCME6FZ32YDZ","short_pith_number":"pith:WN5NGGKG","schema_version":"1.0","canonical_sha256":"b37ad31946b644dc4122613c5cef581e62ae96a3acbbf656ca39fc598d7a9411","source":{"kind":"arxiv","id":"2305.10415","version":6},"attestation_state":"computed","paper":{"title":"PMC-VQA: Visual Instruction Tuning for Medical Visual Question Answering","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A generative model trained on a 227k-pair medical VQA dataset from literature outperforms prior systems on clinical benchmarks after fine-tuning.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chaoyi Wu, Weidi Xie, Weixiong Lin, Xiaoman Zhang, Yanfeng Wang, Ya Zhang, Ziheng Zhao","submitted_at":"2023-05-17T17:50:16Z","abstract_excerpt":"Medical Visual Question Answering (MedVQA) presents a significant opportunity to enhance diagnostic accuracy and healthcare delivery by leveraging artificial intelligence to interpret and answer questions based on medical images. In this study, we reframe the problem of MedVQA as a generation task that naturally follows the human-machine interaction and propose a generative-based model for medical visual understanding by aligning visual information from a pre-trained vision encoder with a large language model. We establish a scalable pipeline to construct a large-scale medical visual question-"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2305.10415","kind":"arxiv","version":6},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-05-17T17:50:16Z","cross_cats_sorted":[],"title_canon_sha256":"d54473c845024f3af2c48d2eec48eff4f4ba48a71417a05da866311e26db8ccc","abstract_canon_sha256":"814d4467633f689e887d433655429ab04ad7e5dfc245da5ae3c8e4e1774645c4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.837041Z","signature_b64":"fC4oQnDROrN/3WqgOnXZmtRbiq8Np8aknZqVctUeQ3ZsAVGJcLrIUaQdmSdqVhrv+I45jKrcorF5eBrG+S7/BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b37ad31946b644dc4122613c5cef581e62ae96a3acbbf656ca39fc598d7a9411","last_reissued_at":"2026-05-17T23:38:49.836419Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.836419Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"PMC-VQA: Visual Instruction Tuning for Medical Visual Question Answering","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A generative model trained on a 227k-pair medical VQA dataset from literature outperforms prior systems on clinical benchmarks after fine-tuning.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chaoyi Wu, Weidi Xie, Weixiong Lin, Xiaoman Zhang, Yanfeng Wang, Ya Zhang, Ziheng Zhao","submitted_at":"2023-05-17T17:50:16Z","abstract_excerpt":"Medical Visual Question Answering (MedVQA) presents a significant opportunity to enhance diagnostic accuracy and healthcare delivery by leveraging artificial intelligence to interpret and answer questions based on medical images. In this study, we reframe the problem of MedVQA as a generation task that naturally follows the human-machine interaction and propose a generative-based model for medical visual understanding by aligning visual information from a pre-trained vision encoder with a large language model. We establish a scalable pipeline to construct a large-scale medical visual question-"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We train the proposed model on PMC-VQA and then fine-tune it on multiple public benchmarks, e.g., VQA-RAD, SLAKE, and Image-Clef-2019, significantly outperforming existing MedVQA models in generating relevant, accurate free-form answers.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The PMC-VQA dataset constructed from literature sources provides representative coverage of real clinical images and questions without systematic biases from publication practices or selection effects.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"PMC-VQA dataset and MedVInT model achieve better generative performance on medical VQA benchmarks by visual instruction tuning on a newly constructed large-scale dataset.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A generative model trained on a 227k-pair medical VQA dataset from literature outperforms prior systems on clinical benchmarks after fine-tuning.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"68cdf44249ac1c398bdd514dc9fe0028613f23bb9ec9df43bb6d46cd12503672"},"source":{"id":"2305.10415","kind":"arxiv","version":6},"verdict":{"id":"1281377d-2e49-4405-ba60-536e1872865e","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T23:04:29.314082Z","strongest_claim":"We train the proposed model on PMC-VQA and then fine-tune it on multiple public benchmarks, e.g., VQA-RAD, SLAKE, and Image-Clef-2019, significantly outperforming existing MedVQA models in generating relevant, accurate free-form answers.","one_line_summary":"PMC-VQA dataset and MedVInT model achieve better generative performance on medical VQA benchmarks by visual instruction tuning on a newly constructed large-scale dataset.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The PMC-VQA dataset constructed from literature sources provides representative coverage of real clinical images and questions without systematic biases from publication practices or selection effects.","pith_extraction_headline":"A generative model trained on a 227k-pair medical VQA dataset from literature outperforms prior systems on clinical benchmarks after fine-tuning."},"references":{"count":64,"sample":[{"doi":"","year":2022,"title":"Flamingo: a visual language model for few-shot learning","work_id":"40995f15-58e1-4bdb-8885-4ad729de9a28","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Flamingo: a visual language model for few-shot learning","work_id":"597da3a9-ab88-4b28-af45-6a03d98fe19d","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"The medical segmentation decathlon.Nature Communications, 13(1):4128, 2022","work_id":"e1ca1c39-044b-48fe-adb2-e00047529c1f","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Anas Awadalla, Irena Gao, Joshua Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Jenia Jitsev, et al. Openflamingo, 2023","work_id":"8f318ea8-ad3c-4346-90e6-758be1279df6","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Artificial intelligence in healthcare: transforming the practice of medicine.Future healthcare journal, 8(2):e188–e194, 2021","work_id":"6d717124-ffbe-4a1b-a257-0d7c77516cf8","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":64,"snapshot_sha256":"c5af3927d5cd4d741314f5e228d044ef2666dee4c01cae43cbb08c6ee9d88fb8","internal_anchors":9},"formal_canon":{"evidence_count":2,"snapshot_sha256":"9f3a2619215f7d780091b241301df0b6e5756c2f4b1fa911559822a7af013136"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2305.10415","created_at":"2026-05-17T23:38:49.836522+00:00"},{"alias_kind":"arxiv_version","alias_value":"2305.10415v6","created_at":"2026-05-17T23:38:49.836522+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2305.10415","created_at":"2026-05-17T23:38:49.836522+00:00"},{"alias_kind":"pith_short_12","alias_value":"WN5NGGKGWZCN","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"WN5NGGKGWZCNYQJC","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"WN5NGGKG","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":37,"internal_anchor_count":37,"sample":[{"citing_arxiv_id":"2605.22872","citing_title":"MedExpMem: Adapting Experience Memory for Differential Diagnosis","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07919","citing_title":"MedVIGIL: Evaluating Trustworthy Medical VLMs Under Broken Visual Evidence","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22080","citing_title":"JMed48k: A Multi-Profession Japanese Medical Licensing Benchmark for Vision-Language Model Evaluation","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20525","citing_title":"NeuroQA: A Large-Scale Image-Grounded Benchmark for 3D Brain MRI Understanding","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26283","citing_title":"MedSynapse-V: Bridging Visual Perception and Clinical Intuition via Latent Memory Evolution","ref_index":64,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15561","citing_title":"RoiMAM: Region-of-Interest Medical Attention Model for Efficient Vision-Language Understanding","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15736","citing_title":"BiomedAP: A Vision-Informed Dual-Anchor Framework with Gated Cross-Modal Fusion for Robust Medical Vision-Language Adaptation","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18111","citing_title":"How Good LLMs Are at Answering Bangla Medical Visual Questions? Dataset and Benchmarking","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19359","citing_title":"MAM-CLIP: Vision-Language Pretraining on Mammography Atlases for BI-RADS Classification","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15766","citing_title":"BioXArena: Benchmarking LLM Agents on Multi-Modal Biomedical Machine Learning Tasks","ref_index":98,"is_internal_anchor":true},{"citing_arxiv_id":"2510.18117","citing_title":"Online In-Context Distillation for Low-Resource Vision Language Models","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2402.13116","citing_title":"A Survey on Knowledge Distillation of Large Language Models","ref_index":239,"is_internal_anchor":true},{"citing_arxiv_id":"2512.18073","citing_title":"FPBench: A Comprehensive Benchmark of Multimodal Large Language Models for Fingerprint Analysis","ref_index":61,"is_internal_anchor":true},{"citing_arxiv_id":"2512.22278","citing_title":"FETAL-GAUGE: A Benchmark for Assessing Vision-Language Models in Fetal Ultrasound","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2503.07536","citing_title":"LMM-R1: Empowering 3B LMMs with Strong Reasoning Abilities Through Two-Stage Rule-Based RL","ref_index":94,"is_internal_anchor":true},{"citing_arxiv_id":"2503.12937","citing_title":"R1-VL: Learning to Reason with Multimodal Large Language Models via Step-wise Group Relative Policy Optimization","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2602.12286","citing_title":"Mind the Gap No More: Achieving Zero-Gap Multimodal Integration via One Tokenizer","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2306.13549","citing_title":"A Survey on Multimodal Large Language Models","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2602.12705","citing_title":"MedXIAOHE: A Comprehensive Recipe for Building Medical MLLMs","ref_index":73,"is_internal_anchor":true},{"citing_arxiv_id":"2603.06665","citing_title":"Better Eyes, Better Thoughts: Why Vision Chain-of-Thought Fails in Medicine","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2603.24649","citing_title":"MedOpenClaw and MedFlowBench: Auditing Medical Agents in Full-Study Workflows","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03426","citing_title":"Replacing Parameters with Preferences: Federated Alignment of Heterogeneous Vision-Language Models","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26288","citing_title":"CheXthought: A global multimodal dataset of clinical chain-of-thought reasoning and visual attention for chest X-ray interpretation","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26283","citing_title":"MedSynapse-V: Bridging Visual Perception and Clinical Intuition via Latent Memory Evolution","ref_index":64,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09384","citing_title":"LiteMedCoT-VL: Parameter-Efficient Adaptation for Medical Visual Question Answering","ref_index":18,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ","json":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ.json","graph_json":"https://pith.science/api/pith-number/WN5NGGKGWZCNYQJCME6FZ32YDZ/graph.json","events_json":"https://pith.science/api/pith-number/WN5NGGKGWZCNYQJCME6FZ32YDZ/events.json","paper":"https://pith.science/paper/WN5NGGKG"},"agent_actions":{"view_html":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ","download_json":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ.json","view_paper":"https://pith.science/paper/WN5NGGKG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2305.10415&json=true","fetch_graph":"https://pith.science/api/pith-number/WN5NGGKGWZCNYQJCME6FZ32YDZ/graph.json","fetch_events":"https://pith.science/api/pith-number/WN5NGGKGWZCNYQJCME6FZ32YDZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ/action/storage_attestation","attest_author":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ/action/author_attestation","sign_citation":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ/action/citation_signature","submit_replication":"https://pith.science/pith/WN5NGGKGWZCNYQJCME6FZ32YDZ/action/replication_record"}},"created_at":"2026-05-17T23:38:49.836522+00:00","updated_at":"2026-05-17T23:38:49.836522+00:00"}