{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:5UGMR4BFC23VOTT3536VCKLDOX","short_pith_number":"pith:5UGMR4BF","schema_version":"1.0","canonical_sha256":"ed0cc8f02516b7574e7beefd51296375e3a59b4d753fffbd438f7f2847188086","source":{"kind":"arxiv","id":"2303.13375","version":2},"attestation_state":"computed","paper":{"title":"Capabilities of GPT-4 on Medical Challenge Problems","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"GPT-4 exceeds the USMLE passing score by over 20 points without any medical-specific training or prompts.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Dean Carignan, Eric Horvitz, Harsha Nori, Nicholas King, Scott Mayer McKinney","submitted_at":"2023-03-20T16:18:38Z","abstract_excerpt":"Large language models (LLMs) have demonstrated remarkable capabilities in natural language understanding and generation across various domains, including medicine. We present a comprehensive evaluation of GPT-4, a state-of-the-art LLM, on medical competency examinations and benchmark datasets. GPT-4 is a general-purpose model that is not specialized for medical problems through training or engineered to solve clinical tasks. Our analysis covers two sets of official practice materials for the USMLE, a three-step examination program used to assess clinical competency and grant licensure in the U"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2303.13375","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2023-03-20T16:18:38Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"cfc8ec73effe4ce55f50dbb8ad31cd9b988236739f35cad6e46b2fc5cdb20ec1","abstract_canon_sha256":"38b3de569bb0529923d8389fe1f0282ce3628f0db9ade774bc774476b9ac5af4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:52.415977Z","signature_b64":"Q28IDnrmWJmUwDC11W8OwoWl3/USl14keQtaYGFfJ6HjfCRv5Yo4KuJUZMMhgVdXRmLvXjYl9wz9fvUpuIoNCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ed0cc8f02516b7574e7beefd51296375e3a59b4d753fffbd438f7f2847188086","last_reissued_at":"2026-05-17T23:38:52.415467Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:52.415467Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Capabilities of GPT-4 on Medical Challenge Problems","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"GPT-4 exceeds the USMLE passing score by over 20 points without any medical-specific training or prompts.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Dean Carignan, Eric Horvitz, Harsha Nori, Nicholas King, Scott Mayer McKinney","submitted_at":"2023-03-20T16:18:38Z","abstract_excerpt":"Large language models (LLMs) have demonstrated remarkable capabilities in natural language understanding and generation across various domains, including medicine. We present a comprehensive evaluation of GPT-4, a state-of-the-art LLM, on medical competency examinations and benchmark datasets. GPT-4 is a general-purpose model that is not specialized for medical problems through training or engineered to solve clinical tasks. Our analysis covers two sets of official practice materials for the USMLE, a three-step examination program used to assess clinical competency and grant licensure in the U"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"GPT-4, without any specialized prompt crafting, exceeds the passing score on USMLE by over 20 points and outperforms earlier general-purpose models (GPT-3.5) as well as models specifically fine-tuned on medical knowledge (Med-PaLM).","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The official USMLE practice materials used are representative of the actual exam content and difficulty, and the model has not memorized the specific questions during pre-training (probed but not fully detailed in available text).","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GPT-4 exceeds the USMLE passing score by more than 20 points and outperforms both GPT-3.5 and the medically fine-tuned Med-PaLM on the MultiMedQA benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"GPT-4 exceeds the USMLE passing score by over 20 points without any medical-specific training or prompts.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"aa5c4eb7d7207b2a0f322277a88920b29e5aa820fc6a70938f87f5349b2f9c45"},"source":{"id":"2303.13375","kind":"arxiv","version":2},"verdict":{"id":"9cb7a1f6-64b4-4b96-a8d5-8b5435604c7b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T13:39:07.226053Z","strongest_claim":"GPT-4, without any specialized prompt crafting, exceeds the passing score on USMLE by over 20 points and outperforms earlier general-purpose models (GPT-3.5) as well as models specifically fine-tuned on medical knowledge (Med-PaLM).","one_line_summary":"GPT-4 exceeds the USMLE passing score by more than 20 points and outperforms both GPT-3.5 and the medically fine-tuned Med-PaLM on the MultiMedQA benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The official USMLE practice materials used are representative of the actual exam content and difficulty, and the model has not memorized the specific questions during pre-training (probed but not fully detailed in available text).","pith_extraction_headline":"GPT-4 exceeds the USMLE passing score by over 20 points without any medical-specific training or prompts."},"references":{"count":23,"sample":[{"doi":"","year":2019,"title":"Guidelines for human-AI interaction","work_id":"1915e653-ede4-4aa0-8b7b-ff8f5553f406","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1901,"title":"Lan- guage models are few-shot learners","work_id":"75db645b-ae81-481a-bdbc-12b3f493d004","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding","work_id":"ed240a10-5b19-406c-baa5-30803f465785","ref_index":3,"cited_arxiv_id":"1810.04805","is_internal_anchor":true},{"doi":"","year":1951,"title":"Automated identiﬁcation of adults at risk for in-hospital clinical deterioration","work_id":"c8446f66-fd7d-4d21-bee4-df87f15d6612","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Who goes ﬁrst? Inﬂuences of human-ai workﬂow on decision making in clinical imaging","work_id":"22b87454-3f97-4f73-8349-554d6fdc39c6","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":23,"snapshot_sha256":"a642b93a7c1f94352111c4390b17bf3f3284ffcf42f266171218c391874f291a","internal_anchors":8},"formal_canon":{"evidence_count":1,"snapshot_sha256":"25353279e17364f83947f1997e60edd69bc0e9ff5a17263edda515bbdb3b388d"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2303.13375","created_at":"2026-05-17T23:38:52.415539+00:00"},{"alias_kind":"arxiv_version","alias_value":"2303.13375v2","created_at":"2026-05-17T23:38:52.415539+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2303.13375","created_at":"2026-05-17T23:38:52.415539+00:00"},{"alias_kind":"pith_short_12","alias_value":"5UGMR4BFC23V","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"5UGMR4BFC23VOTT3","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"5UGMR4BF","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":40,"internal_anchor_count":40,"sample":[{"citing_arxiv_id":"2306.00890","citing_title":"LLaVA-Med: Training a Large Language-and-Vision Assistant for Biomedicine in One Day","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2305.09617","citing_title":"Towards Expert-Level Medical Question Answering with Large Language Models","ref_index":110,"is_internal_anchor":true},{"citing_arxiv_id":"2401.02458","citing_title":"Data-Centric Foundation Models in Computational Healthcare: A Survey","ref_index":211,"is_internal_anchor":true},{"citing_arxiv_id":"2410.14702","citing_title":"Polymath: A Challenging Multi-modal Mathematical Reasoning Benchmark","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2410.18856","citing_title":"Entry-level guide to the use of large language models for medical research","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2410.21276","citing_title":"GPT-4o System Card","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2502.05740","citing_title":"RECOVER: Designing a Large Language Model-based Remote Patient Monitoring System for Postoperative Gastrointestinal Cancer Care","ref_index":72,"is_internal_anchor":true},{"citing_arxiv_id":"2502.16022","citing_title":"Enhancing LLMs for Identifying and Prioritizing Important Medical Jargons from Electronic Health Record Notes Utilizing Data Augmentation","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22080","citing_title":"JMed48k: A Multi-Profession Japanese Medical Licensing Benchmark for Vision-Language Model Evaluation","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21949","citing_title":"Claim-Selective Certification for High-Risk Medical Retrieval-Augmented Generation","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22047","citing_title":"Active Evidence-Seeking and Diagnostic Reasoning in Large Language Models for Clinical Decision Support","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17694","citing_title":"Do LLM Agents Mirror Socio-Cognitive Effects in Power-Asymmetric Conversations?","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17694","citing_title":"Do LLM Agents Mirror Socio-Cognitive Effects in Power-Asymmetric Conversations?","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19173","citing_title":"Prompting language influences diagnostic reasoning and accuracy of large language models","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2511.05501","citing_title":"Towards Real-World Validity in Generative AI Benchmarks: Understanding and Designing Domain-Centered Evaluations for Journalism Practitioners","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08549","citing_title":"VerifAI: A Verifiable Open-Source Search Engine for Biomedical Question Answering","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2601.13262","citing_title":"CURE-Med: Curriculum-Informed Reinforcement Learning for Multilingual Medical Reasoning","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2602.07529","citing_title":"MedVerse: Efficient and Reliable Medical Reasoning via DAG-Structured Parallel Execution","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2305.10415","citing_title":"PMC-VQA: Visual Instruction Tuning for Medical Visual Question Answering","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15331","citing_title":"How people use Copilot for Health","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2401.18059","citing_title":"RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval","ref_index":102,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08559","citing_title":"Medical Reasoning with Large Language Models: A Survey and MR-Bench","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14543","citing_title":"RxEval: A Prescription-Level Benchmark for Evaluating LLM Medication Recommendation","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09505","citing_title":"EpiGraph: Building Generalists for Evidence-Intensive Epilepsy Reasoning in the Wild","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28325","citing_title":"Building evidence-based knowledge bases from full-text literature for disease-specific biomedical reasoning","ref_index":39,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":1,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX","json":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX.json","graph_json":"https://pith.science/api/pith-number/5UGMR4BFC23VOTT3536VCKLDOX/graph.json","events_json":"https://pith.science/api/pith-number/5UGMR4BFC23VOTT3536VCKLDOX/events.json","paper":"https://pith.science/paper/5UGMR4BF"},"agent_actions":{"view_html":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX","download_json":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX.json","view_paper":"https://pith.science/paper/5UGMR4BF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2303.13375&json=true","fetch_graph":"https://pith.science/api/pith-number/5UGMR4BFC23VOTT3536VCKLDOX/graph.json","fetch_events":"https://pith.science/api/pith-number/5UGMR4BFC23VOTT3536VCKLDOX/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX/action/timestamp_anchor","attest_storage":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX/action/storage_attestation","attest_author":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX/action/author_attestation","sign_citation":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX/action/citation_signature","submit_replication":"https://pith.science/pith/5UGMR4BFC23VOTT3536VCKLDOX/action/replication_record"}},"created_at":"2026-05-17T23:38:52.415539+00:00","updated_at":"2026-05-17T23:38:52.415539+00:00"}