{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:TIR5IYQ4BT5UU2TYPFXD5LXIS5","short_pith_number":"pith:TIR5IYQ4","schema_version":"1.0","canonical_sha256":"9a23d4621c0cfb4a6a78796e3eaee8974ba6f07fa10719a963535b3d4de74b80","source":{"kind":"arxiv","id":"2509.22258","version":5},"attestation_state":"computed","paper":{"title":"Beyond Classification Accuracy: Neural-MedBench and the Need for Deeper Reasoning Benchmarks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Vision-language models show major reasoning shortfalls on a new compact neurology benchmark despite high scores on standard tests.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Huan Gao, Junling Lin, Mengting Jia, Miao Jing, Mingkun Xu, Shangyang Li, Zhongxia Shen","submitted_at":"2025-09-26T12:20:01Z","abstract_excerpt":"Recent advances in vision-language models (VLMs) have achieved remarkable performance on standard medical benchmarks, yet their true clinical reasoning ability remains unclear. Existing datasets predominantly emphasize classification accuracy, creating an evaluation illusion in which models appear proficient while still failing at high-stakes diagnostic reasoning. We introduce Neural-MedBench, a compact yet reasoning-intensive benchmark specifically designed to probe the limits of multimodal clinical reasoning in neurology. Neural-MedBench integrates multi-sequence MRI scans, structured electr"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2509.22258","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-09-26T12:20:01Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"166b1aa83792ea3889f8f6c7ccba8e1572dc2a28c0c56ea32c0dd9d82e97401f","abstract_canon_sha256":"078d657566ca09838af6eaf4c59464c96e429a6955d5c2e011ef74d1978f4ac1"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:04:58.473992Z","signature_b64":"iVa+rEGcxqcqTayjmcGEnC8sxoNl8eDxdAWo2hHUwo3XKMErr6zKkptT9Hz2DL9AlmtHK+pavsO3DCFRUVRrBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9a23d4621c0cfb4a6a78796e3eaee8974ba6f07fa10719a963535b3d4de74b80","last_reissued_at":"2026-05-20T01:04:58.473052Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:04:58.473052Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Beyond Classification Accuracy: Neural-MedBench and the Need for Deeper Reasoning Benchmarks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Vision-language models show major reasoning shortfalls on a new compact neurology benchmark despite high scores on standard tests.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Huan Gao, Junling Lin, Mengting Jia, Miao Jing, Mingkun Xu, Shangyang Li, Zhongxia Shen","submitted_at":"2025-09-26T12:20:01Z","abstract_excerpt":"Recent advances in vision-language models (VLMs) have achieved remarkable performance on standard medical benchmarks, yet their true clinical reasoning ability remains unclear. Existing datasets predominantly emphasize classification accuracy, creating an evaluation illusion in which models appear proficient while still failing at high-stakes diagnostic reasoning. We introduce Neural-MedBench, a compact yet reasoning-intensive benchmark specifically designed to probe the limits of multimodal clinical reasoning in neurology. Neural-MedBench integrates multi-sequence MRI scans, structured electr"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Through systematic evaluation of state-of-the-art VLMs, including GPT-4o, Claude-4, and MedGemma, we observe a sharp performance drop compared to conventional datasets. Error analysis shows that reasoning failures, rather than perceptual errors, dominate model shortcomings.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The hybrid scoring pipeline (LLM-based graders combined with clinician validation and semantic similarity metrics) provides a reliable and unbiased measure of true clinical reasoning ability rather than introducing grader-specific artifacts or inconsistencies.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Neural-MedBench reveals sharp performance drops in state-of-the-art VLMs on reasoning-intensive neurology tasks compared to conventional classification benchmarks, with reasoning failures dominating errors.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Vision-language models show major reasoning shortfalls on a new compact neurology benchmark despite high scores on standard tests.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"67e563374957722bf7a4fdc8a1296fc6c84eeeaccf271738f29b79ea36acc157"},"source":{"id":"2509.22258","kind":"arxiv","version":5},"verdict":{"id":"21d05470-2c6a-424b-9594-8bb821af225f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-18T13:22:58.669012Z","strongest_claim":"Through systematic evaluation of state-of-the-art VLMs, including GPT-4o, Claude-4, and MedGemma, we observe a sharp performance drop compared to conventional datasets. Error analysis shows that reasoning failures, rather than perceptual errors, dominate model shortcomings.","one_line_summary":"Neural-MedBench reveals sharp performance drops in state-of-the-art VLMs on reasoning-intensive neurology tasks compared to conventional classification benchmarks, with reasoning failures dominating errors.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The hybrid scoring pipeline (LLM-based graders combined with clinician validation and semantic similarity metrics) provides a reliable and unbiased measure of true clinical reasoning ability rather than introducing grader-specific artifacts or inconsistencies.","pith_extraction_headline":"Vision-language models show major reasoning shortfalls on a new compact neurology benchmark despite high scores on standard tests."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.22258/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":42,"sample":[{"doi":"","year":2024,"title":"Anthropic. Claude haiku, 2024. URL https://www.anthropic.com","work_id":"929c77fd-26dd-40ca-a891-7c761867b5f3","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","ref_index":2,"cited_arxiv_id":"2308.12966","is_internal_anchor":true},{"doi":"","year":1901,"title":"The liver tumor segmentation benchmark (lits)","work_id":"7e819396-daa0-43ab-b4b6-ef671170eb1b","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","ref_index":4,"cited_arxiv_id":"2507.06261","is_internal_anchor":true},{"doi":"","year":2025,"title":"Benchmarking generative ai for scoring medical student interviews in objective structured clinical examinations (osces)","work_id":"16604e5a-a3c5-42d8-90a2-b332c65f7924","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":42,"snapshot_sha256":"10cbd7d60fe4cd64328d51af60941b4a693b5643fb59979905bb9eca28981238","internal_anchors":7},"formal_canon":{"evidence_count":2,"snapshot_sha256":"eec1aa5f422cf2283c567b46a4aa30c5fa0f0c75be83af044696c9209b2a283d"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.22258","created_at":"2026-05-20T01:04:58.473178+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.22258v5","created_at":"2026-05-20T01:04:58.473178+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.22258","created_at":"2026-05-20T01:04:58.473178+00:00"},{"alias_kind":"pith_short_12","alias_value":"TIR5IYQ4BT5U","created_at":"2026-05-20T01:04:58.473178+00:00"},{"alias_kind":"pith_short_16","alias_value":"TIR5IYQ4BT5UU2TY","created_at":"2026-05-20T01:04:58.473178+00:00"},{"alias_kind":"pith_short_8","alias_value":"TIR5IYQ4","created_at":"2026-05-20T01:04:58.473178+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5","json":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5.json","graph_json":"https://pith.science/api/pith-number/TIR5IYQ4BT5UU2TYPFXD5LXIS5/graph.json","events_json":"https://pith.science/api/pith-number/TIR5IYQ4BT5UU2TYPFXD5LXIS5/events.json","paper":"https://pith.science/paper/TIR5IYQ4"},"agent_actions":{"view_html":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5","download_json":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5.json","view_paper":"https://pith.science/paper/TIR5IYQ4","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.22258&json=true","fetch_graph":"https://pith.science/api/pith-number/TIR5IYQ4BT5UU2TYPFXD5LXIS5/graph.json","fetch_events":"https://pith.science/api/pith-number/TIR5IYQ4BT5UU2TYPFXD5LXIS5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5/action/storage_attestation","attest_author":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5/action/author_attestation","sign_citation":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5/action/citation_signature","submit_replication":"https://pith.science/pith/TIR5IYQ4BT5UU2TYPFXD5LXIS5/action/replication_record"}},"created_at":"2026-05-20T01:04:58.473178+00:00","updated_at":"2026-05-20T01:04:58.473178+00:00"}