{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PRFNZFHJKF2VYAVCLABITLN2DE","short_pith_number":"pith:PRFNZFHJ","schema_version":"1.0","canonical_sha256":"7c4adc94e951755c02a2580289adba191a7d945bd18269a45863b74bd15beec9","source":{"kind":"arxiv","id":"2606.12250","version":1},"attestation_state":"computed","paper":{"title":"Reassessing High-Performing LLMs on Polish Medical Exams: True Competence or Bias-Driven Performance?","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Antoni Lasik, Gabriela Korza\\'nska, Jakub Pokrywka, Jakub Tomasz D\\k{a}browski, Janusz \\'Swieczkowski-Feiz, Jeremi Ignacy Kaczmarek, {\\L}ukasz Grzybowski, Oskar Pastuszek, Paulina Hoffman, Wojciech Kusa","submitted_at":"2026-06-10T15:52:24Z","abstract_excerpt":"Large language models (LLMs) in medicine are mainly evaluated using multiple-choice question answering (MCQA), which can overestimate real clinical ability due to guessing strategies and answer biases. To address these limitations, we introduce an expanded and more challenging benchmark based on Polish medical exams, adding over 15,000 questions, two new domains, and four structural modifications that reduce MCQA-specific artifacts and better test reasoning. We evaluate 21 LLMs and show that evaluation design strongly affects results. Under our harder setup, the best model (Qwen3.5-122B) drops"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.12250","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-10T15:52:24Z","cross_cats_sorted":[],"title_canon_sha256":"efc5f2cb2c60a9e4de55803182a25cdc1a5cb9d0a95da881a3f1a63076e27794","abstract_canon_sha256":"736456898540a691b08eb7d88c1d5294d83a8c9749260d1c17437af70d84a986"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-11T01:10:56.884612Z","signature_b64":"hzLn+MjZz56yRmQlG2UrnVhbgbMf6LNOX1qkIej8h+31j3IFC7TEwCK+xprfDjDkRsRYjL7I81Mm/HKXe9W7CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7c4adc94e951755c02a2580289adba191a7d945bd18269a45863b74bd15beec9","last_reissued_at":"2026-06-11T01:10:56.883747Z","signature_status":"signed_v1","first_computed_at":"2026-06-11T01:10:56.883747Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Reassessing High-Performing LLMs on Polish Medical Exams: True Competence or Bias-Driven Performance?","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Antoni Lasik, Gabriela Korza\\'nska, Jakub Pokrywka, Jakub Tomasz D\\k{a}browski, Janusz \\'Swieczkowski-Feiz, Jeremi Ignacy Kaczmarek, {\\L}ukasz Grzybowski, Oskar Pastuszek, Paulina Hoffman, Wojciech Kusa","submitted_at":"2026-06-10T15:52:24Z","abstract_excerpt":"Large language models (LLMs) in medicine are mainly evaluated using multiple-choice question answering (MCQA), which can overestimate real clinical ability due to guessing strategies and answer biases. To address these limitations, we introduce an expanded and more challenging benchmark based on Polish medical exams, adding over 15,000 questions, two new domains, and four structural modifications that reduce MCQA-specific artifacts and better test reasoning. We evaluate 21 LLMs and show that evaluation design strongly affects results. Under our harder setup, the best model (Qwen3.5-122B) drops"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.12250","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.12250/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.12250","created_at":"2026-06-11T01:10:56.883892+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.12250v1","created_at":"2026-06-11T01:10:56.883892+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.12250","created_at":"2026-06-11T01:10:56.883892+00:00"},{"alias_kind":"pith_short_12","alias_value":"PRFNZFHJKF2V","created_at":"2026-06-11T01:10:56.883892+00:00"},{"alias_kind":"pith_short_16","alias_value":"PRFNZFHJKF2VYAVC","created_at":"2026-06-11T01:10:56.883892+00:00"},{"alias_kind":"pith_short_8","alias_value":"PRFNZFHJ","created_at":"2026-06-11T01:10:56.883892+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE","json":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE.json","graph_json":"https://pith.science/api/pith-number/PRFNZFHJKF2VYAVCLABITLN2DE/graph.json","events_json":"https://pith.science/api/pith-number/PRFNZFHJKF2VYAVCLABITLN2DE/events.json","paper":"https://pith.science/paper/PRFNZFHJ"},"agent_actions":{"view_html":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE","download_json":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE.json","view_paper":"https://pith.science/paper/PRFNZFHJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.12250&json=true","fetch_graph":"https://pith.science/api/pith-number/PRFNZFHJKF2VYAVCLABITLN2DE/graph.json","fetch_events":"https://pith.science/api/pith-number/PRFNZFHJKF2VYAVCLABITLN2DE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE/action/storage_attestation","attest_author":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE/action/author_attestation","sign_citation":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE/action/citation_signature","submit_replication":"https://pith.science/pith/PRFNZFHJKF2VYAVCLABITLN2DE/action/replication_record"}},"created_at":"2026-06-11T01:10:56.883892+00:00","updated_at":"2026-06-11T01:10:56.883892+00:00"}