{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:VQ7HHZG6MLKLTK5PMT4NCVQZBA","short_pith_number":"pith:VQ7HHZG6","canonical_record":{"source":{"id":"2605.22612","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CY","submitted_at":"2026-05-21T15:27:58Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"e95c8a4650a21fd1fe0229d777be117b4b0854c9b21fd18d3e534bf9e563923e","abstract_canon_sha256":"93a65c9fa74c413acdaaf8990b39e73159b5358340b06d2564528c353ed91e4c"},"schema_version":"1.0"},"canonical_sha256":"ac3e73e4de62d4b9abaf64f8d156190818de6184442d1a61ba3ca691c5b5389a","source":{"kind":"arxiv","id":"2605.22612","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.22612","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"arxiv_version","alias_value":"2605.22612v1","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.22612","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"pith_short_12","alias_value":"VQ7HHZG6MLKL","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"pith_short_16","alias_value":"VQ7HHZG6MLKLTK5P","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"pith_short_8","alias_value":"VQ7HHZG6","created_at":"2026-05-22T01:05:00Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:VQ7HHZG6MLKLTK5PMT4NCVQZBA","target":"record","payload":{"canonical_record":{"source":{"id":"2605.22612","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CY","submitted_at":"2026-05-21T15:27:58Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"e95c8a4650a21fd1fe0229d777be117b4b0854c9b21fd18d3e534bf9e563923e","abstract_canon_sha256":"93a65c9fa74c413acdaaf8990b39e73159b5358340b06d2564528c353ed91e4c"},"schema_version":"1.0"},"canonical_sha256":"ac3e73e4de62d4b9abaf64f8d156190818de6184442d1a61ba3ca691c5b5389a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T01:05:00.305160Z","signature_b64":"K9Vs3oBcGtpf0Xyi5TXxajpPFGkQDZNblNzMCQDqWTom/HRVvfEWP4pSHfU0vGQSmVhOyPFpxw9Q68JmW3uMAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ac3e73e4de62d4b9abaf64f8d156190818de6184442d1a61ba3ca691c5b5389a","last_reissued_at":"2026-05-22T01:05:00.304306Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T01:05:00.304306Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.22612","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T01:05:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Eu9bCkw5yNrHEiYeIYtVYkSfxQ19iFtrbbwHFKtlTgSOCyxrUZgt35awzUBfe1/Jedk8EkL8z+MRUw8YbJH+Dw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T12:01:07.566910Z"},"content_sha256":"0365670b10c3127acffde5971d0e327378f2264301485b29a60cba4aca431481","schema_version":"1.0","event_id":"sha256:0365670b10c3127acffde5971d0e327378f2264301485b29a60cba4aca431481"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:VQ7HHZG6MLKLTK5PMT4NCVQZBA","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Healthcare LLM Benchmarks Are Only as Good as Their Explicit Assumptions","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CY","authors_text":"Bryan Wilder, Fei Fang, Mateo Dulce Rubio, Naveen Raman, Santiago Cortes-Gomez","submitted_at":"2026-05-21T15:27:58Z","abstract_excerpt":"Benchmarks are necessary for healthcare evaluation, but are not sufficient for predicting deployment performance. Our position is that the evaluation--deployment gap arises not because of poorly designed benchmarks, but from implicit assumptions about how users interact with models that cannot be surfaced from benchmarks alone. To make this precise, we propose a classification of assumptions into two categories: task, which can be tested from conversation data alone, and outcome, which requires outcome data and behavioral studies for testing. Critically, outcome assumptions depend on human beh"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.22612","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.22612/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-22T01:05:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yYMKsZfAJfm8h/Q0+9D4VYAYiBKuTDwsKsIdtFi2xQLnZSxWi090MiQBMjy8rlrqa916EeRBjbwC71vi7CnaDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T12:01:07.567806Z"},"content_sha256":"c0648fe651480a3a0621b7cc71f497ce03f9f2b52ff73d42654df71c0eb845cf","schema_version":"1.0","event_id":"sha256:c0648fe651480a3a0621b7cc71f497ce03f9f2b52ff73d42654df71c0eb845cf"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/VQ7HHZG6MLKLTK5PMT4NCVQZBA/bundle.json","state_url":"https://pith.science/pith/VQ7HHZG6MLKLTK5PMT4NCVQZBA/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/VQ7HHZG6MLKLTK5PMT4NCVQZBA/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-23T12:01:07Z","links":{"resolver":"https://pith.science/pith/VQ7HHZG6MLKLTK5PMT4NCVQZBA","bundle":"https://pith.science/pith/VQ7HHZG6MLKLTK5PMT4NCVQZBA/bundle.json","state":"https://pith.science/pith/VQ7HHZG6MLKLTK5PMT4NCVQZBA/state.json","well_known_bundle":"https://pith.science/.well-known/pith/VQ7HHZG6MLKLTK5PMT4NCVQZBA/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:VQ7HHZG6MLKLTK5PMT4NCVQZBA","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"93a65c9fa74c413acdaaf8990b39e73159b5358340b06d2564528c353ed91e4c","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CY","submitted_at":"2026-05-21T15:27:58Z","title_canon_sha256":"e95c8a4650a21fd1fe0229d777be117b4b0854c9b21fd18d3e534bf9e563923e"},"schema_version":"1.0","source":{"id":"2605.22612","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.22612","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"arxiv_version","alias_value":"2605.22612v1","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.22612","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"pith_short_12","alias_value":"VQ7HHZG6MLKL","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"pith_short_16","alias_value":"VQ7HHZG6MLKLTK5P","created_at":"2026-05-22T01:05:00Z"},{"alias_kind":"pith_short_8","alias_value":"VQ7HHZG6","created_at":"2026-05-22T01:05:00Z"}],"graph_snapshots":[{"event_id":"sha256:c0648fe651480a3a0621b7cc71f497ce03f9f2b52ff73d42654df71c0eb845cf","target":"graph","created_at":"2026-05-22T01:05:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.22612/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Benchmarks are necessary for healthcare evaluation, but are not sufficient for predicting deployment performance. Our position is that the evaluation--deployment gap arises not because of poorly designed benchmarks, but from implicit assumptions about how users interact with models that cannot be surfaced from benchmarks alone. To make this precise, we propose a classification of assumptions into two categories: task, which can be tested from conversation data alone, and outcome, which requires outcome data and behavioral studies for testing. Critically, outcome assumptions depend on human beh","authors_text":"Bryan Wilder, Fei Fang, Mateo Dulce Rubio, Naveen Raman, Santiago Cortes-Gomez","cross_cats":["cs.AI","cs.LG"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CY","submitted_at":"2026-05-21T15:27:58Z","title":"Healthcare LLM Benchmarks Are Only as Good as Their Explicit Assumptions"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.22612","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0365670b10c3127acffde5971d0e327378f2264301485b29a60cba4aca431481","target":"record","created_at":"2026-05-22T01:05:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"93a65c9fa74c413acdaaf8990b39e73159b5358340b06d2564528c353ed91e4c","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CY","submitted_at":"2026-05-21T15:27:58Z","title_canon_sha256":"e95c8a4650a21fd1fe0229d777be117b4b0854c9b21fd18d3e534bf9e563923e"},"schema_version":"1.0","source":{"id":"2605.22612","kind":"arxiv","version":1}},"canonical_sha256":"ac3e73e4de62d4b9abaf64f8d156190818de6184442d1a61ba3ca691c5b5389a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"ac3e73e4de62d4b9abaf64f8d156190818de6184442d1a61ba3ca691c5b5389a","first_computed_at":"2026-05-22T01:05:00.304306Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-22T01:05:00.304306Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"K9Vs3oBcGtpf0Xyi5TXxajpPFGkQDZNblNzMCQDqWTom/HRVvfEWP4pSHfU0vGQSmVhOyPFpxw9Q68JmW3uMAw==","signature_status":"signed_v1","signed_at":"2026-05-22T01:05:00.305160Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.22612","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0365670b10c3127acffde5971d0e327378f2264301485b29a60cba4aca431481","sha256:c0648fe651480a3a0621b7cc71f497ce03f9f2b52ff73d42654df71c0eb845cf"],"state_sha256":"420694cd8214ced5c010045dc247df7627b44a8b45ebd09859ac0701af4940ad"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RO0xU7P8gIAOAQx0POSIihKWGyD/DvF+FuKBpjdtZ6CpP8H+5pYM2RbcI81pRp0MnPsPpDKdTL9/ySYzXv7nCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-23T12:01:07.571714Z","bundle_sha256":"01a7ff25c9af6e19e3812b5ac14fe79e997aba82e0e433702d4d2c34ca0053de"}}