{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:ZAH67EVVBNVQG3I2Y3WJGTMM3T","short_pith_number":"pith:ZAH67EVV","canonical_record":{"source":{"id":"2602.07842","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-08T07:03:27Z","cross_cats_sorted":[],"title_canon_sha256":"61c967da27cc1f80d51e4c6176ca6fab01fab3d7ae66c3193f1b864fafc1a7f2","abstract_canon_sha256":"84e1214712268694e6c33d4355c2019920243136bc809e0c248c758a2c63361d"},"schema_version":"1.0"},"canonical_sha256":"c80fef92b50b6b036d1ac6ec934d8cdcf3c2865a851d4ba84919485408707cc4","source":{"kind":"arxiv","id":"2602.07842","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.07842","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"arxiv_version","alias_value":"2602.07842v2","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.07842","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"pith_short_12","alias_value":"ZAH67EVVBNVQ","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"pith_short_16","alias_value":"ZAH67EVVBNVQG3I2","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"pith_short_8","alias_value":"ZAH67EVV","created_at":"2026-06-03T01:05:48Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:ZAH67EVVBNVQG3I2Y3WJGTMM3T","target":"record","payload":{"canonical_record":{"source":{"id":"2602.07842","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-08T07:03:27Z","cross_cats_sorted":[],"title_canon_sha256":"61c967da27cc1f80d51e4c6176ca6fab01fab3d7ae66c3193f1b864fafc1a7f2","abstract_canon_sha256":"84e1214712268694e6c33d4355c2019920243136bc809e0c248c758a2c63361d"},"schema_version":"1.0"},"canonical_sha256":"c80fef92b50b6b036d1ac6ec934d8cdcf3c2865a851d4ba84919485408707cc4","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-03T01:05:48.757412Z","signature_b64":"uPju/eXrgRjJqbABy3a/xrQmMIBqrybjIz3quBypDx2VY94EqRNaRiS13W+6/LrMVYPm8R+4ln0ciHWcoSh7Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c80fef92b50b6b036d1ac6ec934d8cdcf3c2865a851d4ba84919485408707cc4","last_reissued_at":"2026-06-03T01:05:48.756949Z","signature_status":"signed_v1","first_computed_at":"2026-06-03T01:05:48.756949Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.07842","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-03T01:05:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ZpdalZDCkGjXY+8+ejuykrMTyNuXF+JeQDuPzytyBzIBD34O4cKy/7/wFedSR0pPlPn+c8uNjfTxURbN/yf/DA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T12:55:11.663985Z"},"content_sha256":"783dcf5ecb1aace3424908d6d0d7cc7602b943deb9bf7858733fe9541c402e46","schema_version":"1.0","event_id":"sha256:783dcf5ecb1aace3424908d6d0d7cc7602b943deb9bf7858733fe9541c402e46"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:ZAH67EVVBNVQG3I2Y3WJGTMM3T","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Evaluating and Calibrating LLM Confidence on Questions with Multiple Correct Answers","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Keping Bi, Shiyu Ni, Yuanzi Li, Yuhan Wang, Zhikai Ding, Zihang Zhan","submitted_at":"2026-02-08T07:03:27Z","abstract_excerpt":"Confidence calibration is essential for making large language models (LLMs) reliable, yet existing training-free methods have been primarily studied under single-answer question answering. In this paper, we show that these methods break down in the presence of multiple valid answers, where disagreement among equally correct responses leads to systematic underestimation of confidence. To enable a systematic study of this phenomenon, we introduce MACE, a benchmark of 12,000 factual questions spanning six domains with varying numbers of correct answers. Experiments across 15 representative calibr"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.07842","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.07842/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-03T01:05:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RivN2xFUIdicQEKKJW4dXe3nTTtphd3nFdRG79vVQre/xoCj/VOodtahE8LCBitTRIzQ0HxBVBR56MmnUbpwBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T12:55:11.664393Z"},"content_sha256":"5ad81645ebf97e0be7115c94a506c2a7dc35434a76145281f2a01922a8c642a7","schema_version":"1.0","event_id":"sha256:5ad81645ebf97e0be7115c94a506c2a7dc35434a76145281f2a01922a8c642a7"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/ZAH67EVVBNVQG3I2Y3WJGTMM3T/bundle.json","state_url":"https://pith.science/pith/ZAH67EVVBNVQG3I2Y3WJGTMM3T/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/ZAH67EVVBNVQG3I2Y3WJGTMM3T/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T12:55:11Z","links":{"resolver":"https://pith.science/pith/ZAH67EVVBNVQG3I2Y3WJGTMM3T","bundle":"https://pith.science/pith/ZAH67EVVBNVQG3I2Y3WJGTMM3T/bundle.json","state":"https://pith.science/pith/ZAH67EVVBNVQG3I2Y3WJGTMM3T/state.json","well_known_bundle":"https://pith.science/.well-known/pith/ZAH67EVVBNVQG3I2Y3WJGTMM3T/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:ZAH67EVVBNVQG3I2Y3WJGTMM3T","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"84e1214712268694e6c33d4355c2019920243136bc809e0c248c758a2c63361d","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-08T07:03:27Z","title_canon_sha256":"61c967da27cc1f80d51e4c6176ca6fab01fab3d7ae66c3193f1b864fafc1a7f2"},"schema_version":"1.0","source":{"id":"2602.07842","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.07842","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"arxiv_version","alias_value":"2602.07842v2","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.07842","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"pith_short_12","alias_value":"ZAH67EVVBNVQ","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"pith_short_16","alias_value":"ZAH67EVVBNVQG3I2","created_at":"2026-06-03T01:05:48Z"},{"alias_kind":"pith_short_8","alias_value":"ZAH67EVV","created_at":"2026-06-03T01:05:48Z"}],"graph_snapshots":[{"event_id":"sha256:5ad81645ebf97e0be7115c94a506c2a7dc35434a76145281f2a01922a8c642a7","target":"graph","created_at":"2026-06-03T01:05:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.07842/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Confidence calibration is essential for making large language models (LLMs) reliable, yet existing training-free methods have been primarily studied under single-answer question answering. In this paper, we show that these methods break down in the presence of multiple valid answers, where disagreement among equally correct responses leads to systematic underestimation of confidence. To enable a systematic study of this phenomenon, we introduce MACE, a benchmark of 12,000 factual questions spanning six domains with varying numbers of correct answers. Experiments across 15 representative calibr","authors_text":"Keping Bi, Shiyu Ni, Yuanzi Li, Yuhan Wang, Zhikai Ding, Zihang Zhan","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-08T07:03:27Z","title":"Evaluating and Calibrating LLM Confidence on Questions with Multiple Correct Answers"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.07842","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:783dcf5ecb1aace3424908d6d0d7cc7602b943deb9bf7858733fe9541c402e46","target":"record","created_at":"2026-06-03T01:05:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"84e1214712268694e6c33d4355c2019920243136bc809e0c248c758a2c63361d","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-02-08T07:03:27Z","title_canon_sha256":"61c967da27cc1f80d51e4c6176ca6fab01fab3d7ae66c3193f1b864fafc1a7f2"},"schema_version":"1.0","source":{"id":"2602.07842","kind":"arxiv","version":2}},"canonical_sha256":"c80fef92b50b6b036d1ac6ec934d8cdcf3c2865a851d4ba84919485408707cc4","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c80fef92b50b6b036d1ac6ec934d8cdcf3c2865a851d4ba84919485408707cc4","first_computed_at":"2026-06-03T01:05:48.756949Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-03T01:05:48.756949Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"uPju/eXrgRjJqbABy3a/xrQmMIBqrybjIz3quBypDx2VY94EqRNaRiS13W+6/LrMVYPm8R+4ln0ciHWcoSh7Dw==","signature_status":"signed_v1","signed_at":"2026-06-03T01:05:48.757412Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.07842","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:783dcf5ecb1aace3424908d6d0d7cc7602b943deb9bf7858733fe9541c402e46","sha256:5ad81645ebf97e0be7115c94a506c2a7dc35434a76145281f2a01922a8c642a7"],"state_sha256":"5697220d2e9e4c53bb22a672539df98759e17db0c4e7db04026b133d23f628ce"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"aa6nS9QKeGUxGSyIycRqD/98AJj7vlvOdY1ShIo2z2x2gMJSd5y/l6Pe2pqDvYmhPfaRH6gL0YOvxRmQixFuCg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T12:55:11.666400Z","bundle_sha256":"021fc6415670afbe2b5ec1afa06a974690665a07656baec6c26ffd5070522a5d"}}