{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:7FYDCXWRBXKB3MSY7GIA7BGMXW","short_pith_number":"pith:7FYDCXWR","schema_version":"1.0","canonical_sha256":"f970315ed10dd41db258f9900f84ccbd9e290e890486f42fa5823c0dd9a88ed1","source":{"kind":"arxiv","id":"2605.23909","version":1},"attestation_state":"computed","paper":{"title":"Confidence Calibration in Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Daniel BenShushan, Don A. Moore, Jacob Bien, Noam Michael","submitted_at":"2026-04-03T19:43:24Z","abstract_excerpt":"We investigate the calibration of large language models' (LLMs') confidence across diverse tasks. The results of our preregistered study show that the current crop of LLMs are, like people, too sure they are right: confidence exceeds accuracy, on average. Importantly, however, this tendency is moderated by a powerful hard-easy effect, wherein overconfidence is greatest on difficult tests; by contrast, easy tests actually show substantial underconfidence. We develop LifeEval, a test for evaluating model calibration across levels of difficulty."},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.23909","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-04-03T19:43:24Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"11a8a196c175bebad7f29e09565cb09381c9362413b2dfc7edcb60311fe72d3c","abstract_canon_sha256":"191f9eef31d5a54fa43249b501963466ef25a5d8815676fced27ff652aecf541"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T00:02:21.299943Z","signature_b64":"seHVBrPGE60fOwRQKnRxmnT8oE+gIUYpqMjlrFAD/WOYkMCqrsDI23NRFi/Wc8HX8vU/6Cu9vO3qshnE4O9QCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f970315ed10dd41db258f9900f84ccbd9e290e890486f42fa5823c0dd9a88ed1","last_reissued_at":"2026-05-26T00:02:21.298940Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T00:02:21.298940Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Confidence Calibration in Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Daniel BenShushan, Don A. Moore, Jacob Bien, Noam Michael","submitted_at":"2026-04-03T19:43:24Z","abstract_excerpt":"We investigate the calibration of large language models' (LLMs') confidence across diverse tasks. The results of our preregistered study show that the current crop of LLMs are, like people, too sure they are right: confidence exceeds accuracy, on average. Importantly, however, this tendency is moderated by a powerful hard-easy effect, wherein overconfidence is greatest on difficult tests; by contrast, easy tests actually show substantial underconfidence. We develop LifeEval, a test for evaluating model calibration across levels of difficulty."},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.23909","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.23909/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.23909","created_at":"2026-05-26T00:02:21.299083+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.23909v1","created_at":"2026-05-26T00:02:21.299083+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.23909","created_at":"2026-05-26T00:02:21.299083+00:00"},{"alias_kind":"pith_short_12","alias_value":"7FYDCXWRBXKB","created_at":"2026-05-26T00:02:21.299083+00:00"},{"alias_kind":"pith_short_16","alias_value":"7FYDCXWRBXKB3MSY","created_at":"2026-05-26T00:02:21.299083+00:00"},{"alias_kind":"pith_short_8","alias_value":"7FYDCXWR","created_at":"2026-05-26T00:02:21.299083+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW","json":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW.json","graph_json":"https://pith.science/api/pith-number/7FYDCXWRBXKB3MSY7GIA7BGMXW/graph.json","events_json":"https://pith.science/api/pith-number/7FYDCXWRBXKB3MSY7GIA7BGMXW/events.json","paper":"https://pith.science/paper/7FYDCXWR"},"agent_actions":{"view_html":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW","download_json":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW.json","view_paper":"https://pith.science/paper/7FYDCXWR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.23909&json=true","fetch_graph":"https://pith.science/api/pith-number/7FYDCXWRBXKB3MSY7GIA7BGMXW/graph.json","fetch_events":"https://pith.science/api/pith-number/7FYDCXWRBXKB3MSY7GIA7BGMXW/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW/action/storage_attestation","attest_author":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW/action/author_attestation","sign_citation":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW/action/citation_signature","submit_replication":"https://pith.science/pith/7FYDCXWRBXKB3MSY7GIA7BGMXW/action/replication_record"}},"created_at":"2026-05-26T00:02:21.299083+00:00","updated_at":"2026-05-26T00:02:21.299083+00:00"}