{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:XRC2M3DMBNRNQLHUOCM66SZSDE","short_pith_number":"pith:XRC2M3DM","schema_version":"1.0","canonical_sha256":"bc45a66c6c0b62d82cf47099ef4b32191daee357b1472fb40a589f5b8a32c930","source":{"kind":"arxiv","id":"2507.16806","version":2},"attestation_state":"computed","paper":{"title":"Beyond Binary Rewards: Training LMs to Reason About Their Uncertainty","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Idan Shenfeld, Isha Puri, Jacob Andreas, Leshem Choshen, Mehul Damani, Stewart Slocum, Yoon Kim","submitted_at":"2025-07-22T17:56:01Z","abstract_excerpt":"When language models (LMs) are trained via reinforcement learning (RL) to generate natural language \"reasoning chains\", their performance improves on a variety of difficult question answering tasks. Today, almost all successful applications of RL for reasoning use binary reward functions that evaluate the correctness of LM outputs. Because such reward functions do not penalize guessing or low-confidence outputs, they often have the unintended side-effect of degrading calibration and increasing the rate at which LMs generate incorrect responses (or \"hallucinate\") in other problem domains. This "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2507.16806","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-07-22T17:56:01Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"68d141714159478bdb20aabdcdfe4f788198b8c7659ae00f2a48e36f3ca05749","abstract_canon_sha256":"7858d66cac17213bf2fb66ea392a6b71b0f0b00e1f4b2a19d7548d5f2281bc28"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:22.345593Z","signature_b64":"QDHO3gSOzBSSL80lLZeg1bSZ+ANYmgIfQTrTKoGcAbxIHDSPFScv6meqRo0gKxNtGtpAoPzio38mp7InNJgMBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bc45a66c6c0b62d82cf47099ef4b32191daee357b1472fb40a589f5b8a32c930","last_reissued_at":"2026-05-20T00:00:22.344833Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:22.344833Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Beyond Binary Rewards: Training LMs to Reason About Their Uncertainty","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Idan Shenfeld, Isha Puri, Jacob Andreas, Leshem Choshen, Mehul Damani, Stewart Slocum, Yoon Kim","submitted_at":"2025-07-22T17:56:01Z","abstract_excerpt":"When language models (LMs) are trained via reinforcement learning (RL) to generate natural language \"reasoning chains\", their performance improves on a variety of difficult question answering tasks. Today, almost all successful applications of RL for reasoning use binary reward functions that evaluate the correctness of LM outputs. Because such reward functions do not penalize guessing or low-confidence outputs, they often have the unintended side-effect of degrading calibration and increasing the rate at which LMs generate incorrect responses (or \"hallucinate\") in other problem domains. This "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2507.16806","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2507.16806/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2507.16806","created_at":"2026-05-20T00:00:22.344951+00:00"},{"alias_kind":"arxiv_version","alias_value":"2507.16806v2","created_at":"2026-05-20T00:00:22.344951+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.16806","created_at":"2026-05-20T00:00:22.344951+00:00"},{"alias_kind":"pith_short_12","alias_value":"XRC2M3DMBNRN","created_at":"2026-05-20T00:00:22.344951+00:00"},{"alias_kind":"pith_short_16","alias_value":"XRC2M3DMBNRNQLHU","created_at":"2026-05-20T00:00:22.344951+00:00"},{"alias_kind":"pith_short_8","alias_value":"XRC2M3DM","created_at":"2026-05-20T00:00:22.344951+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":9,"internal_anchor_count":9,"sample":[{"citing_arxiv_id":"2601.05905","citing_title":"Illusions of Confidence? Diagnosing LLM Truthfulness via Neighborhood Consistency","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2603.09117","citing_title":"Decoupling Reasoning and Confidence: Resurrecting Calibration in Reinforcement Learning from Verifiable Rewards","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06785","citing_title":"Distributional Process Reward Models: Calibrated Prediction of Future Rewards via Conditional Optimal Transport","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26152","citing_title":"AI Observability for Large Language Model Systems: A Multi-Layer Analysis of Monitoring Approaches from Confidence Calibration to Infrastructure Tracing","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23333","citing_title":"Process Supervision of Confidence Margin for Calibrated LLM Reasoning","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22110","citing_title":"Do Not Imitate, Reinforce: Iterative Classification via Belief Refinement","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.12632","citing_title":"Calibration-Aware Policy Optimization for Reasoning LLMs","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06785","citing_title":"Distributional Process Reward Models: Calibrated Prediction of Future Rewards via Conditional Optimal Transport","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19444","citing_title":"Unsupervised Confidence Calibration for Reasoning LLMs from a Single Generation","ref_index":155,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE","json":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE.json","graph_json":"https://pith.science/api/pith-number/XRC2M3DMBNRNQLHUOCM66SZSDE/graph.json","events_json":"https://pith.science/api/pith-number/XRC2M3DMBNRNQLHUOCM66SZSDE/events.json","paper":"https://pith.science/paper/XRC2M3DM"},"agent_actions":{"view_html":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE","download_json":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE.json","view_paper":"https://pith.science/paper/XRC2M3DM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2507.16806&json=true","fetch_graph":"https://pith.science/api/pith-number/XRC2M3DMBNRNQLHUOCM66SZSDE/graph.json","fetch_events":"https://pith.science/api/pith-number/XRC2M3DMBNRNQLHUOCM66SZSDE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE/action/storage_attestation","attest_author":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE/action/author_attestation","sign_citation":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE/action/citation_signature","submit_replication":"https://pith.science/pith/XRC2M3DMBNRNQLHUOCM66SZSDE/action/replication_record"}},"created_at":"2026-05-20T00:00:22.344951+00:00","updated_at":"2026-05-20T00:00:22.344951+00:00"}