{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:TYIDEY22PUNQR7JNV2HIFOIU77","short_pith_number":"pith:TYIDEY22","schema_version":"1.0","canonical_sha256":"9e1032635a7d1b08fd2dae8e82b914ffe2c17eceebc1af113705c9df467ebed1","source":{"kind":"arxiv","id":"2606.10296","version":1},"attestation_state":"computed","paper":{"title":"The Confident Liar: Diagnosing Multi-Agent Debate with Log-Probabilities and LLM-as-Judge","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Ali Keramati, Jacob Horne, Justin Cheok, Mark Warschauer","submitted_at":"2026-06-09T01:33:15Z","abstract_excerpt":"Multi-agent debate systems are typically evaluated only on whether the final answer is correct, overlooking the quality of the intermediate reasoning that debate is designed to produce. This paper studies the relationship between three signals in multi-agent debate: token-level log-probability distributions over reasoning tokens, LLM-as-judge rubric scores assigned to those tokens, and final task accuracy. We examine whether internal confidence signals predict externally evaluated reasoning quality, and whether either signal aligns with task correctness, across three domains: rubric-based scor"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.10296","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-09T01:33:15Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"ac381e24b08785357fc53a217fff622be0be19fc02e8a3dccf2a73e1fdb224b1","abstract_canon_sha256":"8afd24a189b4ae52990b9627a9d930f231fac759a2084f2f3404a872f7eecd66"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-10T01:10:10.242673Z","signature_b64":"/JMb8RcQmz4OOcj5AfAHZbgMatg1EIOiiRS0kZwBKsYsAe7t3PVZVVYNqllfqq9pLxWUIyGBVApDiIgJ8JTZAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9e1032635a7d1b08fd2dae8e82b914ffe2c17eceebc1af113705c9df467ebed1","last_reissued_at":"2026-06-10T01:10:10.241582Z","signature_status":"signed_v1","first_computed_at":"2026-06-10T01:10:10.241582Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"The Confident Liar: Diagnosing Multi-Agent Debate with Log-Probabilities and LLM-as-Judge","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Ali Keramati, Jacob Horne, Justin Cheok, Mark Warschauer","submitted_at":"2026-06-09T01:33:15Z","abstract_excerpt":"Multi-agent debate systems are typically evaluated only on whether the final answer is correct, overlooking the quality of the intermediate reasoning that debate is designed to produce. This paper studies the relationship between three signals in multi-agent debate: token-level log-probability distributions over reasoning tokens, LLM-as-judge rubric scores assigned to those tokens, and final task accuracy. We examine whether internal confidence signals predict externally evaluated reasoning quality, and whether either signal aligns with task correctness, across three domains: rubric-based scor"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.10296","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.10296/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.10296","created_at":"2026-06-10T01:10:10.241743+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.10296v1","created_at":"2026-06-10T01:10:10.241743+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.10296","created_at":"2026-06-10T01:10:10.241743+00:00"},{"alias_kind":"pith_short_12","alias_value":"TYIDEY22PUNQ","created_at":"2026-06-10T01:10:10.241743+00:00"},{"alias_kind":"pith_short_16","alias_value":"TYIDEY22PUNQR7JN","created_at":"2026-06-10T01:10:10.241743+00:00"},{"alias_kind":"pith_short_8","alias_value":"TYIDEY22","created_at":"2026-06-10T01:10:10.241743+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77","json":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77.json","graph_json":"https://pith.science/api/pith-number/TYIDEY22PUNQR7JNV2HIFOIU77/graph.json","events_json":"https://pith.science/api/pith-number/TYIDEY22PUNQR7JNV2HIFOIU77/events.json","paper":"https://pith.science/paper/TYIDEY22"},"agent_actions":{"view_html":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77","download_json":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77.json","view_paper":"https://pith.science/paper/TYIDEY22","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.10296&json=true","fetch_graph":"https://pith.science/api/pith-number/TYIDEY22PUNQR7JNV2HIFOIU77/graph.json","fetch_events":"https://pith.science/api/pith-number/TYIDEY22PUNQR7JNV2HIFOIU77/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77/action/timestamp_anchor","attest_storage":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77/action/storage_attestation","attest_author":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77/action/author_attestation","sign_citation":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77/action/citation_signature","submit_replication":"https://pith.science/pith/TYIDEY22PUNQR7JNV2HIFOIU77/action/replication_record"}},"created_at":"2026-06-10T01:10:10.241743+00:00","updated_at":"2026-06-10T01:10:10.241743+00:00"}