{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:4C34I5XWSL443R5PP5CYBASQMH","short_pith_number":"pith:4C34I5XW","schema_version":"1.0","canonical_sha256":"e0b7c476f692f9cdc7af7f4580825061e086b55df9c36e965ce825b047944dbf","source":{"kind":"arxiv","id":"2511.21140","version":4},"attestation_state":"computed","paper":{"title":"How to Correctly Report LLM-as-a-Judge Evaluations","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.CL","stat.AP","stat.ML"],"primary_cat":"cs.LG","authors_text":"Chungpa Lee, Jongwon Jeong, Jy-yong Sohn, Kangwook Lee, Thomas Zeng","submitted_at":"2025-11-26T07:46:46Z","abstract_excerpt":"Large language models (LLMs) are widely used as scalable evaluators of model responses in lieu of human annotators. However, imperfect sensitivity and specificity of the LLM judges induce bias in naive evaluation scores. We propose a simple plug-in framework that corrects this bias and enables statistically principled uncertainty quantification. Our framework constructs confidence intervals that account for uncertainty from both the test dataset and a human-labeled calibration dataset. Additionally, it uses an adaptive strategy to allocate calibration samples for tighter intervals. Importantly"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2511.21140","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.LG","submitted_at":"2025-11-26T07:46:46Z","cross_cats_sorted":["cs.CL","stat.AP","stat.ML"],"title_canon_sha256":"ea87b5df064e148c836da02f166b202ed4063dfc3970caae07b40a56ab6ad49d","abstract_canon_sha256":"e497e5d519af181149c20a00b1391ed3eddf11602f1e8e9101c70cff5d032a1b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:11.568752Z","signature_b64":"yLvu0hzTk5gPUEmwSC61GbmpDAnGksqKartie7PjdILnQxxKSLGZJ84Buj/FrcV+UQOyCQmzcWVstQ++jelDBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e0b7c476f692f9cdc7af7f4580825061e086b55df9c36e965ce825b047944dbf","last_reissued_at":"2026-06-02T02:04:11.568236Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:11.568236Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"How to Correctly Report LLM-as-a-Judge Evaluations","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.CL","stat.AP","stat.ML"],"primary_cat":"cs.LG","authors_text":"Chungpa Lee, Jongwon Jeong, Jy-yong Sohn, Kangwook Lee, Thomas Zeng","submitted_at":"2025-11-26T07:46:46Z","abstract_excerpt":"Large language models (LLMs) are widely used as scalable evaluators of model responses in lieu of human annotators. However, imperfect sensitivity and specificity of the LLM judges induce bias in naive evaluation scores. We propose a simple plug-in framework that corrects this bias and enables statistically principled uncertainty quantification. Our framework constructs confidence intervals that account for uncertainty from both the test dataset and a human-labeled calibration dataset. Additionally, it uses an adaptive strategy to allocate calibration samples for tighter intervals. Importantly"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2511.21140","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2511.21140/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2511.21140","created_at":"2026-06-02T02:04:11.568298+00:00"},{"alias_kind":"arxiv_version","alias_value":"2511.21140v4","created_at":"2026-06-02T02:04:11.568298+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.21140","created_at":"2026-06-02T02:04:11.568298+00:00"},{"alias_kind":"pith_short_12","alias_value":"4C34I5XWSL44","created_at":"2026-06-02T02:04:11.568298+00:00"},{"alias_kind":"pith_short_16","alias_value":"4C34I5XWSL443R5P","created_at":"2026-06-02T02:04:11.568298+00:00"},{"alias_kind":"pith_short_8","alias_value":"4C34I5XW","created_at":"2026-06-02T02:04:11.568298+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2605.23362","citing_title":"Instance-Optimal Estimation with Multiple LLM Judges on a Budget","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2512.05958","citing_title":"MaxShapley: Towards Incentive-compatible Generative Search with Fair Context Attribution","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23505","citing_title":"Uncertainty Propagation in LLM-Based Systems","ref_index":77,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22937","citing_title":"AutoPyVerifier: Learning Compact Executable Verifiers for Large Language Model Outputs","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06939","citing_title":"Bias and Uncertainty in LLM-as-a-Judge Estimation","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07572","citing_title":"Open-Ended Task Discovery via Bayesian Optimization","ref_index":41,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH","json":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH.json","graph_json":"https://pith.science/api/pith-number/4C34I5XWSL443R5PP5CYBASQMH/graph.json","events_json":"https://pith.science/api/pith-number/4C34I5XWSL443R5PP5CYBASQMH/events.json","paper":"https://pith.science/paper/4C34I5XW"},"agent_actions":{"view_html":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH","download_json":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH.json","view_paper":"https://pith.science/paper/4C34I5XW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2511.21140&json=true","fetch_graph":"https://pith.science/api/pith-number/4C34I5XWSL443R5PP5CYBASQMH/graph.json","fetch_events":"https://pith.science/api/pith-number/4C34I5XWSL443R5PP5CYBASQMH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH/action/storage_attestation","attest_author":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH/action/author_attestation","sign_citation":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH/action/citation_signature","submit_replication":"https://pith.science/pith/4C34I5XWSL443R5PP5CYBASQMH/action/replication_record"}},"created_at":"2026-06-02T02:04:11.568298+00:00","updated_at":"2026-06-02T02:04:11.568298+00:00"}