{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:4QDH3RXS6K6BRONFMLW55BADV3","short_pith_number":"pith:4QDH3RXS","schema_version":"1.0","canonical_sha256":"e4067dc6f2f2bc18b9a562edde8403aed810890997d83da2d1c2eecf562687f2","source":{"kind":"arxiv","id":"2605.16354","version":1},"attestation_state":"computed","paper":{"title":"Augmenting Human Evaluation with LLM Judges: How Many Human Reviews Do You Need?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.HC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Jane Paik Kim","submitted_at":"2026-05-08T17:13:08Z","abstract_excerpt":"Large language models (LLMs) are increasingly used as automated evaluators of AI systems, including in high-stakes applications. In this role, LLMs are used to generate judgments about the quality, appropriateness, or even safety of model outputs. This approach is motivated by practical constraints. Expert human ratings are costly and difficult to scale, whereas LLM ratings can be produced quickly at low cost. However, current approaches to deploying LLM evaluators are ad hoc, typically limited to reporting agreement metrics between human and LLM judges as a justification for substitution of h"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.16354","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-08T17:13:08Z","cross_cats_sorted":["cs.AI","cs.CL","cs.HC","stat.ML"],"title_canon_sha256":"4ed472f1e78b563a2eec4e24e660dd71704aae7aa4963c77e6e6262b9f435c4e","abstract_canon_sha256":"efbd1d83cabd374e95f9ae31d7ab76f10646bd1986c75d7647d44b7866274104"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:18.036893Z","signature_b64":"8pLEc7ABuVntjwS/TL0dla77AFJ6pcxFAUSngQhYnXBAVKEegFe8E7SXbUleWTt81Emf2ljCUQbBx9x6xR24CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e4067dc6f2f2bc18b9a562edde8403aed810890997d83da2d1c2eecf562687f2","last_reissued_at":"2026-05-20T00:02:18.036222Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:18.036222Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Augmenting Human Evaluation with LLM Judges: How Many Human Reviews Do You Need?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.HC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Jane Paik Kim","submitted_at":"2026-05-08T17:13:08Z","abstract_excerpt":"Large language models (LLMs) are increasingly used as automated evaluators of AI systems, including in high-stakes applications. In this role, LLMs are used to generate judgments about the quality, appropriateness, or even safety of model outputs. This approach is motivated by practical constraints. Expert human ratings are costly and difficult to scale, whereas LLM ratings can be produced quickly at low cost. However, current approaches to deploying LLM evaluators are ad hoc, typically limited to reporting agreement metrics between human and LLM judges as a justification for substitution of h"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.16354","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.16354/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.16354","created_at":"2026-05-20T00:02:18.036330+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.16354v1","created_at":"2026-05-20T00:02:18.036330+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16354","created_at":"2026-05-20T00:02:18.036330+00:00"},{"alias_kind":"pith_short_12","alias_value":"4QDH3RXS6K6B","created_at":"2026-05-20T00:02:18.036330+00:00"},{"alias_kind":"pith_short_16","alias_value":"4QDH3RXS6K6BRONF","created_at":"2026-05-20T00:02:18.036330+00:00"},{"alias_kind":"pith_short_8","alias_value":"4QDH3RXS","created_at":"2026-05-20T00:02:18.036330+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3","json":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3.json","graph_json":"https://pith.science/api/pith-number/4QDH3RXS6K6BRONFMLW55BADV3/graph.json","events_json":"https://pith.science/api/pith-number/4QDH3RXS6K6BRONFMLW55BADV3/events.json","paper":"https://pith.science/paper/4QDH3RXS"},"agent_actions":{"view_html":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3","download_json":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3.json","view_paper":"https://pith.science/paper/4QDH3RXS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.16354&json=true","fetch_graph":"https://pith.science/api/pith-number/4QDH3RXS6K6BRONFMLW55BADV3/graph.json","fetch_events":"https://pith.science/api/pith-number/4QDH3RXS6K6BRONFMLW55BADV3/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3/action/storage_attestation","attest_author":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3/action/author_attestation","sign_citation":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3/action/citation_signature","submit_replication":"https://pith.science/pith/4QDH3RXS6K6BRONFMLW55BADV3/action/replication_record"}},"created_at":"2026-05-20T00:02:18.036330+00:00","updated_at":"2026-05-20T00:02:18.036330+00:00"}