{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:7WLALX5EJFKBZZFB2QPDXEOKZ6","short_pith_number":"pith:7WLALX5E","schema_version":"1.0","canonical_sha256":"fd9605dfa449541ce4a1d41e3b91cacf94eea0c71bb9116fa4aac4dac3d67ddd","source":{"kind":"arxiv","id":"2602.00521","version":2},"attestation_state":"computed","paper":{"title":"Diagnosing the Reliability of LLM-as-a-Judge via Item Response Theory","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Bugeun Kim, Chanhee Cho, Hyeonchu Park, Junhyuk Choi, Sohhyung Park","submitted_at":"2026-01-31T05:24:08Z","abstract_excerpt":"While LLM-as-a-Judge is widely used in automated evaluation, existing validation practices primarily operate at the level of observed outputs, offering limited insight into whether LLM judges themselves function as stable and reliable measurement instruments. To address this limitation, we introduce a two-phase diagnostic framework for assessing reliability of LLM-as-a-Judge, grounded in Item Response Theory (IRT). The framework adopts Graded Response Model (GRM) of IRT and formalizes reliability along two complementary dimensions: (1) intrinsic consistency, defined as the stability of measure"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.00521","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-31T05:24:08Z","cross_cats_sorted":[],"title_canon_sha256":"7ea628566fe52c7b76fb8256377fee60b95bdb77f874a038b367967b76a1c4d0","abstract_canon_sha256":"a5d5c82e118324b92658849a459779dad9eecc5dfefe553f66f6d97903c7f5f5"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-01T01:03:49.407180Z","signature_b64":"LXpmowaWuSnX02/ctTqGt4Z4fAVYSENxXnfPNX2JW840o2HSwN/ky3Go6YSTpjfNGfC9lA8U8EzV/QBPjkI2Dg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fd9605dfa449541ce4a1d41e3b91cacf94eea0c71bb9116fa4aac4dac3d67ddd","last_reissued_at":"2026-06-01T01:03:49.406251Z","signature_status":"signed_v1","first_computed_at":"2026-06-01T01:03:49.406251Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Diagnosing the Reliability of LLM-as-a-Judge via Item Response Theory","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Bugeun Kim, Chanhee Cho, Hyeonchu Park, Junhyuk Choi, Sohhyung Park","submitted_at":"2026-01-31T05:24:08Z","abstract_excerpt":"While LLM-as-a-Judge is widely used in automated evaluation, existing validation practices primarily operate at the level of observed outputs, offering limited insight into whether LLM judges themselves function as stable and reliable measurement instruments. To address this limitation, we introduce a two-phase diagnostic framework for assessing reliability of LLM-as-a-Judge, grounded in Item Response Theory (IRT). The framework adopts Graded Response Model (GRM) of IRT and formalizes reliability along two complementary dimensions: (1) intrinsic consistency, defined as the stability of measure"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.00521","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.00521/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.00521","created_at":"2026-06-01T01:03:49.406402+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.00521v2","created_at":"2026-06-01T01:03:49.406402+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.00521","created_at":"2026-06-01T01:03:49.406402+00:00"},{"alias_kind":"pith_short_12","alias_value":"7WLALX5EJFKB","created_at":"2026-06-01T01:03:49.406402+00:00"},{"alias_kind":"pith_short_16","alias_value":"7WLALX5EJFKBZZFB","created_at":"2026-06-01T01:03:49.406402+00:00"},{"alias_kind":"pith_short_8","alias_value":"7WLALX5E","created_at":"2026-06-01T01:03:49.406402+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.00238","citing_title":"Estimating LLM Grading Ability and Response Difficulty in Automatic Short Answer Grading via Item Response Theory","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00238","citing_title":"Estimating LLM Grading Ability and Response Difficulty in Automatic Short Answer Grading via Item Response Theory","ref_index":27,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6","json":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6.json","graph_json":"https://pith.science/api/pith-number/7WLALX5EJFKBZZFB2QPDXEOKZ6/graph.json","events_json":"https://pith.science/api/pith-number/7WLALX5EJFKBZZFB2QPDXEOKZ6/events.json","paper":"https://pith.science/paper/7WLALX5E"},"agent_actions":{"view_html":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6","download_json":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6.json","view_paper":"https://pith.science/paper/7WLALX5E","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.00521&json=true","fetch_graph":"https://pith.science/api/pith-number/7WLALX5EJFKBZZFB2QPDXEOKZ6/graph.json","fetch_events":"https://pith.science/api/pith-number/7WLALX5EJFKBZZFB2QPDXEOKZ6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/action/storage_attestation","attest_author":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/action/author_attestation","sign_citation":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/action/citation_signature","submit_replication":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/action/replication_record"}},"created_at":"2026-06-01T01:03:49.406402+00:00","updated_at":"2026-06-01T01:03:49.406402+00:00"}