{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:7WLALX5EJFKBZZFB2QPDXEOKZ6","short_pith_number":"pith:7WLALX5E","canonical_record":{"source":{"id":"2602.00521","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-31T05:24:08Z","cross_cats_sorted":[],"title_canon_sha256":"7ea628566fe52c7b76fb8256377fee60b95bdb77f874a038b367967b76a1c4d0","abstract_canon_sha256":"a5d5c82e118324b92658849a459779dad9eecc5dfefe553f66f6d97903c7f5f5"},"schema_version":"1.0"},"canonical_sha256":"fd9605dfa449541ce4a1d41e3b91cacf94eea0c71bb9116fa4aac4dac3d67ddd","source":{"kind":"arxiv","id":"2602.00521","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.00521","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"arxiv_version","alias_value":"2602.00521v2","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.00521","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"pith_short_12","alias_value":"7WLALX5EJFKB","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"pith_short_16","alias_value":"7WLALX5EJFKBZZFB","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"pith_short_8","alias_value":"7WLALX5E","created_at":"2026-06-01T01:03:49Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:7WLALX5EJFKBZZFB2QPDXEOKZ6","target":"record","payload":{"canonical_record":{"source":{"id":"2602.00521","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-31T05:24:08Z","cross_cats_sorted":[],"title_canon_sha256":"7ea628566fe52c7b76fb8256377fee60b95bdb77f874a038b367967b76a1c4d0","abstract_canon_sha256":"a5d5c82e118324b92658849a459779dad9eecc5dfefe553f66f6d97903c7f5f5"},"schema_version":"1.0"},"canonical_sha256":"fd9605dfa449541ce4a1d41e3b91cacf94eea0c71bb9116fa4aac4dac3d67ddd","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-01T01:03:49.407180Z","signature_b64":"LXpmowaWuSnX02/ctTqGt4Z4fAVYSENxXnfPNX2JW840o2HSwN/ky3Go6YSTpjfNGfC9lA8U8EzV/QBPjkI2Dg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fd9605dfa449541ce4a1d41e3b91cacf94eea0c71bb9116fa4aac4dac3d67ddd","last_reissued_at":"2026-06-01T01:03:49.406251Z","signature_status":"signed_v1","first_computed_at":"2026-06-01T01:03:49.406251Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.00521","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-01T01:03:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"AvtEAAC9EeGu/L0xQ2fKVTkYkqUhIeFHRtPepZY+nFQ7cmosR0xyEYkjOxTq1R72ok7p78n9etcfEuj8zFk1DA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T07:12:39.606968Z"},"content_sha256":"f429d26cde9ce69d571958b0bb00c3f0c42a424ae856c382ebf92ed03ac3237f","schema_version":"1.0","event_id":"sha256:f429d26cde9ce69d571958b0bb00c3f0c42a424ae856c382ebf92ed03ac3237f"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:7WLALX5EJFKBZZFB2QPDXEOKZ6","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Diagnosing the Reliability of LLM-as-a-Judge via Item Response Theory","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Bugeun Kim, Chanhee Cho, Hyeonchu Park, Junhyuk Choi, Sohhyung Park","submitted_at":"2026-01-31T05:24:08Z","abstract_excerpt":"While LLM-as-a-Judge is widely used in automated evaluation, existing validation practices primarily operate at the level of observed outputs, offering limited insight into whether LLM judges themselves function as stable and reliable measurement instruments. To address this limitation, we introduce a two-phase diagnostic framework for assessing reliability of LLM-as-a-Judge, grounded in Item Response Theory (IRT). The framework adopts Graded Response Model (GRM) of IRT and formalizes reliability along two complementary dimensions: (1) intrinsic consistency, defined as the stability of measure"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.00521","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.00521/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-01T01:03:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"801d0DVCAcld+ySrQzWikwiW/o3/pjMzcl8ZuxwYk+dhYAVpLxBGw07gHBsw1KDU1iHBqRiNHdNbScJMnUR7AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-10T07:12:39.607705Z"},"content_sha256":"ef6fce91c121d90fdde46e8b66cac925dcd1e8f769d7c78ca3b3fe31fdcb9ca8","schema_version":"1.0","event_id":"sha256:ef6fce91c121d90fdde46e8b66cac925dcd1e8f769d7c78ca3b3fe31fdcb9ca8"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/bundle.json","state_url":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-10T07:12:39Z","links":{"resolver":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6","bundle":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/bundle.json","state":"https://pith.science/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/state.json","well_known_bundle":"https://pith.science/.well-known/pith/7WLALX5EJFKBZZFB2QPDXEOKZ6/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:7WLALX5EJFKBZZFB2QPDXEOKZ6","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a5d5c82e118324b92658849a459779dad9eecc5dfefe553f66f6d97903c7f5f5","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-31T05:24:08Z","title_canon_sha256":"7ea628566fe52c7b76fb8256377fee60b95bdb77f874a038b367967b76a1c4d0"},"schema_version":"1.0","source":{"id":"2602.00521","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.00521","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"arxiv_version","alias_value":"2602.00521v2","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.00521","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"pith_short_12","alias_value":"7WLALX5EJFKB","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"pith_short_16","alias_value":"7WLALX5EJFKBZZFB","created_at":"2026-06-01T01:03:49Z"},{"alias_kind":"pith_short_8","alias_value":"7WLALX5E","created_at":"2026-06-01T01:03:49Z"}],"graph_snapshots":[{"event_id":"sha256:ef6fce91c121d90fdde46e8b66cac925dcd1e8f769d7c78ca3b3fe31fdcb9ca8","target":"graph","created_at":"2026-06-01T01:03:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.00521/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"While LLM-as-a-Judge is widely used in automated evaluation, existing validation practices primarily operate at the level of observed outputs, offering limited insight into whether LLM judges themselves function as stable and reliable measurement instruments. To address this limitation, we introduce a two-phase diagnostic framework for assessing reliability of LLM-as-a-Judge, grounded in Item Response Theory (IRT). The framework adopts Graded Response Model (GRM) of IRT and formalizes reliability along two complementary dimensions: (1) intrinsic consistency, defined as the stability of measure","authors_text":"Bugeun Kim, Chanhee Cho, Hyeonchu Park, Junhyuk Choi, Sohhyung Park","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-31T05:24:08Z","title":"Diagnosing the Reliability of LLM-as-a-Judge via Item Response Theory"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.00521","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f429d26cde9ce69d571958b0bb00c3f0c42a424ae856c382ebf92ed03ac3237f","target":"record","created_at":"2026-06-01T01:03:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a5d5c82e118324b92658849a459779dad9eecc5dfefe553f66f6d97903c7f5f5","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-01-31T05:24:08Z","title_canon_sha256":"7ea628566fe52c7b76fb8256377fee60b95bdb77f874a038b367967b76a1c4d0"},"schema_version":"1.0","source":{"id":"2602.00521","kind":"arxiv","version":2}},"canonical_sha256":"fd9605dfa449541ce4a1d41e3b91cacf94eea0c71bb9116fa4aac4dac3d67ddd","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"fd9605dfa449541ce4a1d41e3b91cacf94eea0c71bb9116fa4aac4dac3d67ddd","first_computed_at":"2026-06-01T01:03:49.406251Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-01T01:03:49.406251Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"LXpmowaWuSnX02/ctTqGt4Z4fAVYSENxXnfPNX2JW840o2HSwN/ky3Go6YSTpjfNGfC9lA8U8EzV/QBPjkI2Dg==","signature_status":"signed_v1","signed_at":"2026-06-01T01:03:49.407180Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.00521","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f429d26cde9ce69d571958b0bb00c3f0c42a424ae856c382ebf92ed03ac3237f","sha256:ef6fce91c121d90fdde46e8b66cac925dcd1e8f769d7c78ca3b3fe31fdcb9ca8"],"state_sha256":"4b172df51097e4abcc02ca942693b66b6cfc0223dd05511f5cadd382374d9f3b"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"MXWA0+tmB4MOQil345cAboTZTjpPghorv88BJttdpDuW3xRendRyY+0AUWx3mOyztbYlIU3wB+vyo6w0/AekBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-10T07:12:39.612341Z","bundle_sha256":"b3577c9df794f6f2cd71b5715d45691b570121816fcac0e0b27e49e836ac1e08"}}