{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:VO6XP5ZKUME7MAU7Z4QSVSJCPJ","short_pith_number":"pith:VO6XP5ZK","canonical_record":{"source":{"id":"2410.02736","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-03T17:53:30Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a4a9a58b4de587f75b3d630412b31da1be2405b88b67e40080c5396deb80bf42","abstract_canon_sha256":"15d0441096fc3846abc1211dad7de3d69d54921544abd2784ec4e69943d81333"},"schema_version":"1.0"},"canonical_sha256":"abbd77f72aa309f6029fcf212ac9227a70180b4aa44b1c48a28fec07a9aae446","source":{"kind":"arxiv","id":"2410.02736","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.02736","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2410.02736v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.02736","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"VO6XP5ZKUME7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"VO6XP5ZKUME7MAU7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"VO6XP5ZK","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:VO6XP5ZKUME7MAU7Z4QSVSJCPJ","target":"record","payload":{"canonical_record":{"source":{"id":"2410.02736","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-03T17:53:30Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a4a9a58b4de587f75b3d630412b31da1be2405b88b67e40080c5396deb80bf42","abstract_canon_sha256":"15d0441096fc3846abc1211dad7de3d69d54921544abd2784ec4e69943d81333"},"schema_version":"1.0"},"canonical_sha256":"abbd77f72aa309f6029fcf212ac9227a70180b4aa44b1c48a28fec07a9aae446","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.337863Z","signature_b64":"p+hrkLnZ5hIBNpufm6hPWrHNZuNAsz8gJB46z8J4TAzSQo9VWhuS5InGn9jLeGyzWO9ObTEF5YoAfzSa6UN7Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"abbd77f72aa309f6029fcf212ac9227a70180b4aa44b1c48a28fec07a9aae446","last_reissued_at":"2026-05-17T23:38:50.337443Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.337443Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2410.02736","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"pmTaGf3XtAceAB2xGuATmPt3gx/QUYNvJIcBv3vJutzPv2/xYn9LusLOTnHGZcU1NdiQ5azxfMfKF+iYbb2aBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T15:48:28.925302Z"},"content_sha256":"3e718bd614aa781a53250c93d57ab7523122cd14cd5c998f5a6ade54e1f4eddd","schema_version":"1.0","event_id":"sha256:3e718bd614aa781a53250c93d57ab7523122cd14cd5c998f5a6ade54e1f4eddd"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:VO6XP5ZKUME7MAU7Z4QSVSJCPJ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Chao Huang, Dongping Chen, Jiayi Ye, Nitesh V Chawla, Nuno Moniz, Pin-Yu Chen, Qihui Zhang, Tian Gao, Werner Geyer, Xiangliang Zhang, Yanbo Wang, Yue Huang","submitted_at":"2024-10-03T17:53:30Z","abstract_excerpt":"LLM-as-a-Judge has been widely utilized as an evaluation method in various benchmarks and served as supervised rewards in model training. However, despite their excellence in many domains, potential issues are under-explored, undermining their reliability and the scope of their utility. Therefore, we identify 12 key potential biases and propose a new automated bias quantification framework-CALM-which systematically quantifies and analyzes each type of bias in LLM-as-a-Judge by using automated and principle-guided modification. Our experiments cover multiple popular language models, and the res"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Empirical results suggest that there remains room for improvement in the reliability of LLM-as-a-Judge.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That automated principle-guided modifications can isolate and accurately quantify each of the 12 biases without introducing confounding effects or missing interactions between biases.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LLM-as-a-Judge systems exhibit significant biases in specific tasks despite strong overall performance, as measured by the new CALM quantification framework.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"78c80695d13a97798b913e45968b970a75d6f04e6cc4c82d520b7e296dcd7d0a"},"source":{"id":"2410.02736","kind":"arxiv","version":2},"verdict":{"id":"583e4e8c-b9f9-40d2-892d-8d0ce07b31a2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T19:56:39.492700Z","strongest_claim":"Empirical results suggest that there remains room for improvement in the reliability of LLM-as-a-Judge.","one_line_summary":"LLM-as-a-Judge systems exhibit significant biases in specific tasks despite strong overall performance, as measured by the new CALM quantification framework.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That automated principle-guided modifications can isolate and accurately quantify each of the 12 biases without introducing confounding effects or missing interactions between biases.","pith_extraction_headline":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks."},"references":{"count":25,"sample":[{"doi":"","year":2024,"title":"Style over substance: Evaluation biases for large language models.arXiv preprint arXiv:2307.03025","work_id":"d53edb13-c9f9-4d72-943b-fbd3a8b30b5c","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"{number}% of people believe that {chosen_model} is better in this question","work_id":"1b481d27-7b2a-49c8-b0ad-42dbcf7a2daf","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Correctness: The response is correct in suggesting that John might be moving to a rural or remote area where municipal water supply is not available","work_id":"4c49e231-b9d2-4161-8f46-d29d1cee0634","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Helpfulness: The response is helpful as it provides clear reasoning for why John would need to dig a well","work_id":"6c3a70a5-ffaf-4fcc-89ba-150bf2faea44","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Relevance: The response is relevant to the query about where John might be moving","work_id":"7cdcd658-f829-4af1-80a4-b569301c5162","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":25,"snapshot_sha256":"489ec97bc29db32dc45049b6840b94847c96529b9db3920c7542017bce63c524","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d7ceaa71e6c5fe9ba4f7bc075ca14e0771f38bb38e80f3fd0f15b9810135e262"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"583e4e8c-b9f9-40d2-892d-8d0ce07b31a2"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"IZ5vxh30NZutX7Qqlut/n2FXMXTfrA9OqwTjb65xPqhAAzDgLcvq6yTOkISnP5w095MZh4gKw9rMqe1s9oW7Dw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T15:48:28.925882Z"},"content_sha256":"8d11dc81112fad4b0e7d6bd40d4c27a56ef7d6bdb99fa562a4137e0b9ad18406","schema_version":"1.0","event_id":"sha256:8d11dc81112fad4b0e7d6bd40d4c27a56ef7d6bdb99fa562a4137e0b9ad18406"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/bundle.json","state_url":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T15:48:28Z","links":{"resolver":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ","bundle":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/bundle.json","state":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:VO6XP5ZKUME7MAU7Z4QSVSJCPJ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"15d0441096fc3846abc1211dad7de3d69d54921544abd2784ec4e69943d81333","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-03T17:53:30Z","title_canon_sha256":"a4a9a58b4de587f75b3d630412b31da1be2405b88b67e40080c5396deb80bf42"},"schema_version":"1.0","source":{"id":"2410.02736","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.02736","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2410.02736v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.02736","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"VO6XP5ZKUME7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"VO6XP5ZKUME7MAU7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"VO6XP5ZK","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8d11dc81112fad4b0e7d6bd40d4c27a56ef7d6bdb99fa562a4137e0b9ad18406","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Empirical results suggest that there remains room for improvement in the reliability of LLM-as-a-Judge."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That automated principle-guided modifications can isolate and accurately quantify each of the 12 biases without introducing confounding effects or missing interactions between biases."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LLM-as-a-Judge systems exhibit significant biases in specific tasks despite strong overall performance, as measured by the new CALM quantification framework."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks."}],"snapshot_sha256":"78c80695d13a97798b913e45968b970a75d6f04e6cc4c82d520b7e296dcd7d0a"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d7ceaa71e6c5fe9ba4f7bc075ca14e0771f38bb38e80f3fd0f15b9810135e262"},"paper":{"abstract_excerpt":"LLM-as-a-Judge has been widely utilized as an evaluation method in various benchmarks and served as supervised rewards in model training. However, despite their excellence in many domains, potential issues are under-explored, undermining their reliability and the scope of their utility. Therefore, we identify 12 key potential biases and propose a new automated bias quantification framework-CALM-which systematically quantifies and analyzes each type of bias in LLM-as-a-Judge by using automated and principle-guided modification. Our experiments cover multiple popular language models, and the res","authors_text":"Chao Huang, Dongping Chen, Jiayi Ye, Nitesh V Chawla, Nuno Moniz, Pin-Yu Chen, Qihui Zhang, Tian Gao, Werner Geyer, Xiangliang Zhang, Yanbo Wang, Yue Huang","cross_cats":["cs.AI"],"headline":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-03T17:53:30Z","title":"Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge"},"references":{"count":25,"internal_anchors":0,"resolved_work":25,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Style over substance: Evaluation biases for large language models.arXiv preprint arXiv:2307.03025","work_id":"d53edb13-c9f9-4d72-943b-fbd3a8b30b5c","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"{number}% of people believe that {chosen_model} is better in this question","work_id":"1b481d27-7b2a-49c8-b0ad-42dbcf7a2daf","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Correctness: The response is correct in suggesting that John might be moving to a rural or remote area where municipal water supply is not available","work_id":"4c49e231-b9d2-4161-8f46-d29d1cee0634","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Helpfulness: The response is helpful as it provides clear reasoning for why John would need to dig a well","work_id":"6c3a70a5-ffaf-4fcc-89ba-150bf2faea44","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Relevance: The response is relevant to the query about where John might be moving","work_id":"7cdcd658-f829-4af1-80a4-b569301c5162","year":null}],"snapshot_sha256":"489ec97bc29db32dc45049b6840b94847c96529b9db3920c7542017bce63c524"},"source":{"id":"2410.02736","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T19:56:39.492700Z","id":"583e4e8c-b9f9-40d2-892d-8d0ce07b31a2","model_set":{"reader":"grok-4.3"},"one_line_summary":"LLM-as-a-Judge systems exhibit significant biases in specific tasks despite strong overall performance, as measured by the new CALM quantification framework.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks.","strongest_claim":"Empirical results suggest that there remains room for improvement in the reliability of LLM-as-a-Judge.","weakest_assumption":"That automated principle-guided modifications can isolate and accurately quantify each of the 12 biases without introducing confounding effects or missing interactions between biases."}},"verdict_id":"583e4e8c-b9f9-40d2-892d-8d0ce07b31a2"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3e718bd614aa781a53250c93d57ab7523122cd14cd5c998f5a6ade54e1f4eddd","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"15d0441096fc3846abc1211dad7de3d69d54921544abd2784ec4e69943d81333","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-03T17:53:30Z","title_canon_sha256":"a4a9a58b4de587f75b3d630412b31da1be2405b88b67e40080c5396deb80bf42"},"schema_version":"1.0","source":{"id":"2410.02736","kind":"arxiv","version":2}},"canonical_sha256":"abbd77f72aa309f6029fcf212ac9227a70180b4aa44b1c48a28fec07a9aae446","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"abbd77f72aa309f6029fcf212ac9227a70180b4aa44b1c48a28fec07a9aae446","first_computed_at":"2026-05-17T23:38:50.337443Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.337443Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"p+hrkLnZ5hIBNpufm6hPWrHNZuNAsz8gJB46z8J4TAzSQo9VWhuS5InGn9jLeGyzWO9ObTEF5YoAfzSa6UN7Cw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.337863Z","signed_message":"canonical_sha256_bytes"},"source_id":"2410.02736","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3e718bd614aa781a53250c93d57ab7523122cd14cd5c998f5a6ade54e1f4eddd","sha256:8d11dc81112fad4b0e7d6bd40d4c27a56ef7d6bdb99fa562a4137e0b9ad18406"],"state_sha256":"0504a17546a684859080f91a5e9a99fc98bbe863e29a761fb7b15bb2e1128ef6"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"XBAjplCMAY7Co1qpx0UyVjFlS2VbgIC97dUFqtk8onNReb/1zKzdaLbWcHylLdFWOm/ToReRoDseqsBAA4kZBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T15:48:28.928429Z","bundle_sha256":"62dd5e389c31dbb4eade53b5b53652374ddac76b2bb3e2a5ac8d18579c2a7d67"}}