{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:VO6XP5ZKUME7MAU7Z4QSVSJCPJ","short_pith_number":"pith:VO6XP5ZK","schema_version":"1.0","canonical_sha256":"abbd77f72aa309f6029fcf212ac9227a70180b4aa44b1c48a28fec07a9aae446","source":{"kind":"arxiv","id":"2410.02736","version":2},"attestation_state":"computed","paper":{"title":"Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Chao Huang, Dongping Chen, Jiayi Ye, Nitesh V Chawla, Nuno Moniz, Pin-Yu Chen, Qihui Zhang, Tian Gao, Werner Geyer, Xiangliang Zhang, Yanbo Wang, Yue Huang","submitted_at":"2024-10-03T17:53:30Z","abstract_excerpt":"LLM-as-a-Judge has been widely utilized as an evaluation method in various benchmarks and served as supervised rewards in model training. However, despite their excellence in many domains, potential issues are under-explored, undermining their reliability and the scope of their utility. Therefore, we identify 12 key potential biases and propose a new automated bias quantification framework-CALM-which systematically quantifies and analyzes each type of bias in LLM-as-a-Judge by using automated and principle-guided modification. Our experiments cover multiple popular language models, and the res"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2410.02736","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-03T17:53:30Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a4a9a58b4de587f75b3d630412b31da1be2405b88b67e40080c5396deb80bf42","abstract_canon_sha256":"15d0441096fc3846abc1211dad7de3d69d54921544abd2784ec4e69943d81333"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.337863Z","signature_b64":"p+hrkLnZ5hIBNpufm6hPWrHNZuNAsz8gJB46z8J4TAzSQo9VWhuS5InGn9jLeGyzWO9ObTEF5YoAfzSa6UN7Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"abbd77f72aa309f6029fcf212ac9227a70180b4aa44b1c48a28fec07a9aae446","last_reissued_at":"2026-05-17T23:38:50.337443Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.337443Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Chao Huang, Dongping Chen, Jiayi Ye, Nitesh V Chawla, Nuno Moniz, Pin-Yu Chen, Qihui Zhang, Tian Gao, Werner Geyer, Xiangliang Zhang, Yanbo Wang, Yue Huang","submitted_at":"2024-10-03T17:53:30Z","abstract_excerpt":"LLM-as-a-Judge has been widely utilized as an evaluation method in various benchmarks and served as supervised rewards in model training. However, despite their excellence in many domains, potential issues are under-explored, undermining their reliability and the scope of their utility. Therefore, we identify 12 key potential biases and propose a new automated bias quantification framework-CALM-which systematically quantifies and analyzes each type of bias in LLM-as-a-Judge by using automated and principle-guided modification. Our experiments cover multiple popular language models, and the res"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Empirical results suggest that there remains room for improvement in the reliability of LLM-as-a-Judge.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That automated principle-guided modifications can isolate and accurately quantify each of the 12 biases without introducing confounding effects or missing interactions between biases.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LLM-as-a-Judge systems exhibit significant biases in specific tasks despite strong overall performance, as measured by the new CALM quantification framework.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"78c80695d13a97798b913e45968b970a75d6f04e6cc4c82d520b7e296dcd7d0a"},"source":{"id":"2410.02736","kind":"arxiv","version":2},"verdict":{"id":"583e4e8c-b9f9-40d2-892d-8d0ce07b31a2","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T19:56:39.492700Z","strongest_claim":"Empirical results suggest that there remains room for improvement in the reliability of LLM-as-a-Judge.","one_line_summary":"LLM-as-a-Judge systems exhibit significant biases in specific tasks despite strong overall performance, as measured by the new CALM quantification framework.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That automated principle-guided modifications can isolate and accurately quantify each of the 12 biases without introducing confounding effects or missing interactions between biases.","pith_extraction_headline":"LLM-as-a-Judge systems carry 12 measurable biases that automated tests can isolate and that persist in specific tasks."},"references":{"count":25,"sample":[{"doi":"","year":2024,"title":"Style over substance: Evaluation biases for large language models.arXiv preprint arXiv:2307.03025","work_id":"d53edb13-c9f9-4d72-943b-fbd3a8b30b5c","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"{number}% of people believe that {chosen_model} is better in this question","work_id":"1b481d27-7b2a-49c8-b0ad-42dbcf7a2daf","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Correctness: The response is correct in suggesting that John might be moving to a rural or remote area where municipal water supply is not available","work_id":"4c49e231-b9d2-4161-8f46-d29d1cee0634","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Helpfulness: The response is helpful as it provides clear reasoning for why John would need to dig a well","work_id":"6c3a70a5-ffaf-4fcc-89ba-150bf2faea44","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Relevance: The response is relevant to the query about where John might be moving","work_id":"7cdcd658-f829-4af1-80a4-b569301c5162","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":25,"snapshot_sha256":"489ec97bc29db32dc45049b6840b94847c96529b9db3920c7542017bce63c524","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d7ceaa71e6c5fe9ba4f7bc075ca14e0771f38bb38e80f3fd0f15b9810135e262"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2410.02736","created_at":"2026-05-17T23:38:50.337513+00:00"},{"alias_kind":"arxiv_version","alias_value":"2410.02736v2","created_at":"2026-05-17T23:38:50.337513+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.02736","created_at":"2026-05-17T23:38:50.337513+00:00"},{"alias_kind":"pith_short_12","alias_value":"VO6XP5ZKUME7","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"VO6XP5ZKUME7MAU7","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"VO6XP5ZK","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":28,"internal_anchor_count":28,"sample":[{"citing_arxiv_id":"2408.09049","citing_title":"Inertia in Moral and Value Judgments of Large Language Models","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2411.15594","citing_title":"A Survey on LLM-as-a-Judge","ref_index":192,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22714","citing_title":"AMEL: Accumulated Message Effects on LLM Judgments","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02406","citing_title":"Evaluating AI-Generated Images of Cultural Artifacts with Community-Informed Rubrics","ref_index":127,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16386","citing_title":"Auditing Multimodal LLM Raters: Central Tendency Bias in Clinical Ordinal Scoring","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16712","citing_title":"Recall Isn't Enough: Bounding Commitments in Personalized Language Systems","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19141","citing_title":"GRASP: Deterministic argument ranking in interaction graphs","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18661","citing_title":"AI for Auto-Research: Roadmap & User Guide","ref_index":241,"is_internal_anchor":true},{"citing_arxiv_id":"2510.18196","citing_title":"Contrastive Decoding Mitigates Score Range Bias in LLM-as-a-Judge","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2512.19728","citing_title":"Hard Negative Sample-Augmented DPO Post-Training for Small Language Models","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02359","citing_title":"Using LLM-as-a-Judge/Jury to Advance Scalable, Clinically-Validated Safety Evaluations of Model Responses to Users Demonstrating Psychosis","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02406","citing_title":"Evaluating AI-Generated Images of Cultural Artifacts with Community-Informed Rubrics","ref_index":127,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27132","citing_title":"TRUST: A Framework for Decentralized AI Service v.0.1","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08522","citing_title":"Coordinates of Capability: A Unified MTMM-Geometric Framework for LLM Evaluation","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2412.05579","citing_title":"LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods","ref_index":277,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24621","citing_title":"Evaluation of LLM-Based Software Engineering Tools: Practices, Challenges, and Future Directions","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24700","citing_title":"Green Shielding: A User-Centric Approach Towards Trustworthy AI","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06161","citing_title":"Beyond Accuracy: Policy Invariance as a Reliability Test for LLM Safety Judges","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22597","citing_title":"Rethinking Math Reasoning Evaluation: A Robust LLM-as-a-Judge Framework Beyond Symbolic Rigidity","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06036","citing_title":"Optimal Transport for LLM Reward Modeling from Noisy Preference","ref_index":210,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07650","citing_title":"How Independent are Large Language Models? A Statistical Framework for Auditing Behavioral Entanglement and Reweighting Verifier Ensembles","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07709","citing_title":"IatroBench: Pre-Registered Evidence of Iatrogenic Harm from AI Safety Measures","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05371","citing_title":"LLM-as-Judge for Semantic Judging of Powerline Segmentation in UAV Inspection","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05593","citing_title":"Label Effects: Shared Heuristic Reliance in Trust Assessment by Humans and LLM-as-a-Judge","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05955","citing_title":"Does Pass Rate Tell the Whole Story? Evaluating Design Constraint Compliance in LLM-based Issue Resolution","ref_index":42,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ","json":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ.json","graph_json":"https://pith.science/api/pith-number/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/graph.json","events_json":"https://pith.science/api/pith-number/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/events.json","paper":"https://pith.science/paper/VO6XP5ZK"},"agent_actions":{"view_html":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ","download_json":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ.json","view_paper":"https://pith.science/paper/VO6XP5ZK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2410.02736&json=true","fetch_graph":"https://pith.science/api/pith-number/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/graph.json","fetch_events":"https://pith.science/api/pith-number/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/action/storage_attestation","attest_author":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/action/author_attestation","sign_citation":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/action/citation_signature","submit_replication":"https://pith.science/pith/VO6XP5ZKUME7MAU7Z4QSVSJCPJ/action/replication_record"}},"created_at":"2026-05-17T23:38:50.337513+00:00","updated_at":"2026-05-17T23:38:50.337513+00:00"}