{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:MDF3373RIIMDNQJZQ5BQYYKIRB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4a9f15ac01f3cf9e3f8f70be156ffd8d06fd4f9e398b6820200a3162380b3d3d","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-04-29T15:33:23Z","title_canon_sha256":"9d60ce3a7ac97b31664a1f2f06e1792a1a9bce153ac2c1053b4ae652505ac363"},"schema_version":"1.0","source":{"id":"2404.18796","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2404.18796","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2404.18796v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.18796","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"MDF3373RIIMD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MDF3373RIIMDNQJZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MDF3373R","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:ec3af4827b1ade60b0f96f9e010b5fe6a25022c0e84aee12f6008c270cda9452","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"using a PoLL composed of a larger number of smaller models outperforms a single large judge, exhibits less intra-model bias due to its composition of disjoint model families, and does so while being over seven times less expensive."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the collective judgments of smaller models from disjoint families can capture nuanced quality signals at least as well as a single frontier model without systematic blind spots on the evaluated tasks."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A panel of smaller diverse LLMs outperforms a single large model as an evaluator of generations, showing less intra-model bias and over 7x lower cost."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A panel of smaller diverse LLMs judges model outputs better than one large model while costing far less."}],"snapshot_sha256":"02805f7d5fa51e54d63a5dc3cc54822fc0bc1df59e83a04d6b565d8a25403e74"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"a81cadf4bda0bd5e6a33aef6b34918be4254fe641e9801cc4342a15a982b6ddf"},"paper":{"abstract_excerpt":"As Large Language Models (LLMs) have become more advanced, they have outpaced our abilities to accurately evaluate their quality. Not only is finding data to adequately probe particular model properties difficult, but evaluating the correctness of a model's freeform generation alone is a challenge. To address this, many evaluations now rely on using LLMs themselves as judges to score the quality of outputs from other LLMs. Evaluations most commonly use a single large model like GPT4. While this method has grown in popularity, it is costly, has been shown to introduce intramodel bias, and in th","authors_text":"Aleksandra Piktus, Arkady Arkhangorodsky, Minjie Xu, Naomi White, Patrick Lewis, Pat Verga, Sebastian Hofstatter, Sophia Althammer, Yixuan Su","cross_cats":["cs.AI"],"headline":"A panel of smaller diverse LLMs judges model outputs better than one large model while costing far less.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-04-29T15:33:23Z","title":"Replacing Judges with Juries: Evaluating LLM Generations with a Panel of Diverse Models"},"references":{"count":291,"internal_anchors":3,"resolved_work":291,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Anthropic. 2024. The claude 3 model family: Opus, sonnet, haiku","work_id":"b2994d79-7b31-437d-81e0-ab0c78132716","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. In International Conference on Learning","work_id":"7483e4be-735b-4e12-aa3d-2e1bcb6d1af4","year":2020},{"cited_arxiv_id":"","doi":"10.18653/v1/p17-1147","is_internal_anchor":false,"ref_index":5,"title":"TriviaQA: A large scale distantly supervised challenge dataset for reading comprehension","work_id":"d05a9c57-9d88-473a-aa65-efb13f9dee25","year":2017},{"cited_arxiv_id":"2004.04906","doi":"","is_internal_anchor":true,"ref_index":6,"title":"Dense Passage Retrieval for Open-Domain Question Answering","work_id":"3d6f2008-b001-4542-ba3f-192f6880c74b","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":7,"title":"Maurice G Kendall. 1938. A new measure of rank correlation. Biometrika, 30(1/2):81--93","work_id":"d65e5c0f-9765-4dc9-97fc-dc5c301f3e21","year":1938}],"snapshot_sha256":"c0107f137dbf9cc663af39ca04552b1ef9b5ee02ed026d31d728565cac86c635"},"source":{"id":"2404.18796","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T23:27:15.223901Z","id":"f3ec4dfc-1a89-4ab1-be91-28f11c01ba6f","model_set":{"reader":"grok-4.3"},"one_line_summary":"A panel of smaller diverse LLMs outperforms a single large model as an evaluator of generations, showing less intra-model bias and over 7x lower cost.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A panel of smaller diverse LLMs judges model outputs better than one large model while costing far less.","strongest_claim":"using a PoLL composed of a larger number of smaller models outperforms a single large judge, exhibits less intra-model bias due to its composition of disjoint model families, and does so while being over seven times less expensive.","weakest_assumption":"That the collective judgments of smaller models from disjoint families can capture nuanced quality signals at least as well as a single frontier model without systematic blind spots on the evaluated tasks."}},"verdict_id":"f3ec4dfc-1a89-4ab1-be91-28f11c01ba6f"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:fabc7629135c4a433dff1aa2c96f29d52623e1cedca92f264bf52c2bd25719f0","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4a9f15ac01f3cf9e3f8f70be156ffd8d06fd4f9e398b6820200a3162380b3d3d","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-04-29T15:33:23Z","title_canon_sha256":"9d60ce3a7ac97b31664a1f2f06e1792a1a9bce153ac2c1053b4ae652505ac363"},"schema_version":"1.0","source":{"id":"2404.18796","kind":"arxiv","version":2}},"canonical_sha256":"60cbbdff71421836c13987430c61488868ff86841ddc1fc1b48c7811f418ffec","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"60cbbdff71421836c13987430c61488868ff86841ddc1fc1b48c7811f418ffec","first_computed_at":"2026-05-17T23:38:49.775178Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.775178Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"0WTkBfiV9IOSOoCaUOctuxVj74sOrTTMZM3WZbi1mBEVGodWW9KmyC8CxCYrSLdHtK0wyAFMk4rz0VAeCIOACQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.775758Z","signed_message":"canonical_sha256_bytes"},"source_id":"2404.18796","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:fabc7629135c4a433dff1aa2c96f29d52623e1cedca92f264bf52c2bd25719f0","sha256:ec3af4827b1ade60b0f96f9e010b5fe6a25022c0e84aee12f6008c270cda9452"],"state_sha256":"2bc9d7050b87d2ab8a8228ae02a1114d2b8131c8828d407581270ac1afa419ec"}