{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:WZNM7LOHONQGWMACSEN5BJ53ZP","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"efa8850c1cee582682bf8b6e383f1415d35e672a8619f1d30c73dc1d426fa080","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-03-25T17:56:55Z","title_canon_sha256":"a5ef48a112ad5f0e5bab1bb3887b3648d3139c5ac89cfd79c1119edae3fe1eab"},"schema_version":"1.0","source":{"id":"2603.24586","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.24586","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"arxiv_version","alias_value":"2603.24586v2","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.24586","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"pith_short_12","alias_value":"WZNM7LOHONQG","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WZNM7LOHONQGWMAC","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WZNM7LOH","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:bf289fe80942efd87ef897e46dfff02494773499ebf47f4b47a37c15a937121b","target":"graph","created_at":"2026-05-17T23:38:59Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Among 13 different models, the best judges underperform human annotators by 12-23%. TRACE identifies 35 significant sources of misalignment between humans and judges across interaction modalities, the majority of which correspond to existing software engineering code quality criteria."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"Human preferences collected via annotation are treated as the ground truth without significant bias or inconsistency, and the automatic rubric extraction accurately isolates the sources of misalignment without introducing artifacts from the extraction process itself."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"TRACE shows LLM judges underperform human annotators by 12-23% and misalign on 35 code quality dimensions across three coding modalities, with biases often matching existing software engineering criteria."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"LLM judges underperform human annotators by 12-23% when predicting developer code preferences across realistic tasks."}],"snapshot_sha256":"af77360836c3d3583081298d9f5e4a1a3f6e0db3285ba04fdc887a88c418e86f"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"As LLMs are increasingly used as judges in code applications, they should be evaluated in realistic interactive settings that capture partial context and ambiguous intent. We present TRACE (Tool for Rubric Analysis in Code Evaluation), a framework that evaluates LLM judges' ability to predict human preferences and automatically extracts rubric items to reveal systematic biases in how humans and models weigh each item. Across three modalities -- chat-based programming, IDE autocompletion, and instructed code editing -- we use TRACE to measure how well LLM judges align with developer preferences","authors_text":"Aditya Mittal, Ameet Talwalkar, Chris Donahue, Ryan Shar, Shyam Agarwal, Tongshuang Wu, Valerie Chen, Wayne Chi, Zichu Wu","cross_cats":["cs.CL"],"headline":"LLM judges underperform human annotators by 12-23% when predicting developer code preferences across realistic tasks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-03-25T17:56:55Z","title":"Comparing Developer and LLM Biases in Code Evaluation"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.24586","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T07:12:04.487848Z","id":"170e97b0-4ffd-4625-915d-4eb725db6ef3","model_set":{"reader":"grok-4.3"},"one_line_summary":"TRACE shows LLM judges underperform human annotators by 12-23% and misalign on 35 code quality dimensions across three coding modalities, with biases often matching existing software engineering criteria.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"LLM judges underperform human annotators by 12-23% when predicting developer code preferences across realistic tasks.","strongest_claim":"Among 13 different models, the best judges underperform human annotators by 12-23%. TRACE identifies 35 significant sources of misalignment between humans and judges across interaction modalities, the majority of which correspond to existing software engineering code quality criteria.","weakest_assumption":"Human preferences collected via annotation are treated as the ground truth without significant bias or inconsistency, and the automatic rubric extraction accurately isolates the sources of misalignment without introducing artifacts from the extraction process itself."}},"verdict_id":"170e97b0-4ffd-4625-915d-4eb725db6ef3"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:43494f6c8a4b7000fd03981bed5d03eb40b7d4d2762cdc6c7aacb8c63b9dd818","target":"record","created_at":"2026-05-17T23:38:59Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"efa8850c1cee582682bf8b6e383f1415d35e672a8619f1d30c73dc1d426fa080","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-03-25T17:56:55Z","title_canon_sha256":"a5ef48a112ad5f0e5bab1bb3887b3648d3139c5ac89cfd79c1119edae3fe1eab"},"schema_version":"1.0","source":{"id":"2603.24586","kind":"arxiv","version":2}},"canonical_sha256":"b65acfadc773606b3002911bd0a7bbcbeb6e4e65b1152da91982af76f5399148","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b65acfadc773606b3002911bd0a7bbcbeb6e4e65b1152da91982af76f5399148","first_computed_at":"2026-05-17T23:38:59.541346Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:59.541346Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"fylHGfgGL0F1qCxMzf2FdxPQFpI74Sm9KrUTq7IrP28VbiD01ola5MHg8YHpQ+tdn7JCsZkhhtx1q1Y5+A1hBg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:59.542116Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.24586","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:43494f6c8a4b7000fd03981bed5d03eb40b7d4d2762cdc6c7aacb8c63b9dd818","sha256:bf289fe80942efd87ef897e46dfff02494773499ebf47f4b47a37c15a937121b"],"state_sha256":"8feed6367f9cc92b5bfbb0889eb235c3e4af92e476b36faf1098770bb3524b75"}