{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:AWGGH2D4NG3B6I2RCZ5MDLU535","short_pith_number":"pith:AWGGH2D4","canonical_record":{"source":{"id":"2601.03630","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-07T06:19:26Z","cross_cats_sorted":[],"title_canon_sha256":"815ace6fd9ed995b8703dba2203cf04166011cfe3041c0dda4ee93706a84ff21","abstract_canon_sha256":"07229dbd403bf02fe1ca97dd890d85b09906bc427ad26ba8a4cb77eca547b26a"},"schema_version":"1.0"},"canonical_sha256":"058c63e87c69b61f2351167ac1ae9ddf6bd0db3118300fbbfbddc82c4a84b427","source":{"kind":"arxiv","id":"2601.03630","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.03630","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"arxiv_version","alias_value":"2601.03630v2","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.03630","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"pith_short_12","alias_value":"AWGGH2D4NG3B","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"AWGGH2D4NG3B6I2R","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"AWGGH2D4","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:AWGGH2D4NG3B6I2RCZ5MDLU535","target":"record","payload":{"canonical_record":{"source":{"id":"2601.03630","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-07T06:19:26Z","cross_cats_sorted":[],"title_canon_sha256":"815ace6fd9ed995b8703dba2203cf04166011cfe3041c0dda4ee93706a84ff21","abstract_canon_sha256":"07229dbd403bf02fe1ca97dd890d85b09906bc427ad26ba8a4cb77eca547b26a"},"schema_version":"1.0"},"canonical_sha256":"058c63e87c69b61f2351167ac1ae9ddf6bd0db3118300fbbfbddc82c4a84b427","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:16.710763Z","signature_b64":"/8IxzJUAPdmqADdP8ES0DoMZEU6P1KCI0ucPW4frRrjs+RiepkqW1Bav9Z6hBd5gzNaUzjArBoYQXpb0qd2kAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"058c63e87c69b61f2351167ac1ae9ddf6bd0db3118300fbbfbddc82c4a84b427","last_reissued_at":"2026-05-17T23:39:16.710192Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:16.710192Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2601.03630","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cFqFbc6+ZvlUbMarIEL6FvFTefZ1i7QgdKyoqAKZ3H6IhtmlpyfUTlLUvlJNmZb3XCm425Q6b0tb7lfS+oUPAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-19T19:37:33.261230Z"},"content_sha256":"cd5b1a35cccd49ee684e5c1ff9a5ef3a07e579d00917b02a2025438a7079ead1","schema_version":"1.0","event_id":"sha256:cd5b1a35cccd49ee684e5c1ff9a5ef3a07e579d00917b02a2025438a7079ead1"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:AWGGH2D4NG3B6I2RCZ5MDLU535","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Reasoning Model Is Superior LLM-Judge, Yet Suffers from Biases","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"Large reasoning models outperform standard LLMs as judges on accuracy and robustness but still carry strong evaluation biases that an explicit planning step can reduce.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Hui Huang, Muyun Yang, Xuanxin Wu, Yuki Arase","submitted_at":"2026-01-07T06:19:26Z","abstract_excerpt":"This paper presents the first systematic comparison investigating whether Large Reasoning Models (LRMs) are superior judges to non-reasoning LLMs. Our empirical analysis yields four key findings: 1) LRMs outperform non-reasoning LLMs in terms of judgment accuracy, particularly on reasoning-intensive tasks; 2) LRMs demonstrate superior evaluation instruction-following capabilities; 3) LRMs exhibit enhanced robustness against adversarial attacks targeting judgment tasks; 4) However, LRMs still exhibit strong evaluation biases. To mitigate this bias vulnerability, we propose PlanJudge, a lightwei"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"LRMs outperform non-reasoning LLMs in judgment accuracy, particularly on reasoning-intensive tasks, demonstrate superior instruction-following and robustness, yet still exhibit strong evaluation biases that PlanJudge mitigates while preserving accuracy.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the chosen tasks, adversarial attacks, and bias metrics comprehensively capture real-world judgment scenarios and that observed improvements generalize beyond the tested models and datasets.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Reasoning models judge better than non-reasoning LLMs yet retain biases; generating an evaluation plan first mitigates bias without losing accuracy.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Large reasoning models outperform standard LLMs as judges on accuracy and robustness but still carry strong evaluation biases that an explicit planning step can reduce.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3de6081e860dd93d181654d98c34111e51e81d105489df81c37d07e5720d9adf"},"source":{"id":"2601.03630","kind":"arxiv","version":2},"verdict":{"id":"d134c7ab-42ab-4aec-9c54-d29387a47ef1","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T17:14:40.009656Z","strongest_claim":"LRMs outperform non-reasoning LLMs in judgment accuracy, particularly on reasoning-intensive tasks, demonstrate superior instruction-following and robustness, yet still exhibit strong evaluation biases that PlanJudge mitigates while preserving accuracy.","one_line_summary":"Reasoning models judge better than non-reasoning LLMs yet retain biases; generating an evaluation plan first mitigates bias without losing accuracy.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the chosen tasks, adversarial attacks, and bias metrics comprehensively capture real-world judgment scenarios and that observed improvements generalize beyond the tested models and datasets.","pith_extraction_headline":"Large reasoning models outperform standard LLMs as judges on accuracy and robustness but still carry strong evaluation biases that an explicit planning step can reduce."},"references":{"count":12,"sample":[{"doi":"","year":2025,"title":"InFindings of the Association for Computational Linguistics: ACL 2025, pages 5880–5895","work_id":"61bb32c1-ce49-4dfa-a311-d0ba4eeb1709","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Zhang, Makesh Narsimhan Sreedhar, and Oleksii Kuchaiev","work_id":"a48d5ada-8623-467d-baf4-93bd47703121","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Are reasoning models more prone to hallucination?","work_id":"cbb24d0e-0a95-46cb-a43e-115b3c4115d7","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Planning: A detailed evaluation plan is specified based on the current evaluation task","work_id":"1630ead5-cdf4-4899-ae6d-c35802dd91ed","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"We investigate three distinct strategies for the first step of plan generation:","work_id":"903f762c-74ef-4a5e-a50c-0e26882caa22","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":12,"snapshot_sha256":"1860e2cff397bae7d9f78cbc8e77b854d9560dad92b597496023dcfbcf3fe2ac","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"d134c7ab-42ab-4aec-9c54-d29387a47ef1"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:16Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cG1YJivDUfKPg6B9b/loS6ZYXNU6cKd3nrMjzXCGBe1qbRi1Vy2YP5blTwrFN6kat60Euk+zw3TR3NVFMKdWCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-19T19:37:33.261891Z"},"content_sha256":"ca9b936a1c758ff6df6059601f7dfb8c4eed66a89a6a22b1e47a51e256f2872e","schema_version":"1.0","event_id":"sha256:ca9b936a1c758ff6df6059601f7dfb8c4eed66a89a6a22b1e47a51e256f2872e"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/AWGGH2D4NG3B6I2RCZ5MDLU535/bundle.json","state_url":"https://pith.science/pith/AWGGH2D4NG3B6I2RCZ5MDLU535/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/AWGGH2D4NG3B6I2RCZ5MDLU535/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-19T19:37:33Z","links":{"resolver":"https://pith.science/pith/AWGGH2D4NG3B6I2RCZ5MDLU535","bundle":"https://pith.science/pith/AWGGH2D4NG3B6I2RCZ5MDLU535/bundle.json","state":"https://pith.science/pith/AWGGH2D4NG3B6I2RCZ5MDLU535/state.json","well_known_bundle":"https://pith.science/.well-known/pith/AWGGH2D4NG3B6I2RCZ5MDLU535/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:AWGGH2D4NG3B6I2RCZ5MDLU535","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"07229dbd403bf02fe1ca97dd890d85b09906bc427ad26ba8a4cb77eca547b26a","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-07T06:19:26Z","title_canon_sha256":"815ace6fd9ed995b8703dba2203cf04166011cfe3041c0dda4ee93706a84ff21"},"schema_version":"1.0","source":{"id":"2601.03630","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2601.03630","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"arxiv_version","alias_value":"2601.03630v2","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.03630","created_at":"2026-05-17T23:39:16Z"},{"alias_kind":"pith_short_12","alias_value":"AWGGH2D4NG3B","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"AWGGH2D4NG3B6I2R","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"AWGGH2D4","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:ca9b936a1c758ff6df6059601f7dfb8c4eed66a89a6a22b1e47a51e256f2872e","target":"graph","created_at":"2026-05-17T23:39:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"LRMs outperform non-reasoning LLMs in judgment accuracy, particularly on reasoning-intensive tasks, demonstrate superior instruction-following and robustness, yet still exhibit strong evaluation biases that PlanJudge mitigates while preserving accuracy."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the chosen tasks, adversarial attacks, and bias metrics comprehensively capture real-world judgment scenarios and that observed improvements generalize beyond the tested models and datasets."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Reasoning models judge better than non-reasoning LLMs yet retain biases; generating an evaluation plan first mitigates bias without losing accuracy."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Large reasoning models outperform standard LLMs as judges on accuracy and robustness but still carry strong evaluation biases that an explicit planning step can reduce."}],"snapshot_sha256":"3de6081e860dd93d181654d98c34111e51e81d105489df81c37d07e5720d9adf"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"This paper presents the first systematic comparison investigating whether Large Reasoning Models (LRMs) are superior judges to non-reasoning LLMs. Our empirical analysis yields four key findings: 1) LRMs outperform non-reasoning LLMs in terms of judgment accuracy, particularly on reasoning-intensive tasks; 2) LRMs demonstrate superior evaluation instruction-following capabilities; 3) LRMs exhibit enhanced robustness against adversarial attacks targeting judgment tasks; 4) However, LRMs still exhibit strong evaluation biases. To mitigate this bias vulnerability, we propose PlanJudge, a lightwei","authors_text":"Hui Huang, Muyun Yang, Xuanxin Wu, Yuki Arase","cross_cats":[],"headline":"Large reasoning models outperform standard LLMs as judges on accuracy and robustness but still carry strong evaluation biases that an explicit planning step can reduce.","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-07T06:19:26Z","title":"Reasoning Model Is Superior LLM-Judge, Yet Suffers from Biases"},"references":{"count":12,"internal_anchors":0,"resolved_work":12,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"InFindings of the Association for Computational Linguistics: ACL 2025, pages 5880–5895","work_id":"61bb32c1-ce49-4dfa-a311-d0ba4eeb1709","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Zhang, Makesh Narsimhan Sreedhar, and Oleksii Kuchaiev","work_id":"a48d5ada-8623-467d-baf4-93bd47703121","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Are reasoning models more prone to hallucination?","work_id":"cbb24d0e-0a95-46cb-a43e-115b3c4115d7","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Planning: A detailed evaluation plan is specified based on the current evaluation task","work_id":"1630ead5-cdf4-4899-ae6d-c35802dd91ed","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"We investigate three distinct strategies for the first step of plan generation:","work_id":"903f762c-74ef-4a5e-a50c-0e26882caa22","year":null}],"snapshot_sha256":"1860e2cff397bae7d9f78cbc8e77b854d9560dad92b597496023dcfbcf3fe2ac"},"source":{"id":"2601.03630","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T17:14:40.009656Z","id":"d134c7ab-42ab-4aec-9c54-d29387a47ef1","model_set":{"reader":"grok-4.3"},"one_line_summary":"Reasoning models judge better than non-reasoning LLMs yet retain biases; generating an evaluation plan first mitigates bias without losing accuracy.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Large reasoning models outperform standard LLMs as judges on accuracy and robustness but still carry strong evaluation biases that an explicit planning step can reduce.","strongest_claim":"LRMs outperform non-reasoning LLMs in judgment accuracy, particularly on reasoning-intensive tasks, demonstrate superior instruction-following and robustness, yet still exhibit strong evaluation biases that PlanJudge mitigates while preserving accuracy.","weakest_assumption":"That the chosen tasks, adversarial attacks, and bias metrics comprehensively capture real-world judgment scenarios and that observed improvements generalize beyond the tested models and datasets."}},"verdict_id":"d134c7ab-42ab-4aec-9c54-d29387a47ef1"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:cd5b1a35cccd49ee684e5c1ff9a5ef3a07e579d00917b02a2025438a7079ead1","target":"record","created_at":"2026-05-17T23:39:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"07229dbd403bf02fe1ca97dd890d85b09906bc427ad26ba8a4cb77eca547b26a","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-01-07T06:19:26Z","title_canon_sha256":"815ace6fd9ed995b8703dba2203cf04166011cfe3041c0dda4ee93706a84ff21"},"schema_version":"1.0","source":{"id":"2601.03630","kind":"arxiv","version":2}},"canonical_sha256":"058c63e87c69b61f2351167ac1ae9ddf6bd0db3118300fbbfbddc82c4a84b427","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"058c63e87c69b61f2351167ac1ae9ddf6bd0db3118300fbbfbddc82c4a84b427","first_computed_at":"2026-05-17T23:39:16.710192Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:16.710192Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/8IxzJUAPdmqADdP8ES0DoMZEU6P1KCI0ucPW4frRrjs+RiepkqW1Bav9Z6hBd5gzNaUzjArBoYQXpb0qd2kAA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:16.710763Z","signed_message":"canonical_sha256_bytes"},"source_id":"2601.03630","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:cd5b1a35cccd49ee684e5c1ff9a5ef3a07e579d00917b02a2025438a7079ead1","sha256:ca9b936a1c758ff6df6059601f7dfb8c4eed66a89a6a22b1e47a51e256f2872e"],"state_sha256":"75264c1e820bf9403edb2855888bec98bd1e2d1d12d18e020b607b97d5c2df48"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"dTi8KZ0b2Q41BW3vC1go0TlREJNeeELDLC0HyUUIeG/Uu5M58ne1GgMwHVZV9BwnP1/r4bEpfMkkV1nmr2kEBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-19T19:37:33.264534Z","bundle_sha256":"1bff096972b72dfb01dc06312caa4e8df42de46497b761f2ea040997061d87f0"}}