{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:L2KZ6GFWAF2IBCT3ZJGMTHYDRF","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"a356867732d5cd804f47160ad9c6aef81f170696de37075918f5dfdbe0367e1c","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T15:48:16Z","title_canon_sha256":"c4779848ed5081b59722c936a564a648a6a165e56430d251b11740f808565f23"},"schema_version":"1.0","source":{"id":"2605.13695","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13695","created_at":"2026-05-18T02:44:16Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13695v1","created_at":"2026-05-18T02:44:16Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13695","created_at":"2026-05-18T02:44:16Z"},{"alias_kind":"pith_short_12","alias_value":"L2KZ6GFWAF2I","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"L2KZ6GFWAF2IBCT3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"L2KZ6GFW","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:3e8390366f800cc3d1ef0aa906f11124bb4a81fa231257633e4ee382c310c2bb","target":"graph","created_at":"2026-05-18T02:44:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"On JudgeBench-GPT (350 hard pairwise items), Claude 3.7 Sonnet's pairwise accuracy climbs from 64.6% (single-shot vanilla prompt) to 78.6% (RTLC critique-of-10) -- an absolute 14.0-percentage-point gain."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the reported accuracy gains are driven by the specific RTLC stages rather than increased token budget, model-specific behavior, or benchmark idiosyncrasies, and that the high-level stage descriptions translate to reproducible prompts."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"RTLC prompting lifts Claude 3.7 Sonnet pairwise accuracy on 350 hard JudgeBench items from 64.6% to 78.6% via a Research-Teach-Critique scaffold that beats self-consistency."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A three-stage prompting method lifts LLM judge accuracy from 65% to 79% on hard pairwise comparisons."}],"snapshot_sha256":"1f8360bf3e1fdcfe4c0292cb93aba3641979ef47104dde83b5e6e6c9d4137993"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"LLM-as-a-judge is now the default measurement instrument for open-ended generation, but on the public JudgeBench benchmark even strong instruction-tuned judges barely scrape past random on objective-correctness pairwise items. We introduce RTLC, a three-stage prompting recipe -- Research, Teach-to-Learn, Critique -- that promotes a single black-box LLM into an ensemble-of-thought judge with no fine-tuning, retrieval, or external tools. Stage 1 wraps the input in a fixed pedagogical scaffold porting the Feynman Learning Technique (study $\\to$ teach $\\to$ find gaps $\\to$ simplify) into LLM promp","authors_text":"Andrea Morandi","cross_cats":["cs.AI"],"headline":"A three-stage prompting method lifts LLM judge accuracy from 65% to 79% on hard pairwise comparisons.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T15:48:16Z","title":"RTLC -- Research, Teach-to-Learn, Critique: A three-stage prompting paradigm inspired by the Feynman Learning Technique that lifts LLM-as-judge accuracy on JudgeBench with no fine-tuning"},"references":{"count":10,"internal_anchors":5,"resolved_work":10,"sample":[{"cited_arxiv_id":"2410.12784","doi":"","is_internal_anchor":true,"ref_index":1,"title":"JudgeBench: A Benchmark for Evaluating LLM-based Judges","work_id":"7255b223-8380-468c-9951-e1617432eb73","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Two ways to de-bias an LLM-as-a-Judge: A continuous- score comparison of hierarchical Bayesian calibration and Neural-ODE score transport,","work_id":"ff0ca1ec-586f-4a0a-93f4-85f3b089f968","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena","work_id":"cccce1f9-7736-4f3d-8edd-e8144f4dd4b0","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"UltraFeedback: Boosting language models with scaled AI feedback,","work_id":"f52ce90f-710f-46f7-b2e5-bf897147b498","year":2024},{"cited_arxiv_id":"2203.11171","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Self-Consistency Improves Chain of Thought Reasoning in Language Models","work_id":"8c6d5a6b-b5cc-4105-9c84-9c34bb9375bb","year":2023}],"snapshot_sha256":"8ceda67c5036921e5336dc75d7d24f02f81f7056a3a9087c77cdb44db396f195"},"source":{"id":"2605.13695","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T19:33:16.648397Z","id":"e822b72b-d1c5-4474-8f83-a3888ecdfb48","model_set":{"reader":"grok-4.3"},"one_line_summary":"RTLC prompting lifts Claude 3.7 Sonnet pairwise accuracy on 350 hard JudgeBench items from 64.6% to 78.6% via a Research-Teach-Critique scaffold that beats self-consistency.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A three-stage prompting method lifts LLM judge accuracy from 65% to 79% on hard pairwise comparisons.","strongest_claim":"On JudgeBench-GPT (350 hard pairwise items), Claude 3.7 Sonnet's pairwise accuracy climbs from 64.6% (single-shot vanilla prompt) to 78.6% (RTLC critique-of-10) -- an absolute 14.0-percentage-point gain.","weakest_assumption":"That the reported accuracy gains are driven by the specific RTLC stages rather than increased token budget, model-specific behavior, or benchmark idiosyncrasies, and that the high-level stage descriptions translate to reproducible prompts."}},"verdict_id":"e822b72b-d1c5-4474-8f83-a3888ecdfb48"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:7b5cba4a6c10319b751d2c17084e43cd6372576a7a8a869ea0df1171b1feebd3","target":"record","created_at":"2026-05-18T02:44:16Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"a356867732d5cd804f47160ad9c6aef81f170696de37075918f5dfdbe0367e1c","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T15:48:16Z","title_canon_sha256":"c4779848ed5081b59722c936a564a648a6a165e56430d251b11740f808565f23"},"schema_version":"1.0","source":{"id":"2605.13695","kind":"arxiv","version":1}},"canonical_sha256":"5e959f18b60174808a7bca4cc99f038947fd79f6f4dd028bd38b851e3f8fb831","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"5e959f18b60174808a7bca4cc99f038947fd79f6f4dd028bd38b851e3f8fb831","first_computed_at":"2026-05-18T02:44:16.910577Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:16.910577Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"JPY+CBof+cylYrJR8p8cexUxvCAf7uXugPhXOnmw2SBr7mHNtjHlWp+BrTZ0MKAtozSxns3LJKZOVQJSanZcCA==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:16.911085Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13695","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:7b5cba4a6c10319b751d2c17084e43cd6372576a7a8a869ea0df1171b1feebd3","sha256:3e8390366f800cc3d1ef0aa906f11124bb4a81fa231257633e4ee382c310c2bb"],"state_sha256":"19dd287097f857d2f1e111d511d8f0a04f73dfbad920581ed7aaa04669317b1d"}