{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:Q6LPEAMDYTVMDFIZ5LF267CSMG","short_pith_number":"pith:Q6LPEAMD","schema_version":"1.0","canonical_sha256":"8796f20183c4eac19519eacbaf7c5261ab3dc8dba448bc1bd90f5875424e434a","source":{"kind":"arxiv","id":"2404.13076","version":1},"attestation_state":"computed","paper":{"title":"LLM Evaluators Recognize and Favor Their Own Generations","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Arjun Panickssery, Samuel R. Bowman, Shi Feng","submitted_at":"2024-04-15T16:49:59Z","abstract_excerpt":"Self-evaluation using large language models (LLMs) has proven valuable not only in benchmarking but also methods like reward modeling, constitutional AI, and self-refinement. But new biases are introduced due to the same LLM acting as both the evaluator and the evaluatee. One such bias is self-preference, where an LLM evaluator scores its own outputs higher than others' while human annotators consider them of equal quality. But do LLMs actually recognize their own outputs when they give those texts higher scores, or is it just a coincidence? In this paper, we investigate if self-recognition ca"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2404.13076","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-04-15T16:49:59Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"515f95f1cdfd1496b2c014e6f5952537a265a6235804636bfa7ef96562515622","abstract_canon_sha256":"ffebaa14a97d3d1ef8a2cfc5eaa551b244bacb5b400e5cd720bb46621ebe2b36"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T18:40:04.775271Z","signature_b64":"hLruYd3j2QUvJmpNHj6+4cGzaktePUaSBVzlCgN9g7W1eKO183SDqUqgL3vyTIdYFpzr0pYcKMw1VR2pWtxPCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8796f20183c4eac19519eacbaf7c5261ab3dc8dba448bc1bd90f5875424e434a","last_reissued_at":"2026-05-22T18:40:04.772304Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T18:40:04.772304Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LLM Evaluators Recognize and Favor Their Own Generations","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Arjun Panickssery, Samuel R. Bowman, Shi Feng","submitted_at":"2024-04-15T16:49:59Z","abstract_excerpt":"Self-evaluation using large language models (LLMs) has proven valuable not only in benchmarking but also methods like reward modeling, constitutional AI, and self-refinement. But new biases are introduced due to the same LLM acting as both the evaluator and the evaluatee. One such bias is self-preference, where an LLM evaluator scores its own outputs higher than others' while human annotators consider them of equal quality. But do LLMs actually recognize their own outputs when they give those texts higher scores, or is it just a coincidence? In this paper, we investigate if self-recognition ca"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2404.13076","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2404.13076/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2404.13076","created_at":"2026-05-22T18:40:04.772399+00:00"},{"alias_kind":"arxiv_version","alias_value":"2404.13076v1","created_at":"2026-05-22T18:40:04.772399+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.13076","created_at":"2026-05-22T18:40:04.772399+00:00"},{"alias_kind":"pith_short_12","alias_value":"Q6LPEAMDYTVM","created_at":"2026-05-22T18:40:04.772399+00:00"},{"alias_kind":"pith_short_16","alias_value":"Q6LPEAMDYTVMDFIZ","created_at":"2026-05-22T18:40:04.772399+00:00"},{"alias_kind":"pith_short_8","alias_value":"Q6LPEAMD","created_at":"2026-05-22T18:40:04.772399+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":25,"internal_anchor_count":25,"sample":[{"citing_arxiv_id":"2501.09775","citing_title":"Multiple Choice Questions: Reasoning Makes Large Language Models (LLMs) More Self-Confident, Especially When They are Wrong","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2504.14044","citing_title":"Multi-Stage Retrieval for Operational Technology Cybersecurity Compliance Using Large Language Models: A Railway Casestudy","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22502","citing_title":"Compiling Agentic Workflows into LLM Weights: Near-Frontier Quality at Two Orders of Magnitude Less Cost","ref_index":70,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22714","citing_title":"AMEL: Accumulated Message Effects on LLM Judgments","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20312","citing_title":"Pramana: A Protocol-Layer Treatment of Claim Verification in Autonomous Agent Networks","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19099","citing_title":"DecisionBench: A Benchmark for Emergent Delegation in Long-Horizon Agentic Workflows","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19220","citing_title":"Position: Uncertainty Quantification in LLMs is Just Unsupervised Clustering","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19529","citing_title":"Generative-Evaluative Agreement: A Necessary Validity Criterion for LLM-Enabled Adaptive Assessment","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10810","citing_title":"Likelihood scoring for continuations of mathematical text: a self-supervised benchmark with tests for shortcut vulnerabilities","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2511.01490","citing_title":"Synthetic Eggs in Many Baskets: The Impact of Synthetic Data Diversity on LLM Fine-Tuning","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16310","citing_title":"RAG-DIVE: A Dynamic Approach for Multi-Turn Dialogue Evaluation in Retrieval-Augmented Generation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2410.21819","citing_title":"Self-Preference Bias in LLM-as-a-Judge","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14517","citing_title":"Dimension-Level Intent Fidelity Evaluation for Large Language Models: Evidence from Structured Prompt Ablation","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06233","citing_title":"Blind Refusal: Language Models Refuse to Help Users Evade Unjust, Absurd, and Illegitimate Rules","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04539","citing_title":"RLearner-LLM: Balancing Logical Grounding and Fluency in Large Language Models via Hybrid Direct Preference Optimization","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04539","citing_title":"RLearner-LLM: Balancing Logical Grounding and Fluency in Large Language Models via Hybrid Direct Preference Optimization","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2604.25213","citing_title":"When the Forger Is the Judge: GPT-Image-2 Cannot Recognize Its Own Faked Documents","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2412.05579","citing_title":"LLMs-as-Judges: A Comprehensive Survey on LLM-based Evaluation Methods","ref_index":177,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24544","citing_title":"STELLAR-E: a Synthetic, Tailored, End-to-end LLM Application Rigorous Evaluator","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24346","citing_title":"SycoPhantasy: Quantifying Sycophancy and Hallucination in Small Open Weight VLMs for Vision-Language Scoring of Fantasy Characters","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23178","citing_title":"Judging the Judges: A Systematic Evaluation of Bias Mitigation Strategies in LLM-as-a-Judge Pipelines","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06390","citing_title":"Automated alignment is harder than you think","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04539","citing_title":"RLearner-LLM: Balancing Logical Grounding and Fluency in Large Language Models via Hybrid Direct Preference Optimization","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01311","citing_title":"The Partial Testimony of Logs: Evaluation of Language Model Generation under Confounded Model Choice","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06996","citing_title":"Self-Preference Bias in Rubric-Based Evaluation of Large Language Models","ref_index":11,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG","json":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG.json","graph_json":"https://pith.science/api/pith-number/Q6LPEAMDYTVMDFIZ5LF267CSMG/graph.json","events_json":"https://pith.science/api/pith-number/Q6LPEAMDYTVMDFIZ5LF267CSMG/events.json","paper":"https://pith.science/paper/Q6LPEAMD"},"agent_actions":{"view_html":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG","download_json":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG.json","view_paper":"https://pith.science/paper/Q6LPEAMD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2404.13076&json=true","fetch_graph":"https://pith.science/api/pith-number/Q6LPEAMDYTVMDFIZ5LF267CSMG/graph.json","fetch_events":"https://pith.science/api/pith-number/Q6LPEAMDYTVMDFIZ5LF267CSMG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG/action/storage_attestation","attest_author":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG/action/author_attestation","sign_citation":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG/action/citation_signature","submit_replication":"https://pith.science/pith/Q6LPEAMDYTVMDFIZ5LF267CSMG/action/replication_record"}},"created_at":"2026-05-22T18:40:04.772399+00:00","updated_at":"2026-05-22T18:40:04.772399+00:00"}