{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:CHZ7NDL45TE2DVWPTOCNGAH53S","short_pith_number":"pith:CHZ7NDL4","schema_version":"1.0","canonical_sha256":"11f3f68d7cecc9a1d6cf9b84d300fddc845b68b61f9270bb00d407239d72d19e","source":{"kind":"arxiv","id":"2601.18777","version":1},"attestation_state":"computed","paper":{"title":"PRECISE: Reducing the Bias of LLM Evaluations Using Prediction-Powered Ranking Estimation","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.IR","stat.AP"],"primary_cat":"cs.LG","authors_text":"Abhishek Divekar, Anirban Majumder","submitted_at":"2026-01-26T18:46:49Z","abstract_excerpt":"Evaluating the quality of search, ranking and RAG systems traditionally requires a significant number of human relevance annotations. In recent times, several deployed systems have explored the usage of Large Language Models (LLMs) as automated judges for this task while their inherent biases prevent direct use for metric estimation. We present a statistical framework extending Prediction-Powered Inference (PPI) that combines minimal human annotations with LLM judgments to produce reliable estimates of metrics which require sub-instance annotations. Our method requires as few as 100 human-anno"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.18777","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-26T18:46:49Z","cross_cats_sorted":["cs.AI","cs.CL","cs.IR","stat.AP"],"title_canon_sha256":"77587d1573862b7e2ee58cd953a6f5138b40a51fb76405e496b7ab9c1d9c1b28","abstract_canon_sha256":"1af5914fe0b77f8c48e0354f9edf33714e9fe0b7b79dcba6a4f6c98f6a52651b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-04T01:09:41.132984Z","signature_b64":"eDYPJoBiOVYCFO7hxmgQI4vmHG/osqg40pSG/GUGBTnXPYn6ghE/jnFJUlORAnzYXKP+J0zWfQb4qml1RSRIDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"11f3f68d7cecc9a1d6cf9b84d300fddc845b68b61f9270bb00d407239d72d19e","last_reissued_at":"2026-06-04T01:09:41.132339Z","signature_status":"signed_v1","first_computed_at":"2026-06-04T01:09:41.132339Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"PRECISE: Reducing the Bias of LLM Evaluations Using Prediction-Powered Ranking Estimation","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.IR","stat.AP"],"primary_cat":"cs.LG","authors_text":"Abhishek Divekar, Anirban Majumder","submitted_at":"2026-01-26T18:46:49Z","abstract_excerpt":"Evaluating the quality of search, ranking and RAG systems traditionally requires a significant number of human relevance annotations. In recent times, several deployed systems have explored the usage of Large Language Models (LLMs) as automated judges for this task while their inherent biases prevent direct use for metric estimation. We present a statistical framework extending Prediction-Powered Inference (PPI) that combines minimal human annotations with LLM judgments to produce reliable estimates of metrics which require sub-instance annotations. Our method requires as few as 100 human-anno"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.18777","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.18777/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.18777","created_at":"2026-06-04T01:09:41.132420+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.18777v1","created_at":"2026-06-04T01:09:41.132420+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.18777","created_at":"2026-06-04T01:09:41.132420+00:00"},{"alias_kind":"pith_short_12","alias_value":"CHZ7NDL45TE2","created_at":"2026-06-04T01:09:41.132420+00:00"},{"alias_kind":"pith_short_16","alias_value":"CHZ7NDL45TE2DVWP","created_at":"2026-06-04T01:09:41.132420+00:00"},{"alias_kind":"pith_short_8","alias_value":"CHZ7NDL4","created_at":"2026-06-04T01:09:41.132420+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S","json":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S.json","graph_json":"https://pith.science/api/pith-number/CHZ7NDL45TE2DVWPTOCNGAH53S/graph.json","events_json":"https://pith.science/api/pith-number/CHZ7NDL45TE2DVWPTOCNGAH53S/events.json","paper":"https://pith.science/paper/CHZ7NDL4"},"agent_actions":{"view_html":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S","download_json":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S.json","view_paper":"https://pith.science/paper/CHZ7NDL4","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.18777&json=true","fetch_graph":"https://pith.science/api/pith-number/CHZ7NDL45TE2DVWPTOCNGAH53S/graph.json","fetch_events":"https://pith.science/api/pith-number/CHZ7NDL45TE2DVWPTOCNGAH53S/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S/action/storage_attestation","attest_author":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S/action/author_attestation","sign_citation":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S/action/citation_signature","submit_replication":"https://pith.science/pith/CHZ7NDL45TE2DVWPTOCNGAH53S/action/replication_record"}},"created_at":"2026-06-04T01:09:41.132420+00:00","updated_at":"2026-06-04T01:09:41.132420+00:00"}