{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:Q2XUZPOLM4F32YDX64DCHRK4EG","short_pith_number":"pith:Q2XUZPOL","schema_version":"1.0","canonical_sha256":"86af4cbdcb670bbd6077f70623c55c2190c2155bbf6044378f4e4a4788fa552b","source":{"kind":"arxiv","id":"2605.13801","version":1},"attestation_state":"computed","paper":{"title":"Improving Reproducibility in Evaluation through Multi-Level Annotator Modeling","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Multi-level bootstrapping models annotator variance to find the N and K needed for statistically significant evaluations.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Christopher M. Homan, Chris Welty, Deepak Pandita, Flip Korn","submitted_at":"2026-05-13T17:22:27Z","abstract_excerpt":"As generative AI models such as large language models (LLMs) become more pervasive, ensuring the safety, robustness, and overall trustworthiness of these systems is paramount. However, AI is currently facing a reproducibility crisis driven by unreliable evaluations and unrepeatable experimental results. While human raters are often used to assess models for utility and safety, they introduce divergent biases and subjective opinions into their annotations. Overcoming this variance is exceptionally challenging because very little data exists to study how experimental repeatability actually impro"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.13801","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T17:22:27Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a29bc2911f33f194c25dd30689f81a22a966c777e44598867deef27b86c869f5","abstract_canon_sha256":"164a9bef9af553411a717b73363bbad1d623c68a3be64100a6ded7d8d10528c4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:15.506608Z","signature_b64":"+D3An1y7xkmYrpR+SvY3NGrevhLda7pYYKc0RC8HOcw7062GCLLzoZm+YPQmB4HVubp2OZRgVqUlvQKeabwkDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"86af4cbdcb670bbd6077f70623c55c2190c2155bbf6044378f4e4a4788fa552b","last_reissued_at":"2026-05-18T02:44:15.506015Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:15.506015Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Improving Reproducibility in Evaluation through Multi-Level Annotator Modeling","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Multi-level bootstrapping models annotator variance to find the N and K needed for statistically significant evaluations.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Christopher M. Homan, Chris Welty, Deepak Pandita, Flip Korn","submitted_at":"2026-05-13T17:22:27Z","abstract_excerpt":"As generative AI models such as large language models (LLMs) become more pervasive, ensuring the safety, robustness, and overall trustworthiness of these systems is paramount. However, AI is currently facing a reproducibility crisis driven by unreliable evaluations and unrepeatable experimental results. While human raters are often used to assess models for utility and safety, they introduce divergent biases and subjective opinions into their annotations. Overcoming this variance is exceptionally challenging because very little data exists to study how experimental repeatability actually impro"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"we introduce a multi-level bootstrapping approach to realistically model annotator behavior. Leveraging datasets with a large number of ratings and persistent rater identifiers, we analyze the tradeoffs between the number of items (N) and the number of responses per item (K) required to achieve statistical significance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That datasets containing large numbers of ratings per item together with persistent rater identifiers are available, representative of typical evaluation settings, and that multi-level bootstrapping accurately captures real annotator variance without introducing new artifacts.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Multi-level bootstrapping models annotator variance using large rater-ID datasets to find optimal tradeoffs between number of items N and ratings per item K for statistically significant AI evaluations.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Multi-level bootstrapping models annotator variance to find the N and K needed for statistically significant evaluations.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"28769c3d507dcf35a4cdeb68064d5a37a776ea4704da8a5acbe8fb790985dc99"},"source":{"id":"2605.13801","kind":"arxiv","version":1},"verdict":{"id":"f07a563a-b5a8-4a58-af7d-b6eafe18899f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T19:27:32.733664Z","strongest_claim":"we introduce a multi-level bootstrapping approach to realistically model annotator behavior. Leveraging datasets with a large number of ratings and persistent rater identifiers, we analyze the tradeoffs between the number of items (N) and the number of responses per item (K) required to achieve statistical significance.","one_line_summary":"Multi-level bootstrapping models annotator variance using large rater-ID datasets to find optimal tradeoffs between number of items N and ratings per item K for statistically significant AI evaluations.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That datasets containing large numbers of ratings per item together with persistent rater identifiers are available, representative of typical evaluation settings, and that multi-level bootstrapping accurately captures real annotator variance without introducing new artifacts.","pith_extraction_headline":"Multi-level bootstrapping models annotator variance to find the N and K needed for statistically significant evaluations."},"references":{"count":31,"sample":[{"doi":"","year":2023,"title":"Reproducibility Checklist , 2023","work_id":"26257940-2fdb-4667-aabd-1b0413250d25","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"ACL Rolling Review , 2024","work_id":"9aeb7e45-88dd-4c6f-904a-023bc4a47929","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Dices dataset: Diversity in conversational ai evaluation for safety","work_id":"ab6bebdb-c282-4538-83a1-4a6a1fbcf56f","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1038/533452a","year":2016,"title":"2016, Nature, 533, 452, doi: 10.1038/533452a","work_id":"dd847cd4-6c96-4f41-8630-3e4fc3ef24c7","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Toward benchmarking group explanations: Evaluating the effect of aggregation strategies versus explanation","work_id":"c44e8f13-a78c-45e8-897d-07f86d0ceed3","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":31,"snapshot_sha256":"c5d058b2598e5e56392ff6414b2813e24a4864fbf5d68b188197451710dc8979","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.13801","created_at":"2026-05-18T02:44:15.506094+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.13801v1","created_at":"2026-05-18T02:44:15.506094+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13801","created_at":"2026-05-18T02:44:15.506094+00:00"},{"alias_kind":"pith_short_12","alias_value":"Q2XUZPOLM4F3","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"Q2XUZPOLM4F32YDX","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"Q2XUZPOL","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG","json":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG.json","graph_json":"https://pith.science/api/pith-number/Q2XUZPOLM4F32YDX64DCHRK4EG/graph.json","events_json":"https://pith.science/api/pith-number/Q2XUZPOLM4F32YDX64DCHRK4EG/events.json","paper":"https://pith.science/paper/Q2XUZPOL"},"agent_actions":{"view_html":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG","download_json":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG.json","view_paper":"https://pith.science/paper/Q2XUZPOL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.13801&json=true","fetch_graph":"https://pith.science/api/pith-number/Q2XUZPOLM4F32YDX64DCHRK4EG/graph.json","fetch_events":"https://pith.science/api/pith-number/Q2XUZPOLM4F32YDX64DCHRK4EG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG/action/storage_attestation","attest_author":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG/action/author_attestation","sign_citation":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG/action/citation_signature","submit_replication":"https://pith.science/pith/Q2XUZPOLM4F32YDX64DCHRK4EG/action/replication_record"}},"created_at":"2026-05-18T02:44:15.506094+00:00","updated_at":"2026-05-18T02:44:15.506094+00:00"}