{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:Q2XUZPOLM4F32YDX64DCHRK4EG","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"164a9bef9af553411a717b73363bbad1d623c68a3be64100a6ded7d8d10528c4","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T17:22:27Z","title_canon_sha256":"a29bc2911f33f194c25dd30689f81a22a966c777e44598867deef27b86c869f5"},"schema_version":"1.0","source":{"id":"2605.13801","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13801","created_at":"2026-05-18T02:44:15Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13801v1","created_at":"2026-05-18T02:44:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13801","created_at":"2026-05-18T02:44:15Z"},{"alias_kind":"pith_short_12","alias_value":"Q2XUZPOLM4F3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"Q2XUZPOLM4F32YDX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"Q2XUZPOL","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:5210d005fef7c7017e5f41a7f34fa247b9cf1b1982bc0ca07b4636f5f8071b67","target":"graph","created_at":"2026-05-18T02:44:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"we introduce a multi-level bootstrapping approach to realistically model annotator behavior. Leveraging datasets with a large number of ratings and persistent rater identifiers, we analyze the tradeoffs between the number of items (N) and the number of responses per item (K) required to achieve statistical significance."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That datasets containing large numbers of ratings per item together with persistent rater identifiers are available, representative of typical evaluation settings, and that multi-level bootstrapping accurately captures real annotator variance without introducing new artifacts."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Multi-level bootstrapping models annotator variance using large rater-ID datasets to find optimal tradeoffs between number of items N and ratings per item K for statistically significant AI evaluations."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Multi-level bootstrapping models annotator variance to find the N and K needed for statistically significant evaluations."}],"snapshot_sha256":"28769c3d507dcf35a4cdeb68064d5a37a776ea4704da8a5acbe8fb790985dc99"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"As generative AI models such as large language models (LLMs) become more pervasive, ensuring the safety, robustness, and overall trustworthiness of these systems is paramount. However, AI is currently facing a reproducibility crisis driven by unreliable evaluations and unrepeatable experimental results. While human raters are often used to assess models for utility and safety, they introduce divergent biases and subjective opinions into their annotations. Overcoming this variance is exceptionally challenging because very little data exists to study how experimental repeatability actually impro","authors_text":"Christopher M. Homan, Chris Welty, Deepak Pandita, Flip Korn","cross_cats":["cs.AI"],"headline":"Multi-level bootstrapping models annotator variance to find the N and K needed for statistically significant evaluations.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T17:22:27Z","title":"Improving Reproducibility in Evaluation through Multi-Level Annotator Modeling"},"references":{"count":31,"internal_anchors":0,"resolved_work":31,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Reproducibility Checklist , 2023","work_id":"26257940-2fdb-4667-aabd-1b0413250d25","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"ACL Rolling Review , 2024","work_id":"9aeb7e45-88dd-4c6f-904a-023bc4a47929","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Dices dataset: Diversity in conversational ai evaluation for safety","work_id":"ab6bebdb-c282-4538-83a1-4a6a1fbcf56f","year":2023},{"cited_arxiv_id":"","doi":"10.1038/533452a","is_internal_anchor":false,"ref_index":4,"title":"2016, Nature, 533, 452, doi: 10.1038/533452a","work_id":"dd847cd4-6c96-4f41-8630-3e4fc3ef24c7","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Toward benchmarking group explanations: Evaluating the effect of aggregation strategies versus explanation","work_id":"c44e8f13-a78c-45e8-897d-07f86d0ceed3","year":2021}],"snapshot_sha256":"c5d058b2598e5e56392ff6414b2813e24a4864fbf5d68b188197451710dc8979"},"source":{"id":"2605.13801","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T19:27:32.733664Z","id":"f07a563a-b5a8-4a58-af7d-b6eafe18899f","model_set":{"reader":"grok-4.3"},"one_line_summary":"Multi-level bootstrapping models annotator variance using large rater-ID datasets to find optimal tradeoffs between number of items N and ratings per item K for statistically significant AI evaluations.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Multi-level bootstrapping models annotator variance to find the N and K needed for statistically significant evaluations.","strongest_claim":"we introduce a multi-level bootstrapping approach to realistically model annotator behavior. Leveraging datasets with a large number of ratings and persistent rater identifiers, we analyze the tradeoffs between the number of items (N) and the number of responses per item (K) required to achieve statistical significance.","weakest_assumption":"That datasets containing large numbers of ratings per item together with persistent rater identifiers are available, representative of typical evaluation settings, and that multi-level bootstrapping accurately captures real annotator variance without introducing new artifacts."}},"verdict_id":"f07a563a-b5a8-4a58-af7d-b6eafe18899f"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:683d1d4559516a4b670fe59be8791ddab51a067a39980ce369d512227cf027a5","target":"record","created_at":"2026-05-18T02:44:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"164a9bef9af553411a717b73363bbad1d623c68a3be64100a6ded7d8d10528c4","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T17:22:27Z","title_canon_sha256":"a29bc2911f33f194c25dd30689f81a22a966c777e44598867deef27b86c869f5"},"schema_version":"1.0","source":{"id":"2605.13801","kind":"arxiv","version":1}},"canonical_sha256":"86af4cbdcb670bbd6077f70623c55c2190c2155bbf6044378f4e4a4788fa552b","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"86af4cbdcb670bbd6077f70623c55c2190c2155bbf6044378f4e4a4788fa552b","first_computed_at":"2026-05-18T02:44:15.506015Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:15.506015Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"+D3An1y7xkmYrpR+SvY3NGrevhLda7pYYKc0RC8HOcw7062GCLLzoZm+YPQmB4HVubp2OZRgVqUlvQKeabwkDg==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:15.506608Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13801","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:683d1d4559516a4b670fe59be8791ddab51a067a39980ce369d512227cf027a5","sha256:5210d005fef7c7017e5f41a7f34fa247b9cf1b1982bc0ca07b4636f5f8071b67"],"state_sha256":"b5d349230fa7f6c0a4bc427542c31ae0483e1656cb2ab5efb273558034dd96f4"}