{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:QIAYVDZQMYBWLTOL4RQNEUEWRW","short_pith_number":"pith:QIAYVDZQ","canonical_record":{"source":{"id":"2605.20520","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-19T21:42:32Z","cross_cats_sorted":[],"title_canon_sha256":"3e2bb5b85f5e86ddd7423f38c6ed5c7f7b46d2a62840c2bd0b150c9c6181cdb3","abstract_canon_sha256":"12d078cc3652f496c36b2ce6f963599fe8e583753f8a6a9c0c87492c9476387a"},"schema_version":"1.0"},"canonical_sha256":"82018a8f30660365cdcbe460d250968daf955b61714ae1bcf3207b61bb263f49","source":{"kind":"arxiv","id":"2605.20520","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.20520","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"arxiv_version","alias_value":"2605.20520v1","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.20520","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"pith_short_12","alias_value":"QIAYVDZQMYBW","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"pith_short_16","alias_value":"QIAYVDZQMYBWLTOL","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"pith_short_8","alias_value":"QIAYVDZQ","created_at":"2026-05-21T01:04:40Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:QIAYVDZQMYBWLTOL4RQNEUEWRW","target":"record","payload":{"canonical_record":{"source":{"id":"2605.20520","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-19T21:42:32Z","cross_cats_sorted":[],"title_canon_sha256":"3e2bb5b85f5e86ddd7423f38c6ed5c7f7b46d2a62840c2bd0b150c9c6181cdb3","abstract_canon_sha256":"12d078cc3652f496c36b2ce6f963599fe8e583753f8a6a9c0c87492c9476387a"},"schema_version":"1.0"},"canonical_sha256":"82018a8f30660365cdcbe460d250968daf955b61714ae1bcf3207b61bb263f49","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T01:04:40.814704Z","signature_b64":"CYOe8wBkvG2fHOLYwnFjxjGjWKu2hm5U95B42BLN1C9TMnkF75Vz6mNPMFGnWUahW2mbP78hMz2eVLDVmAhVAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"82018a8f30660365cdcbe460d250968daf955b61714ae1bcf3207b61bb263f49","last_reissued_at":"2026-05-21T01:04:40.814265Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T01:04:40.814265Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.20520","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-21T01:04:40Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tQaULt30lghE50fP8w887LZODV8L99d/q06a0AlndP39bQO8EgZwExFMWVwtYHfSiDJ8ByY2PL909n4GvKAmDw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T18:14:43.890653Z"},"content_sha256":"873ebb979516959953dc75da4f5c2a943bf58977e3e0bb7cbc884387b85227d4","schema_version":"1.0","event_id":"sha256:873ebb979516959953dc75da4f5c2a943bf58977e3e0bb7cbc884387b85227d4"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:QIAYVDZQMYBWLTOL4RQNEUEWRW","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Open-World Evaluations for Measuring Frontier AI Capabilities","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Andrew B. Hall, Andrew Schwartz, Arvind Narayanan, Cozmin Ududec, Dimitris Papailiopoulos, Gillian K Hadfield, Harry Coppock, Helen Toner, J.J. Allaire, Magda Dubois, Peter Kirgis, Rishi Bommasani, Sara Hooker, Sayash Kapoor, Seth Lazar, Shoshannah Tekofsky, Stephan Rabanser, Steve Newman","submitted_at":"2026-05-19T21:42:32Z","abstract_excerpt":"Benchmark-based evaluation remains important for tracking frontier AI progress. But it can both overstate and understate deployed capability because it privileges tasks that can be precisely specified, automatically graded, easy to optimize for, and run with low budgets and short time horizons. We advocate for a complementary class of evaluations, which we term open-world evaluations: long-horizon, messy, real-world tasks assessed through small-sample qualitative analysis rather than benchmark-scale automation. In this paper we survey recent open-world evaluations, identify their strengths and"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.20520","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.20520/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-21T01:04:40Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"rMGRjEwbQ7Y8HTuASvv8mNpjCkR+C3xzqMnDVfEtjyAHLxtdVgZDIlNJZFE/94MEXgHLaADOq7iMpfv56ImkCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-24T18:14:43.891416Z"},"content_sha256":"46ce094e29b396756e23d27f9910e6fca2c26b0b20a42df456574b58f731878e","schema_version":"1.0","event_id":"sha256:46ce094e29b396756e23d27f9910e6fca2c26b0b20a42df456574b58f731878e"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/QIAYVDZQMYBWLTOL4RQNEUEWRW/bundle.json","state_url":"https://pith.science/pith/QIAYVDZQMYBWLTOL4RQNEUEWRW/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/QIAYVDZQMYBWLTOL4RQNEUEWRW/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-24T18:14:43Z","links":{"resolver":"https://pith.science/pith/QIAYVDZQMYBWLTOL4RQNEUEWRW","bundle":"https://pith.science/pith/QIAYVDZQMYBWLTOL4RQNEUEWRW/bundle.json","state":"https://pith.science/pith/QIAYVDZQMYBWLTOL4RQNEUEWRW/state.json","well_known_bundle":"https://pith.science/.well-known/pith/QIAYVDZQMYBWLTOL4RQNEUEWRW/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:QIAYVDZQMYBWLTOL4RQNEUEWRW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"12d078cc3652f496c36b2ce6f963599fe8e583753f8a6a9c0c87492c9476387a","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-19T21:42:32Z","title_canon_sha256":"3e2bb5b85f5e86ddd7423f38c6ed5c7f7b46d2a62840c2bd0b150c9c6181cdb3"},"schema_version":"1.0","source":{"id":"2605.20520","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.20520","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"arxiv_version","alias_value":"2605.20520v1","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.20520","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"pith_short_12","alias_value":"QIAYVDZQMYBW","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"pith_short_16","alias_value":"QIAYVDZQMYBWLTOL","created_at":"2026-05-21T01:04:40Z"},{"alias_kind":"pith_short_8","alias_value":"QIAYVDZQ","created_at":"2026-05-21T01:04:40Z"}],"graph_snapshots":[{"event_id":"sha256:46ce094e29b396756e23d27f9910e6fca2c26b0b20a42df456574b58f731878e","target":"graph","created_at":"2026-05-21T01:04:40Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.20520/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Benchmark-based evaluation remains important for tracking frontier AI progress. But it can both overstate and understate deployed capability because it privileges tasks that can be precisely specified, automatically graded, easy to optimize for, and run with low budgets and short time horizons. We advocate for a complementary class of evaluations, which we term open-world evaluations: long-horizon, messy, real-world tasks assessed through small-sample qualitative analysis rather than benchmark-scale automation. In this paper we survey recent open-world evaluations, identify their strengths and","authors_text":"Andrew B. Hall, Andrew Schwartz, Arvind Narayanan, Cozmin Ududec, Dimitris Papailiopoulos, Gillian K Hadfield, Harry Coppock, Helen Toner, J.J. Allaire, Magda Dubois, Peter Kirgis, Rishi Bommasani, Sara Hooker, Sayash Kapoor, Seth Lazar, Shoshannah Tekofsky, Stephan Rabanser, Steve Newman","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-19T21:42:32Z","title":"Open-World Evaluations for Measuring Frontier AI Capabilities"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.20520","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:873ebb979516959953dc75da4f5c2a943bf58977e3e0bb7cbc884387b85227d4","target":"record","created_at":"2026-05-21T01:04:40Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"12d078cc3652f496c36b2ce6f963599fe8e583753f8a6a9c0c87492c9476387a","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-19T21:42:32Z","title_canon_sha256":"3e2bb5b85f5e86ddd7423f38c6ed5c7f7b46d2a62840c2bd0b150c9c6181cdb3"},"schema_version":"1.0","source":{"id":"2605.20520","kind":"arxiv","version":1}},"canonical_sha256":"82018a8f30660365cdcbe460d250968daf955b61714ae1bcf3207b61bb263f49","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"82018a8f30660365cdcbe460d250968daf955b61714ae1bcf3207b61bb263f49","first_computed_at":"2026-05-21T01:04:40.814265Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-21T01:04:40.814265Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"CYOe8wBkvG2fHOLYwnFjxjGjWKu2hm5U95B42BLN1C9TMnkF75Vz6mNPMFGnWUahW2mbP78hMz2eVLDVmAhVAw==","signature_status":"signed_v1","signed_at":"2026-05-21T01:04:40.814704Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.20520","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:873ebb979516959953dc75da4f5c2a943bf58977e3e0bb7cbc884387b85227d4","sha256:46ce094e29b396756e23d27f9910e6fca2c26b0b20a42df456574b58f731878e"],"state_sha256":"d51bbf1d986c5e319d68e4dbc78002f422ddfea3824ef4fa6dff3829eb7daeca"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kqstvT8SBU1r8ylLEhICufPGz2rsNKC872YgjckDiEjRmB6EZr0cwAbbmepB+Fze//Jzhmn53xsw4EHXcnQpAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-24T18:14:43.895682Z","bundle_sha256":"83c8e4815be7ec0a92062832ed8481df9b7542641670ab5f4777f18beb90cc6b"}}