{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:V3OHEIA55NAVA3SM5QMZOYYTQD","short_pith_number":"pith:V3OHEIA5","canonical_record":{"source":{"id":"2605.17829","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-18T04:03:18Z","cross_cats_sorted":[],"title_canon_sha256":"415195f0529d67471bf175146cdd7cd9a7552d489127bf9de8d1437c58af58c4","abstract_canon_sha256":"fd9fd738a6cc458fa2deb2a6065e8e0b7cb9898fa4df47aa45291b05fba6b684"},"schema_version":"1.0"},"canonical_sha256":"aedc72201deb41506e4cec1997631380d5e942fd4aa24d52a589d935413d02cc","source":{"kind":"arxiv","id":"2605.17829","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.17829","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"arxiv_version","alias_value":"2605.17829v1","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.17829","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"pith_short_12","alias_value":"V3OHEIA55NAV","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"pith_short_16","alias_value":"V3OHEIA55NAVA3SM","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"pith_short_8","alias_value":"V3OHEIA5","created_at":"2026-05-20T00:05:00Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:V3OHEIA55NAVA3SM5QMZOYYTQD","target":"record","payload":{"canonical_record":{"source":{"id":"2605.17829","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-18T04:03:18Z","cross_cats_sorted":[],"title_canon_sha256":"415195f0529d67471bf175146cdd7cd9a7552d489127bf9de8d1437c58af58c4","abstract_canon_sha256":"fd9fd738a6cc458fa2deb2a6065e8e0b7cb9898fa4df47aa45291b05fba6b684"},"schema_version":"1.0"},"canonical_sha256":"aedc72201deb41506e4cec1997631380d5e942fd4aa24d52a589d935413d02cc","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:05:00.564953Z","signature_b64":"qviZVW0k4A+i4NxKgE1FGYc+vlTnoRhUM3iMSR1vylVW5CX2Wk47rMSLDvwSXgjcqNTbI2VPRL0G5o1/OyiQDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"aedc72201deb41506e4cec1997631380d5e942fd4aa24d52a589d935413d02cc","last_reissued_at":"2026-05-20T00:05:00.564153Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:05:00.564153Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.17829","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:05:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YSdN3N4IvDQGW02TAnYqFwCHrBoPQCxn+gQInzpevx9JieTjJK0lhURI6X5n4NOeCl5Y+thkTHrAjJN8c36LBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-29T18:06:16.752679Z"},"content_sha256":"52877e47e4f4e3da9affc3746816cfda525b8a3d58087be46d4b1a5514989480","schema_version":"1.0","event_id":"sha256:52877e47e4f4e3da9affc3746816cfda525b8a3d58087be46d4b1a5514989480"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:V3OHEIA55NAVA3SM5QMZOYYTQD","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Interactive Evaluation Requires a Design Science","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Adrian Weller, Jiaxin Pei, Jiaxuan You, Keyang Xuan, Manling Li, Pan Lu, Peiyang Song, Pengrui Han, Wenkai Li, Wenyue Hua, Yizhong Wang, Zexue He, Zhenyu Zhang","submitted_at":"2026-05-18T04:03:18Z","abstract_excerpt":"AI evaluation is undergoing a structural change. Large language models (LLMs) are increasingly deployed as systems that act over time through tools, environments, users, and other agents, while many evaluation practices still inherit assumptions from response-centered benchmarks (e.g., fixed inputs, isolated outputs, and outcome judgments that can be made from a single response). The field has begun to build interactive benchmarks, but the resulting landscape is fragmented: benchmarks differ in what interaction artifacts they admit, how trajectories are scored, and what claims their results su"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.17829","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.17829/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:05:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VzvqbU5jQ/Qv89K/27aA+xcgTYE6Pj8w7o43w23eFtTrHqsBukByRhUxFmNLUOdUXaa3FxoGX23iYKyiA6eEAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-29T18:06:16.753426Z"},"content_sha256":"c678cd7e257127d917e00d7313075d9a86c8ad532ad46de23765e3bc42ed5050","schema_version":"1.0","event_id":"sha256:c678cd7e257127d917e00d7313075d9a86c8ad532ad46de23765e3bc42ed5050"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/V3OHEIA55NAVA3SM5QMZOYYTQD/bundle.json","state_url":"https://pith.science/pith/V3OHEIA55NAVA3SM5QMZOYYTQD/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/V3OHEIA55NAVA3SM5QMZOYYTQD/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-29T18:06:16Z","links":{"resolver":"https://pith.science/pith/V3OHEIA55NAVA3SM5QMZOYYTQD","bundle":"https://pith.science/pith/V3OHEIA55NAVA3SM5QMZOYYTQD/bundle.json","state":"https://pith.science/pith/V3OHEIA55NAVA3SM5QMZOYYTQD/state.json","well_known_bundle":"https://pith.science/.well-known/pith/V3OHEIA55NAVA3SM5QMZOYYTQD/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:V3OHEIA55NAVA3SM5QMZOYYTQD","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"fd9fd738a6cc458fa2deb2a6065e8e0b7cb9898fa4df47aa45291b05fba6b684","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-18T04:03:18Z","title_canon_sha256":"415195f0529d67471bf175146cdd7cd9a7552d489127bf9de8d1437c58af58c4"},"schema_version":"1.0","source":{"id":"2605.17829","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.17829","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"arxiv_version","alias_value":"2605.17829v1","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.17829","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"pith_short_12","alias_value":"V3OHEIA55NAV","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"pith_short_16","alias_value":"V3OHEIA55NAVA3SM","created_at":"2026-05-20T00:05:00Z"},{"alias_kind":"pith_short_8","alias_value":"V3OHEIA5","created_at":"2026-05-20T00:05:00Z"}],"graph_snapshots":[{"event_id":"sha256:c678cd7e257127d917e00d7313075d9a86c8ad532ad46de23765e3bc42ed5050","target":"graph","created_at":"2026-05-20T00:05:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.17829/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"AI evaluation is undergoing a structural change. Large language models (LLMs) are increasingly deployed as systems that act over time through tools, environments, users, and other agents, while many evaluation practices still inherit assumptions from response-centered benchmarks (e.g., fixed inputs, isolated outputs, and outcome judgments that can be made from a single response). The field has begun to build interactive benchmarks, but the resulting landscape is fragmented: benchmarks differ in what interaction artifacts they admit, how trajectories are scored, and what claims their results su","authors_text":"Adrian Weller, Jiaxin Pei, Jiaxuan You, Keyang Xuan, Manling Li, Pan Lu, Peiyang Song, Pengrui Han, Wenkai Li, Wenyue Hua, Yizhong Wang, Zexue He, Zhenyu Zhang","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-18T04:03:18Z","title":"Interactive Evaluation Requires a Design Science"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.17829","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:52877e47e4f4e3da9affc3746816cfda525b8a3d58087be46d4b1a5514989480","target":"record","created_at":"2026-05-20T00:05:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"fd9fd738a6cc458fa2deb2a6065e8e0b7cb9898fa4df47aa45291b05fba6b684","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-18T04:03:18Z","title_canon_sha256":"415195f0529d67471bf175146cdd7cd9a7552d489127bf9de8d1437c58af58c4"},"schema_version":"1.0","source":{"id":"2605.17829","kind":"arxiv","version":1}},"canonical_sha256":"aedc72201deb41506e4cec1997631380d5e942fd4aa24d52a589d935413d02cc","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"aedc72201deb41506e4cec1997631380d5e942fd4aa24d52a589d935413d02cc","first_computed_at":"2026-05-20T00:05:00.564153Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:05:00.564153Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"qviZVW0k4A+i4NxKgE1FGYc+vlTnoRhUM3iMSR1vylVW5CX2Wk47rMSLDvwSXgjcqNTbI2VPRL0G5o1/OyiQDA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:05:00.564953Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.17829","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:52877e47e4f4e3da9affc3746816cfda525b8a3d58087be46d4b1a5514989480","sha256:c678cd7e257127d917e00d7313075d9a86c8ad532ad46de23765e3bc42ed5050"],"state_sha256":"65df286c1d5e4f25864309d27677439b34249d02e7d885d02338f9932c85fa28"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"vI1zu3lnnvGmcby4/S2mgPDMf45maNJ2aq/2vdDvU6cyIGVD0jl7TPsnIAF6FPezbixpLphM8h7auK4gFfJiAA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-29T18:06:16.756415Z","bundle_sha256":"506054438c527d75d3c63f1219de53fb2e4fece084c0f859c1ae7818d354b88d"}}