{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:KPSV2JNCBFXE5FIFZ654DOPORJ","short_pith_number":"pith:KPSV2JNC","canonical_record":{"source":{"id":"2605.24213","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-22T20:54:30Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"53dbe4c5143f4a4af9cfadfb559c2cf9cb863f83fc13bd62fdbbee5ea564e7df","abstract_canon_sha256":"fb95cbae53ec74fefa82173ea22d3554b0134974b780e58e76a235059a8ae229"},"schema_version":"1.0"},"canonical_sha256":"53e55d25a2096e4e9505cfbbc1b9ee8a661286a10a00b029ce84d7fe2bb5e0db","source":{"kind":"arxiv","id":"2605.24213","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.24213","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"arxiv_version","alias_value":"2605.24213v1","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.24213","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"pith_short_12","alias_value":"KPSV2JNCBFXE","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"pith_short_16","alias_value":"KPSV2JNCBFXE5FIF","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"pith_short_8","alias_value":"KPSV2JNC","created_at":"2026-05-26T01:02:52Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:KPSV2JNCBFXE5FIFZ654DOPORJ","target":"record","payload":{"canonical_record":{"source":{"id":"2605.24213","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-22T20:54:30Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"53dbe4c5143f4a4af9cfadfb559c2cf9cb863f83fc13bd62fdbbee5ea564e7df","abstract_canon_sha256":"fb95cbae53ec74fefa82173ea22d3554b0134974b780e58e76a235059a8ae229"},"schema_version":"1.0"},"canonical_sha256":"53e55d25a2096e4e9505cfbbc1b9ee8a661286a10a00b029ce84d7fe2bb5e0db","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T01:02:52.749639Z","signature_b64":"j9zyeRTmp3AOL6QoexwnU2YsYJnf4W92ipFkS0VL5+VIxAXn6Bt2V16OFjQUsVnrgekSZCp8w2aYaiZcZQYLBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"53e55d25a2096e4e9505cfbbc1b9ee8a661286a10a00b029ce84d7fe2bb5e0db","last_reissued_at":"2026-05-26T01:02:52.748954Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T01:02:52.748954Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.24213","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T01:02:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"bjdeox/ddNMnLQo/v7xS8tTMmzuhcQSXYdgB62hQrTeRTA+n3R/CEf27Flw2c8xRoDJYrvMFOqgPch8eRbEACg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T01:44:40.394289Z"},"content_sha256":"547172bd92e28b56ed8cdeb06857d4d0ab7c1d00c312ce4d3ada29b91da2ab1c","schema_version":"1.0","event_id":"sha256:547172bd92e28b56ed8cdeb06857d4d0ab7c1d00c312ce4d3ada29b91da2ab1c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:KPSV2JNCBFXE5FIFZ654DOPORJ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Towards Evaluation Engineering: An Empirical Study of ML Evaluation Harnesses in the Wild","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.SE","authors_text":"Abdul Ali Bangash, Ahmed E. Hassan, Bram Adams, Zehao Wang, Zhimin Zhao","submitted_at":"2026-05-22T20:54:30Z","abstract_excerpt":"Evaluation harnesses are software systems that orchestrate model evaluation by managing model invocation, data loading, metric computation, and result reporting. Despite their critical role in machine learning infrastructure, their operational challenges and engineering concerns have received limited attention so far. We present an empirical study of 57 evaluation harnesses, deriving a five-stage harness model and classifying 16,560 issues by workflow stage and root cause. Most harness operational challenges concentrate in the Specification stage (41.4% of issues), where harnesses integrate ex"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.24213","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.24213/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T01:02:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xkfvpXlDT2UVexVd7WJuQ9DmnEanL3qQobgQe+yLAqCdF//3PDzKkp25N8is47wm2O8hKR97S80iniu7V3UMDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T01:44:40.394693Z"},"content_sha256":"ef298505d9d61e57b925eb42aa7e58fc1b6508ed8f75ea806719551446f077e5","schema_version":"1.0","event_id":"sha256:ef298505d9d61e57b925eb42aa7e58fc1b6508ed8f75ea806719551446f077e5"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/bundle.json","state_url":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T01:44:40Z","links":{"resolver":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ","bundle":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/bundle.json","state":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:KPSV2JNCBFXE5FIFZ654DOPORJ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"fb95cbae53ec74fefa82173ea22d3554b0134974b780e58e76a235059a8ae229","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-22T20:54:30Z","title_canon_sha256":"53dbe4c5143f4a4af9cfadfb559c2cf9cb863f83fc13bd62fdbbee5ea564e7df"},"schema_version":"1.0","source":{"id":"2605.24213","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.24213","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"arxiv_version","alias_value":"2605.24213v1","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.24213","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"pith_short_12","alias_value":"KPSV2JNCBFXE","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"pith_short_16","alias_value":"KPSV2JNCBFXE5FIF","created_at":"2026-05-26T01:02:52Z"},{"alias_kind":"pith_short_8","alias_value":"KPSV2JNC","created_at":"2026-05-26T01:02:52Z"}],"graph_snapshots":[{"event_id":"sha256:ef298505d9d61e57b925eb42aa7e58fc1b6508ed8f75ea806719551446f077e5","target":"graph","created_at":"2026-05-26T01:02:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.24213/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Evaluation harnesses are software systems that orchestrate model evaluation by managing model invocation, data loading, metric computation, and result reporting. Despite their critical role in machine learning infrastructure, their operational challenges and engineering concerns have received limited attention so far. We present an empirical study of 57 evaluation harnesses, deriving a five-stage harness model and classifying 16,560 issues by workflow stage and root cause. Most harness operational challenges concentrate in the Specification stage (41.4% of issues), where harnesses integrate ex","authors_text":"Abdul Ali Bangash, Ahmed E. Hassan, Bram Adams, Zehao Wang, Zhimin Zhao","cross_cats":["cs.AI","cs.LG"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-22T20:54:30Z","title":"Towards Evaluation Engineering: An Empirical Study of ML Evaluation Harnesses in the Wild"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.24213","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:547172bd92e28b56ed8cdeb06857d4d0ab7c1d00c312ce4d3ada29b91da2ab1c","target":"record","created_at":"2026-05-26T01:02:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"fb95cbae53ec74fefa82173ea22d3554b0134974b780e58e76a235059a8ae229","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-22T20:54:30Z","title_canon_sha256":"53dbe4c5143f4a4af9cfadfb559c2cf9cb863f83fc13bd62fdbbee5ea564e7df"},"schema_version":"1.0","source":{"id":"2605.24213","kind":"arxiv","version":1}},"canonical_sha256":"53e55d25a2096e4e9505cfbbc1b9ee8a661286a10a00b029ce84d7fe2bb5e0db","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"53e55d25a2096e4e9505cfbbc1b9ee8a661286a10a00b029ce84d7fe2bb5e0db","first_computed_at":"2026-05-26T01:02:52.748954Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-26T01:02:52.748954Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"j9zyeRTmp3AOL6QoexwnU2YsYJnf4W92ipFkS0VL5+VIxAXn6Bt2V16OFjQUsVnrgekSZCp8w2aYaiZcZQYLBw==","signature_status":"signed_v1","signed_at":"2026-05-26T01:02:52.749639Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.24213","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:547172bd92e28b56ed8cdeb06857d4d0ab7c1d00c312ce4d3ada29b91da2ab1c","sha256:ef298505d9d61e57b925eb42aa7e58fc1b6508ed8f75ea806719551446f077e5"],"state_sha256":"a1587e7af701dc3076573d683879a3acfafe268f326db5eff42a9639d5671276"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OfMLgUxWXpiQrIUDnVPKAgZGrbg0/5o0Xegfa7mxB4kFYe1HFVbpCBBBbPim/yY7He2IwBOWrv9+G9AwBLZ+Bg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T01:44:40.397089Z","bundle_sha256":"10b462eecc4cfd1322a805dcaf82e09d979cb121538e12c25dde32b66cd702a2"}}