{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:KPSV2JNCBFXE5FIFZ654DOPORJ","short_pith_number":"pith:KPSV2JNC","schema_version":"1.0","canonical_sha256":"53e55d25a2096e4e9505cfbbc1b9ee8a661286a10a00b029ce84d7fe2bb5e0db","source":{"kind":"arxiv","id":"2605.24213","version":1},"attestation_state":"computed","paper":{"title":"Towards Evaluation Engineering: An Empirical Study of ML Evaluation Harnesses in the Wild","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.SE","authors_text":"Abdul Ali Bangash, Ahmed E. Hassan, Bram Adams, Zehao Wang, Zhimin Zhao","submitted_at":"2026-05-22T20:54:30Z","abstract_excerpt":"Evaluation harnesses are software systems that orchestrate model evaluation by managing model invocation, data loading, metric computation, and result reporting. Despite their critical role in machine learning infrastructure, their operational challenges and engineering concerns have received limited attention so far. We present an empirical study of 57 evaluation harnesses, deriving a five-stage harness model and classifying 16,560 issues by workflow stage and root cause. Most harness operational challenges concentrate in the Specification stage (41.4% of issues), where harnesses integrate ex"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.24213","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SE","submitted_at":"2026-05-22T20:54:30Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"53dbe4c5143f4a4af9cfadfb559c2cf9cb863f83fc13bd62fdbbee5ea564e7df","abstract_canon_sha256":"fb95cbae53ec74fefa82173ea22d3554b0134974b780e58e76a235059a8ae229"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T01:02:52.749639Z","signature_b64":"j9zyeRTmp3AOL6QoexwnU2YsYJnf4W92ipFkS0VL5+VIxAXn6Bt2V16OFjQUsVnrgekSZCp8w2aYaiZcZQYLBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"53e55d25a2096e4e9505cfbbc1b9ee8a661286a10a00b029ce84d7fe2bb5e0db","last_reissued_at":"2026-05-26T01:02:52.748954Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T01:02:52.748954Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Towards Evaluation Engineering: An Empirical Study of ML Evaluation Harnesses in the Wild","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.SE","authors_text":"Abdul Ali Bangash, Ahmed E. Hassan, Bram Adams, Zehao Wang, Zhimin Zhao","submitted_at":"2026-05-22T20:54:30Z","abstract_excerpt":"Evaluation harnesses are software systems that orchestrate model evaluation by managing model invocation, data loading, metric computation, and result reporting. Despite their critical role in machine learning infrastructure, their operational challenges and engineering concerns have received limited attention so far. We present an empirical study of 57 evaluation harnesses, deriving a five-stage harness model and classifying 16,560 issues by workflow stage and root cause. Most harness operational challenges concentrate in the Specification stage (41.4% of issues), where harnesses integrate ex"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.24213","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.24213/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.24213","created_at":"2026-05-26T01:02:52.749059+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.24213v1","created_at":"2026-05-26T01:02:52.749059+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.24213","created_at":"2026-05-26T01:02:52.749059+00:00"},{"alias_kind":"pith_short_12","alias_value":"KPSV2JNCBFXE","created_at":"2026-05-26T01:02:52.749059+00:00"},{"alias_kind":"pith_short_16","alias_value":"KPSV2JNCBFXE5FIF","created_at":"2026-05-26T01:02:52.749059+00:00"},{"alias_kind":"pith_short_8","alias_value":"KPSV2JNC","created_at":"2026-05-26T01:02:52.749059+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ","json":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ.json","graph_json":"https://pith.science/api/pith-number/KPSV2JNCBFXE5FIFZ654DOPORJ/graph.json","events_json":"https://pith.science/api/pith-number/KPSV2JNCBFXE5FIFZ654DOPORJ/events.json","paper":"https://pith.science/paper/KPSV2JNC"},"agent_actions":{"view_html":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ","download_json":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ.json","view_paper":"https://pith.science/paper/KPSV2JNC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.24213&json=true","fetch_graph":"https://pith.science/api/pith-number/KPSV2JNCBFXE5FIFZ654DOPORJ/graph.json","fetch_events":"https://pith.science/api/pith-number/KPSV2JNCBFXE5FIFZ654DOPORJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/action/storage_attestation","attest_author":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/action/author_attestation","sign_citation":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/action/citation_signature","submit_replication":"https://pith.science/pith/KPSV2JNCBFXE5FIFZ654DOPORJ/action/replication_record"}},"created_at":"2026-05-26T01:02:52.749059+00:00","updated_at":"2026-05-26T01:02:52.749059+00:00"}