{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:WBZXHBKG7DSLL37MCHPZQDSESQ","short_pith_number":"pith:WBZXHBKG","schema_version":"1.0","canonical_sha256":"b073738546f8e4b5efec11df980e449401fbf9f08792a97a13ad08213346d027","source":{"kind":"arxiv","id":"2512.04111","version":2},"attestation_state":"computed","paper":{"title":"HAI-Eval: Measuring Human-AI Synergy in Collaborative Coding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.HC"],"primary_cat":"cs.SE","authors_text":"Bingduo Liao, Chiming Ni, Hanan Salam, Hanjun Luo, Jiaheng Wen, Sylvia Chung, Wenyuan Xu, Xiaofeng Wang, Xinfeng Li, Yingbin Jin, Yiran Wang, Zhimu Huang","submitted_at":"2025-11-30T21:44:44Z","abstract_excerpt":"LLM-powered coding agents are reshaping the development paradigm. However, existing evaluation systems, neither traditional tests for humans nor benchmarks for LLMs, fail to capture this shift. They remain focused on well-defined algorithmic problems, which excludes problems where success depends on human-AI collaboration. Such collaborative problems not only require human reasoning to interpret complex contexts and guide solution strategies, but also demand AI efficiency for implementation. To bridge this gap, we introduce HAI-Eval, a unified benchmark designed to measure the synergy of human"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2512.04111","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SE","submitted_at":"2025-11-30T21:44:44Z","cross_cats_sorted":["cs.AI","cs.HC"],"title_canon_sha256":"4c17da2c2d818dea17e39a72d2a7628cb1d1c93bbd2fdb3501ea68f13f37d675","abstract_canon_sha256":"d265cda1a06abc4355a18af72ec4ae9a252c5998412c69333cc2f79357abba00"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:29.370502Z","signature_b64":"f3TDvrgUGui9lKchp8AKkWJuVxLSM8yajs4wGdVEVVMQ8NTqNdog6Ir80/f/bp2PGfbZSulTPUJmTpPQ6HUjCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b073738546f8e4b5efec11df980e449401fbf9f08792a97a13ad08213346d027","last_reissued_at":"2026-05-20T00:00:29.369950Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:29.369950Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"HAI-Eval: Measuring Human-AI Synergy in Collaborative Coding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.HC"],"primary_cat":"cs.SE","authors_text":"Bingduo Liao, Chiming Ni, Hanan Salam, Hanjun Luo, Jiaheng Wen, Sylvia Chung, Wenyuan Xu, Xiaofeng Wang, Xinfeng Li, Yingbin Jin, Yiran Wang, Zhimu Huang","submitted_at":"2025-11-30T21:44:44Z","abstract_excerpt":"LLM-powered coding agents are reshaping the development paradigm. However, existing evaluation systems, neither traditional tests for humans nor benchmarks for LLMs, fail to capture this shift. They remain focused on well-defined algorithmic problems, which excludes problems where success depends on human-AI collaboration. Such collaborative problems not only require human reasoning to interpret complex contexts and guide solution strategies, but also demand AI efficiency for implementation. To bridge this gap, we introduce HAI-Eval, a unified benchmark designed to measure the synergy of human"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.04111","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2512.04111/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2512.04111","created_at":"2026-05-20T00:00:29.370042+00:00"},{"alias_kind":"arxiv_version","alias_value":"2512.04111v2","created_at":"2026-05-20T00:00:29.370042+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.04111","created_at":"2026-05-20T00:00:29.370042+00:00"},{"alias_kind":"pith_short_12","alias_value":"WBZXHBKG7DSL","created_at":"2026-05-20T00:00:29.370042+00:00"},{"alias_kind":"pith_short_16","alias_value":"WBZXHBKG7DSLL37M","created_at":"2026-05-20T00:00:29.370042+00:00"},{"alias_kind":"pith_short_8","alias_value":"WBZXHBKG","created_at":"2026-05-20T00:00:29.370042+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2604.17883","citing_title":"Scaling Human-AI Coding Collaboration Requires a Governable Consensus Layer","ref_index":19,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ","json":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ.json","graph_json":"https://pith.science/api/pith-number/WBZXHBKG7DSLL37MCHPZQDSESQ/graph.json","events_json":"https://pith.science/api/pith-number/WBZXHBKG7DSLL37MCHPZQDSESQ/events.json","paper":"https://pith.science/paper/WBZXHBKG"},"agent_actions":{"view_html":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ","download_json":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ.json","view_paper":"https://pith.science/paper/WBZXHBKG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2512.04111&json=true","fetch_graph":"https://pith.science/api/pith-number/WBZXHBKG7DSLL37MCHPZQDSESQ/graph.json","fetch_events":"https://pith.science/api/pith-number/WBZXHBKG7DSLL37MCHPZQDSESQ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ/action/storage_attestation","attest_author":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ/action/author_attestation","sign_citation":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ/action/citation_signature","submit_replication":"https://pith.science/pith/WBZXHBKG7DSLL37MCHPZQDSESQ/action/replication_record"}},"created_at":"2026-05-20T00:00:29.370042+00:00","updated_at":"2026-05-20T00:00:29.370042+00:00"}