{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:VXUT3H5AOTJAF26DH44ZS6OAS4","short_pith_number":"pith:VXUT3H5A","schema_version":"1.0","canonical_sha256":"ade93d9fa074d202ebc33f399979c09718aa3f7cba6804482bcfd3d55c15a0eb","source":{"kind":"arxiv","id":"2601.11895","version":3},"attestation_state":"computed","paper":{"title":"DevBench: A Realistic, Developer-Informed Benchmark for Code Generation Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.SE"],"primary_cat":"cs.LG","authors_text":"Adarsh Kumarappan, Elsie Nallipogu, Gabriel Ryan, Pareesa Ameneh Golnari, Shengyu Fu, Wen Wen, Xiaoyu Liu, Yuting Sun","submitted_at":"2026-01-17T03:33:08Z","abstract_excerpt":"DevBench is a telemetry-driven benchmark designed to evaluate Large Language Models (LLMs) on realistic code completion tasks. It includes 1,800 evaluation instances across six programming languages and six task categories derived from real developer telemetry and synthesized using generator models from multiple provider families to mitigate single-source bias. Unlike prior benchmarks, it emphasizes ecological validity, avoids training data contamination, and enables detailed diagnostics. The evaluation combines functional correctness, similarity-based metrics, and LLM-judge assessments focuse"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.11895","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-01-17T03:33:08Z","cross_cats_sorted":["cs.AI","cs.SE"],"title_canon_sha256":"dcb73ed902a55b836a0f4779ed74db9c75eb998d47bb8fa91541c6b7ea7ef19a","abstract_canon_sha256":"4b13241e92f607e0fcb5e81b917551c68e38593ca0cae8c88ae6d4e52f02ec10"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:03:02.374866Z","signature_b64":"aQp6Hi2TOOmMB0uKNZziFD7icrmV3Oe5MwEdyvP9HK0nfqicIAJub5AIuDXF+gyFP+pyFcIOA1acGlyz+zIUCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ade93d9fa074d202ebc33f399979c09718aa3f7cba6804482bcfd3d55c15a0eb","last_reissued_at":"2026-05-20T00:03:02.374065Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:03:02.374065Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"DevBench: A Realistic, Developer-Informed Benchmark for Code Generation Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.SE"],"primary_cat":"cs.LG","authors_text":"Adarsh Kumarappan, Elsie Nallipogu, Gabriel Ryan, Pareesa Ameneh Golnari, Shengyu Fu, Wen Wen, Xiaoyu Liu, Yuting Sun","submitted_at":"2026-01-17T03:33:08Z","abstract_excerpt":"DevBench is a telemetry-driven benchmark designed to evaluate Large Language Models (LLMs) on realistic code completion tasks. It includes 1,800 evaluation instances across six programming languages and six task categories derived from real developer telemetry and synthesized using generator models from multiple provider families to mitigate single-source bias. Unlike prior benchmarks, it emphasizes ecological validity, avoids training data contamination, and enables detailed diagnostics. The evaluation combines functional correctness, similarity-based metrics, and LLM-judge assessments focuse"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.11895","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.11895/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.11895","created_at":"2026-05-20T00:03:02.374190+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.11895v3","created_at":"2026-05-20T00:03:02.374190+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.11895","created_at":"2026-05-20T00:03:02.374190+00:00"},{"alias_kind":"pith_short_12","alias_value":"VXUT3H5AOTJA","created_at":"2026-05-20T00:03:02.374190+00:00"},{"alias_kind":"pith_short_16","alias_value":"VXUT3H5AOTJAF26D","created_at":"2026-05-20T00:03:02.374190+00:00"},{"alias_kind":"pith_short_8","alias_value":"VXUT3H5A","created_at":"2026-05-20T00:03:02.374190+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.21384","citing_title":"SpecBench: Measuring Reward Hacking in Long-Horizon Coding Agents","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13139","citing_title":"SWE-Cycle: Benchmarking Code Agents across the Complete Issue Resolution Cycle","ref_index":14,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4","json":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4.json","graph_json":"https://pith.science/api/pith-number/VXUT3H5AOTJAF26DH44ZS6OAS4/graph.json","events_json":"https://pith.science/api/pith-number/VXUT3H5AOTJAF26DH44ZS6OAS4/events.json","paper":"https://pith.science/paper/VXUT3H5A"},"agent_actions":{"view_html":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4","download_json":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4.json","view_paper":"https://pith.science/paper/VXUT3H5A","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.11895&json=true","fetch_graph":"https://pith.science/api/pith-number/VXUT3H5AOTJAF26DH44ZS6OAS4/graph.json","fetch_events":"https://pith.science/api/pith-number/VXUT3H5AOTJAF26DH44ZS6OAS4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4/action/storage_attestation","attest_author":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4/action/author_attestation","sign_citation":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4/action/citation_signature","submit_replication":"https://pith.science/pith/VXUT3H5AOTJAF26DH44ZS6OAS4/action/replication_record"}},"created_at":"2026-05-20T00:03:02.374190+00:00","updated_at":"2026-05-20T00:03:02.374190+00:00"}