{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:C6BFU545DD7RW6PMM2VTJX7B2C","short_pith_number":"pith:C6BFU545","schema_version":"1.0","canonical_sha256":"17825a779d18ff1b79ec66ab34dfe1d08698c6913c9055352154a735876fb74c","source":{"kind":"arxiv","id":"2510.04374","version":1},"attestation_state":"computed","paper":{"title":"GDPval: Evaluating AI Model Performance on Real-World Economically Valuable Tasks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Frontier AI models approach industry experts in quality on real-world economically valuable tasks.","cross_cats":["cs.AI","cs.CY"],"primary_cat":"cs.LG","authors_text":"Alexandra Barr, Amelia Glaese, David Li, Elizabeth Proehl, Gildas Chabot, Grace Kim, Jerry Tworek, Laurance Fauconnet, Marwan Aljubeh, Michael Sharman, Michele Wang, Natalie S. Kim, Olivia Watkins, Patrick Chao, Phoebe Thacker, Rachel Dias, Samuel Miserendino, Sim\\'on Posada Fishman, Tejal Patwardhan","submitted_at":"2025-10-05T21:36:43Z","abstract_excerpt":"We introduce GDPval, a benchmark evaluating AI model capabilities on real-world economically valuable tasks. GDPval covers the majority of U.S. Bureau of Labor Statistics Work Activities for 44 occupations across the top 9 sectors contributing to U.S. GDP (Gross Domestic Product). Tasks are constructed from the representative work of industry professionals with an average of 14 years of experience. We find that frontier model performance on GDPval is improving roughly linearly over time, and that the current best frontier models are approaching industry experts in deliverable quality. We analy"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":true},"canonical_record":{"source":{"id":"2510.04374","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-10-05T21:36:43Z","cross_cats_sorted":["cs.AI","cs.CY"],"title_canon_sha256":"98ad289b766a236ffb8e8459c62ec63bee942b19ec85473290763a0fb9ed4d41","abstract_canon_sha256":"3e907bdaf254be3ce564273e978e1bc5f5a1f12490ec69fc10eedd5df91fda4c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.067227Z","signature_b64":"VVtcqc3AHiINtBCNIBbTfRE+Iutnyl/lnATZXqqKKpku2UVVcqmz9m6ZYiP3f4j3BYlUsDj0nHO0BE2KHtpgCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"17825a779d18ff1b79ec66ab34dfe1d08698c6913c9055352154a735876fb74c","last_reissued_at":"2026-05-17T23:38:48.066776Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.066776Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"GDPval: Evaluating AI Model Performance on Real-World Economically Valuable Tasks","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Frontier AI models approach industry experts in quality on real-world economically valuable tasks.","cross_cats":["cs.AI","cs.CY"],"primary_cat":"cs.LG","authors_text":"Alexandra Barr, Amelia Glaese, David Li, Elizabeth Proehl, Gildas Chabot, Grace Kim, Jerry Tworek, Laurance Fauconnet, Marwan Aljubeh, Michael Sharman, Michele Wang, Natalie S. Kim, Olivia Watkins, Patrick Chao, Phoebe Thacker, Rachel Dias, Samuel Miserendino, Sim\\'on Posada Fishman, Tejal Patwardhan","submitted_at":"2025-10-05T21:36:43Z","abstract_excerpt":"We introduce GDPval, a benchmark evaluating AI model capabilities on real-world economically valuable tasks. GDPval covers the majority of U.S. Bureau of Labor Statistics Work Activities for 44 occupations across the top 9 sectors contributing to U.S. GDP (Gross Domestic Product). Tasks are constructed from the representative work of industry professionals with an average of 14 years of experience. We find that frontier model performance on GDPval is improving roughly linearly over time, and that the current best frontier models are approaching industry experts in deliverable quality. We analy"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"frontier model performance on GDPval is improving roughly linearly over time, and that the current best frontier models are approaching industry experts in deliverable quality","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"that the selected tasks and expert ratings accurately represent the full range of economically valuable work and that automated grading reliably matches human expert judgment on deliverable quality","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GDPval benchmark finds frontier AI models approaching industry experts on economically valuable tasks from high-GDP sectors, with linear performance gains over time.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Frontier AI models approach industry experts in quality on real-world economically valuable tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"598e2d628e61077b67f37794ca9742af834e58c489832ae066cfb08fa0c5dec1"},"source":{"id":"2510.04374","kind":"arxiv","version":1},"verdict":{"id":"774665a5-7f98-469d-87bc-f852f2954c87","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T11:08:12.153746Z","strongest_claim":"frontier model performance on GDPval is improving roughly linearly over time, and that the current best frontier models are approaching industry experts in deliverable quality","one_line_summary":"GDPval benchmark finds frontier AI models approaching industry experts on economically valuable tasks from high-GDP sectors, with linear performance gains over time.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"that the selected tasks and expert ratings accurately represent the full range of economically valuable work and that automated grading reliably matches human expert judgment on deliverable quality","pith_extraction_headline":"Frontier AI models approach industry experts in quality on real-world economically valuable tasks."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":3,"snapshot_sha256":"06fa6c1ba32e45605f782d631000b95db535fcc5c0770c0f5eb581c4bf9ea9ff"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.04374","created_at":"2026-05-17T23:38:48.066833+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.04374v1","created_at":"2026-05-17T23:38:48.066833+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.04374","created_at":"2026-05-17T23:38:48.066833+00:00"},{"alias_kind":"pith_short_12","alias_value":"C6BFU545DD7R","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"C6BFU545DD7RW6PM","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"C6BFU545","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":24,"internal_anchor_count":24,"sample":[{"citing_arxiv_id":"2605.23262","citing_title":"Design and Report Benchmarks for Knowledge Work","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23177","citing_title":"Cognitive offloading and the speedup illusion in human-AI interaction","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22664","citing_title":"WorkstreamBench: Evaluating LLM Agents on End-to-End Spreadsheet Tasks in Finance","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20520","citing_title":"Open-World Evaluations for Measuring Frontier AI Capabilities","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26904","citing_title":"ClawGym: A Scalable Framework for Building Effective Claw Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17554","citing_title":"Evaluating Deep Research Agents on Expert Consulting Work: A Benchmark with Verifiers, Rubrics, and Cognitive Traps","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2512.13168","citing_title":"Finch: Benchmarking Finance & Accounting across Spreadsheet-Centric Enterprise Workflows","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2602.09514","citing_title":"EcoGym: Evaluating LLMs for Long-Horizon Plan-and-Execute in Interactive Economies","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2602.17753","citing_title":"The 2025 AI Agent Index: Documenting Technical and Safety Features of Deployed Agentic AI Systems","ref_index":100,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26904","citing_title":"ClawGym: A Scalable Framework for Building Effective Claw Agents","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14455","citing_title":"Intelligence Impact Quotient (IIQ): A Framework for Measuring Organizational AI Impact","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03121","citing_title":"An Independent Safety Evaluation of Kimi K2.5","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12474","citing_title":"Reward Hacking in Rubric-Based Reinforcement Learning","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03295","citing_title":"Cheap Expertise: Mapping and Challenging Industry Perspectives in the Expert Data Gig Economy","ref_index":104,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23897","citing_title":"MarketBench: Evaluating AI Agents as Market Participants","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.11304","citing_title":"BankerToolBench: Evaluating AI Agents in End-to-End Investment Banking Workflows","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10866","citing_title":"OccuBench: Evaluating AI Agents on Real-World Professional Tasks via Language Environment Simulation","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09836","citing_title":"COMPOSITE-Stem","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2602.15763","citing_title":"GLM-5: from Vibe Coding to Agentic Engineering","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05912","citing_title":"FrontierFinance: A Long-Horizon Computer-Use Benchmark of Real-World Financial Tasks","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2602.02276","citing_title":"Kimi K2.5: Visual Agentic Intelligence","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15597","citing_title":"LLMs Corrupt Your Documents When You Delegate","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16170","citing_title":"neuralCAD-Edit: An Expert Benchmark for Multimodal-Instructed 3D CAD Model Editing","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17308","citing_title":"SkillFlow:Benchmarking Lifelong Skill Discovery and Evolution for Autonomous Agents","ref_index":28,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":3,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C","json":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C.json","graph_json":"https://pith.science/api/pith-number/C6BFU545DD7RW6PMM2VTJX7B2C/graph.json","events_json":"https://pith.science/api/pith-number/C6BFU545DD7RW6PMM2VTJX7B2C/events.json","paper":"https://pith.science/paper/C6BFU545"},"agent_actions":{"view_html":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C","download_json":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C.json","view_paper":"https://pith.science/paper/C6BFU545","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.04374&json=true","fetch_graph":"https://pith.science/api/pith-number/C6BFU545DD7RW6PMM2VTJX7B2C/graph.json","fetch_events":"https://pith.science/api/pith-number/C6BFU545DD7RW6PMM2VTJX7B2C/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C/action/timestamp_anchor","attest_storage":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C/action/storage_attestation","attest_author":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C/action/author_attestation","sign_citation":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C/action/citation_signature","submit_replication":"https://pith.science/pith/C6BFU545DD7RW6PMM2VTJX7B2C/action/replication_record"}},"created_at":"2026-05-17T23:38:48.066833+00:00","updated_at":"2026-05-17T23:38:48.066833+00:00"}