{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:A2JKDCHHRVJTZNMZSYNZ4DFZPJ","short_pith_number":"pith:A2JKDCHH","schema_version":"1.0","canonical_sha256":"0692a188e78d533cb599961b9e0cb97a6079a414ecfdd85b52ac905d354f5bc7","source":{"kind":"arxiv","id":"2605.14164","version":1},"attestation_state":"computed","paper":{"title":"Unsteady Metrics and Benchmarking Cultures of AI Model Builders","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"AI builders select benchmarks to fit marketing narratives rather than enable consistent scientific comparison.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Christo Buschek, Maty Bohacek, Stefan Baack","submitted_at":"2026-05-13T22:39:10Z","abstract_excerpt":"The primary way to establish and compare competencies in foundation and generative AI models has shifted from peer-reviewed literature to press releases and company blog posts, where model builders highlight results on selected benchmarks. These artifacts now largely define the state of the art for researchers and the public. Despite their prominence, which benchmarks model builders choose to highlight, and what they communicate through this selection, is underexamined. To investigate, we introduce and open-source Benchmarking-Cultures-25, a dataset of 231 benchmarks highlighted across 139 mod"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.14164","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-13T22:39:10Z","cross_cats_sorted":[],"title_canon_sha256":"9e49ad973a3a1b8668c06496813326b3b941c1390bdc917806d74f9199b4820e","abstract_canon_sha256":"11b0518dd17c08084c46cafee5b859f185a629992a6d0fd01e68ea2bfd3d041f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:11.437108Z","signature_b64":"undXltK/MboVepJQL0KFA/0qR6VgvsMPqo5XzUgL7iW7RzvuV3nZxm/yzBN0y3fFpApLAOjTXfNMvMwQd6/hAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0692a188e78d533cb599961b9e0cb97a6079a414ecfdd85b52ac905d354f5bc7","last_reissued_at":"2026-05-17T23:39:11.436409Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:11.436409Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Unsteady Metrics and Benchmarking Cultures of AI Model Builders","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"AI builders select benchmarks to fit marketing narratives rather than enable consistent scientific comparison.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Christo Buschek, Maty Bohacek, Stefan Baack","submitted_at":"2026-05-13T22:39:10Z","abstract_excerpt":"The primary way to establish and compare competencies in foundation and generative AI models has shifted from peer-reviewed literature to press releases and company blog posts, where model builders highlight results on selected benchmarks. These artifacts now largely define the state of the art for researchers and the public. Despite their prominence, which benchmarks model builders choose to highlight, and what they communicate through this selection, is underexamined. To investigate, we introduce and open-source Benchmarking-Cultures-25, a dataset of 231 benchmarks highlighted across 139 mod"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We argue that highlighted benchmarks function less as standardized measurement tools and more as flexible narrative devices prioritizing market positioning over scientific evaluation.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the 139 model releases from 11 major builders and the 231 highlighted benchmarks they chose accurately capture the dominant evaluation practices and narrative strategies across the industry in 2025.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"AI model builders mostly highlight unique benchmarks that act as flexible narrative tools for market positioning rather than standardized scientific measurements.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"AI builders select benchmarks to fit marketing narratives rather than enable consistent scientific comparison.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"55b605042e85137d5606b2c41e0a338c7fec09b3eb2ddce4d6a32d206c728529"},"source":{"id":"2605.14164","kind":"arxiv","version":1},"verdict":{"id":"424feaaf-8644-4bde-86ae-be1db6eff79b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T04:51:28.083752Z","strongest_claim":"We argue that highlighted benchmarks function less as standardized measurement tools and more as flexible narrative devices prioritizing market positioning over scientific evaluation.","one_line_summary":"AI model builders mostly highlight unique benchmarks that act as flexible narrative tools for market positioning rather than standardized scientific measurements.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the 139 model releases from 11 major builders and the 231 highlighted benchmarks they chose accurately capture the dominant evaluation practices and narrative strategies across the industry in 2025.","pith_extraction_headline":"AI builders select benchmarks to fit marketing narratives rather than enable consistent scientific comparison."},"references":{"count":69,"sample":[{"doi":"10.1145/3461702.3462563","year":2021,"title":"Mohamed Abdalla and Moustafa Abdalla. 2021. The Grey Hoodie Project: Big Tobacco, Big Tech, and the Threat on Academic Integrity. InProceedings of the 2021 AAAI/ACM Conference on AI, Ethics, and Socie","work_id":"58b22604-8a7c-4493-a7db-173638a1ef58","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Norah Alzahrani, Hisham Alyahya, Yazeed Alnumay, Sultan Alrashed, Shaykhah Alsubaie, Yousef Almushayqih, Faisal Mirza, Nouf Alotaibi, Nora Al-Twairesh, Areeb Alowisheq, et al. 2024. When benchmarks ar","work_id":"c655b0b6-7bd9-40fa-9a0c-88020dede1b0","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.48550/arxiv.1606.06565","year":2016,"title":"Concrete Problems in AI Safety","work_id":"c8d14fbe-6eab-464a-95b3-778aabd82fa3","ref_index":3,"cited_arxiv_id":"1606.06565","is_internal_anchor":true},{"doi":"","year":2025,"title":"Anthropic. 2025. Claude 3.7 Sonnet System Card","work_id":"af5c65cb-7c3f-4953-a525-fef950b52602","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"InCOLING 2004: Pro- ceedings of the 20th International Conference on Computational Linguistics, pages 106–112, Geneva, Switzerland","work_id":"dfbd1f62-7719-47db-97f9-5603a1f2dc22","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":69,"snapshot_sha256":"814ebbbf56d7fa541d1412618f13e2e418232976ef53a045efab22c71be80bd3","internal_anchors":10},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.14164","created_at":"2026-05-17T23:39:11.436540+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.14164v1","created_at":"2026-05-17T23:39:11.436540+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14164","created_at":"2026-05-17T23:39:11.436540+00:00"},{"alias_kind":"pith_short_12","alias_value":"A2JKDCHHRVJT","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"A2JKDCHHRVJTZNMZ","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"A2JKDCHH","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ","json":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ.json","graph_json":"https://pith.science/api/pith-number/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/graph.json","events_json":"https://pith.science/api/pith-number/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/events.json","paper":"https://pith.science/paper/A2JKDCHH"},"agent_actions":{"view_html":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ","download_json":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ.json","view_paper":"https://pith.science/paper/A2JKDCHH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.14164&json=true","fetch_graph":"https://pith.science/api/pith-number/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/graph.json","fetch_events":"https://pith.science/api/pith-number/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/action/storage_attestation","attest_author":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/action/author_attestation","sign_citation":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/action/citation_signature","submit_replication":"https://pith.science/pith/A2JKDCHHRVJTZNMZSYNZ4DFZPJ/action/replication_record"}},"created_at":"2026-05-17T23:39:11.436540+00:00","updated_at":"2026-05-17T23:39:11.436540+00:00"}