{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:YRPV7OER4IF4TD2TRUR764XMPB","short_pith_number":"pith:YRPV7OER","schema_version":"1.0","canonical_sha256":"c45f5fb891e20bc98f538d23ff72ec787ef82e03e86ff43721825f3111eba97f","source":{"kind":"arxiv","id":"2606.17930","version":1},"attestation_state":"computed","paper":{"title":"How Inference Compute Shapes Frontier LLM Evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Cozmin Ududec, Harry Coppock, Jessica McFadyen, Kevin Wei, Ole Jorgensen","submitted_at":"2026-06-16T13:40:53Z","abstract_excerpt":"AI evaluations are shifting toward harder tasks that benefit from longer trajectories involving tool use and iterative problem solving. As a result, performance is increasingly sensitive to the amount and allocation of compute available at test time (\"inference compute\"). Yet many evaluations still report performance at a single restrictive budget, meaning that low scores may reflect the evaluation setup rather than the model's underlying capability. To test this, we evaluate up to 12 frontier language models on seven challenging benchmarks spanning software engineering, mathematics, medicine,"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.17930","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-06-16T13:40:53Z","cross_cats_sorted":[],"title_canon_sha256":"4fa95d77f39cf4b12fecd745162fefb61ca5c84fbf624831a9e71797f6c83aa8","abstract_canon_sha256":"960d220deb0099dd439c7e5769554ee130fd526e953035becc02436d7355111d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:10:43.793902Z","signature_b64":"sxtLx6tfQlhMkVloMdXv2LG848919IoSbJZlu6oi2TxN9TiDNDIK/Zu8+NtLpV3+YK/QPtXx/gZnFTe6YVT/DA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c45f5fb891e20bc98f538d23ff72ec787ef82e03e86ff43721825f3111eba97f","last_reissued_at":"2026-06-19T16:10:43.793564Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:10:43.793564Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"How Inference Compute Shapes Frontier LLM Evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Cozmin Ududec, Harry Coppock, Jessica McFadyen, Kevin Wei, Ole Jorgensen","submitted_at":"2026-06-16T13:40:53Z","abstract_excerpt":"AI evaluations are shifting toward harder tasks that benefit from longer trajectories involving tool use and iterative problem solving. As a result, performance is increasingly sensitive to the amount and allocation of compute available at test time (\"inference compute\"). Yet many evaluations still report performance at a single restrictive budget, meaning that low scores may reflect the evaluation setup rather than the model's underlying capability. To test this, we evaluate up to 12 frontier language models on seven challenging benchmarks spanning software engineering, mathematics, medicine,"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.17930","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.17930/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.17930","created_at":"2026-06-19T16:10:43.793625+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.17930v1","created_at":"2026-06-19T16:10:43.793625+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.17930","created_at":"2026-06-19T16:10:43.793625+00:00"},{"alias_kind":"pith_short_12","alias_value":"YRPV7OER4IF4","created_at":"2026-06-19T16:10:43.793625+00:00"},{"alias_kind":"pith_short_16","alias_value":"YRPV7OER4IF4TD2T","created_at":"2026-06-19T16:10:43.793625+00:00"},{"alias_kind":"pith_short_8","alias_value":"YRPV7OER","created_at":"2026-06-19T16:10:43.793625+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB","json":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB.json","graph_json":"https://pith.science/api/pith-number/YRPV7OER4IF4TD2TRUR764XMPB/graph.json","events_json":"https://pith.science/api/pith-number/YRPV7OER4IF4TD2TRUR764XMPB/events.json","paper":"https://pith.science/paper/YRPV7OER"},"agent_actions":{"view_html":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB","download_json":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB.json","view_paper":"https://pith.science/paper/YRPV7OER","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.17930&json=true","fetch_graph":"https://pith.science/api/pith-number/YRPV7OER4IF4TD2TRUR764XMPB/graph.json","fetch_events":"https://pith.science/api/pith-number/YRPV7OER4IF4TD2TRUR764XMPB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB/action/storage_attestation","attest_author":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB/action/author_attestation","sign_citation":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB/action/citation_signature","submit_replication":"https://pith.science/pith/YRPV7OER4IF4TD2TRUR764XMPB/action/replication_record"}},"created_at":"2026-06-19T16:10:43.793625+00:00","updated_at":"2026-06-19T16:10:43.793625+00:00"}