{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:YXRCLVYEFMT2ZYMYSC4INW4OWL","short_pith_number":"pith:YXRCLVYE","schema_version":"1.0","canonical_sha256":"c5e225d7042b27ace19890b886db8eb2d7b623f3986c589de0d5b6d00a167b63","source":{"kind":"arxiv","id":"2605.11599","version":2},"attestation_state":"computed","paper":{"title":"Targeted Tests for LLM Reasoning: An Audit-Constrained Protocol","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"An audit-constrained protocol identifies genuine LLM reasoning errors from valid prompt variants while excluding artifacts, yet adaptive sampling yields no advantage over uniform sampling.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Hongmin Li","submitted_at":"2026-05-12T06:26:22Z","abstract_excerpt":"Fixed reasoning benchmarks evaluate canonical prompts, but semantically valid changes in presentation can still change model behavior. Studies of prompt variation can reveal such failures, but without audit they can mix genuine model errors with invalid perturbations, extraction artifacts, and unmatched search procedures. We propose an audit-constrained protocol for targeted reasoning evaluation. Prompt variants are generated from a finite component grammar, rendered deterministically, evaluated under a fixed query budget, and counted as model errors only after semantic and extraction audit. W"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.11599","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:26:22Z","cross_cats_sorted":[],"title_canon_sha256":"0fbb8e1eeb3329a7cf2eac350bd850b477d372102c5210d245d6cdec773163c0","abstract_canon_sha256":"63fd56cab966ff5f00f4b8321f89c673d54e08abb75401aa9e0537b7e965603c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:03:17.914468Z","signature_b64":"hao9zjCkxe0cT1ullmkPk9tjBWeAmsCFmuUp2k3zNZESY4nc8EcIBYPjDdqWpSVksGf4pOOUS4Co1jkwMMlQDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c5e225d7042b27ace19890b886db8eb2d7b623f3986c589de0d5b6d00a167b63","last_reissued_at":"2026-05-20T00:03:17.913442Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:03:17.913442Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Targeted Tests for LLM Reasoning: An Audit-Constrained Protocol","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"An audit-constrained protocol identifies genuine LLM reasoning errors from valid prompt variants while excluding artifacts, yet adaptive sampling yields no advantage over uniform sampling.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Hongmin Li","submitted_at":"2026-05-12T06:26:22Z","abstract_excerpt":"Fixed reasoning benchmarks evaluate canonical prompts, but semantically valid changes in presentation can still change model behavior. Studies of prompt variation can reveal such failures, but without audit they can mix genuine model errors with invalid perturbations, extraction artifacts, and unmatched search procedures. We propose an audit-constrained protocol for targeted reasoning evaluation. Prompt variants are generated from a finite component grammar, rendered deterministically, evaluated under a fixed query budget, and counted as model errors only after semantic and extraction audit. W"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Across three audited slices, the protocol identifies confirmed model-error prompt keys while excluding formatting and extraction artifacts, but matched comparisons do not show that CAPS improves audited yield or unique prompt-key discovery over uniform sampling.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the semantic and extraction audit procedure reliably and consistently distinguishes genuine model reasoning errors from invalid perturbations, extraction artifacts, and unmatched search procedures without introducing its own biases or omissions.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"An audit-constrained protocol for LLM reasoning tests finds that component-adaptive prompt sampling yields no improvement over uniform sampling in identifying confirmed model errors after semantic and extraction audits.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"An audit-constrained protocol identifies genuine LLM reasoning errors from valid prompt variants while excluding artifacts, yet adaptive sampling yields no advantage over uniform sampling.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"85cd0275ed66d9212bd747b16668ef42e06ca206cc7da569df538465dec12717"},"source":{"id":"2605.11599","kind":"arxiv","version":2},"verdict":{"id":"1858ad86-25d9-4581-a166-aabbe5a91b7e","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-13T01:31:06.740966Z","strongest_claim":"Across three audited slices, the protocol identifies confirmed model-error prompt keys while excluding formatting and extraction artifacts, but matched comparisons do not show that CAPS improves audited yield or unique prompt-key discovery over uniform sampling.","one_line_summary":"An audit-constrained protocol for LLM reasoning tests finds that component-adaptive prompt sampling yields no improvement over uniform sampling in identifying confirmed model errors after semantic and extraction audits.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the semantic and extraction audit procedure reliably and consistently distinguishes genuine model reasoning errors from invalid perturbations, extraction artifacts, and unmatched search procedures without introducing its own biases or omissions.","pith_extraction_headline":"An audit-constrained protocol identifies genuine LLM reasoning errors from valid prompt variants while excluding artifacts, yet adaptive sampling yields no advantage over uniform sampling."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.11599/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"ai_meta_artifact","ran_at":"2026-05-19T11:41:09.568476Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T09:31:18.230533Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T08:18:07.155182Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"598c4750098f83e152f72da8bc13bbbff4547ea30a94d7390672c5aa3780539e"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.11599","created_at":"2026-05-20T00:03:17.913587+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.11599v2","created_at":"2026-05-20T00:03:17.913587+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.11599","created_at":"2026-05-20T00:03:17.913587+00:00"},{"alias_kind":"pith_short_12","alias_value":"YXRCLVYEFMT2","created_at":"2026-05-20T00:03:17.913587+00:00"},{"alias_kind":"pith_short_16","alias_value":"YXRCLVYEFMT2ZYMY","created_at":"2026-05-20T00:03:17.913587+00:00"},{"alias_kind":"pith_short_8","alias_value":"YXRCLVYE","created_at":"2026-05-20T00:03:17.913587+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL","json":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL.json","graph_json":"https://pith.science/api/pith-number/YXRCLVYEFMT2ZYMYSC4INW4OWL/graph.json","events_json":"https://pith.science/api/pith-number/YXRCLVYEFMT2ZYMYSC4INW4OWL/events.json","paper":"https://pith.science/paper/YXRCLVYE"},"agent_actions":{"view_html":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL","download_json":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL.json","view_paper":"https://pith.science/paper/YXRCLVYE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.11599&json=true","fetch_graph":"https://pith.science/api/pith-number/YXRCLVYEFMT2ZYMYSC4INW4OWL/graph.json","fetch_events":"https://pith.science/api/pith-number/YXRCLVYEFMT2ZYMYSC4INW4OWL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL/action/storage_attestation","attest_author":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL/action/author_attestation","sign_citation":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL/action/citation_signature","submit_replication":"https://pith.science/pith/YXRCLVYEFMT2ZYMYSC4INW4OWL/action/replication_record"}},"created_at":"2026-05-20T00:03:17.913587+00:00","updated_at":"2026-05-20T00:03:17.913587+00:00"}