{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:DR3KKIIQOT375DTXKWCILJUQHT","short_pith_number":"pith:DR3KKIIQ","canonical_record":{"source":{"id":"2605.26079","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-25T17:44:21Z","cross_cats_sorted":[],"title_canon_sha256":"37a19d14c5bf40aa0f13161095542ae175558ce46e8bc95cac4c6bdedabae87c","abstract_canon_sha256":"172d2b08e07480c0bfee6360b0f5369261f4114b9cf731f963762cb859d99441"},"schema_version":"1.0"},"canonical_sha256":"1c76a5211074f7fe8e77558485a6903cdf6e33f495023e4feb7237b0c1ad291e","source":{"kind":"arxiv","id":"2605.26079","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.26079","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"arxiv_version","alias_value":"2605.26079v1","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.26079","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"pith_short_12","alias_value":"DR3KKIIQOT37","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"pith_short_16","alias_value":"DR3KKIIQOT375DTX","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"pith_short_8","alias_value":"DR3KKIIQ","created_at":"2026-05-26T02:05:26Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:DR3KKIIQOT375DTXKWCILJUQHT","target":"record","payload":{"canonical_record":{"source":{"id":"2605.26079","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-25T17:44:21Z","cross_cats_sorted":[],"title_canon_sha256":"37a19d14c5bf40aa0f13161095542ae175558ce46e8bc95cac4c6bdedabae87c","abstract_canon_sha256":"172d2b08e07480c0bfee6360b0f5369261f4114b9cf731f963762cb859d99441"},"schema_version":"1.0"},"canonical_sha256":"1c76a5211074f7fe8e77558485a6903cdf6e33f495023e4feb7237b0c1ad291e","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T02:05:26.207418Z","signature_b64":"RR5PBduEKNxYeJBLSw31JqgVaHZUYjfjManaFLfqjhylgf67Ursk+2hUaRH4BwqNd4n/LkKhUqerb48BYAvPBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1c76a5211074f7fe8e77558485a6903cdf6e33f495023e4feb7237b0c1ad291e","last_reissued_at":"2026-05-26T02:05:26.206702Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T02:05:26.206702Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.26079","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T02:05:26Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"5z+I/5JEi1i/XFNan9ORKA8duSeTkA595eMJ8TBvEgBYi5ruO25HMWbh+e12ogteyC3GBJWUtBu0ZgpJMylKCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T22:54:06.982353Z"},"content_sha256":"67ebe7a97ed9e31b37990de0429e82fb5bb19401d27dc505e789b552143515bd","schema_version":"1.0","event_id":"sha256:67ebe7a97ed9e31b37990de0429e82fb5bb19401d27dc505e789b552143515bd"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:DR3KKIIQOT375DTXKWCILJUQHT","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Automated Benchmark Auditing for AI Agents and Large Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Bhuwan Dhingra, Fan Nie, Federico Bianchi, James Zou, Junlin Wang, Shang Zhu, Yongchan Kwon","submitted_at":"2026-05-25T17:44:21Z","abstract_excerpt":"Modern AI benchmarks operate at a complexity that outpaces traditional verification methods. Tasks authored by domain experts often contain implicit assumptions, incomplete environment specifications, and brittle evaluation logic that human annotation cannot reliably catch. We introduce Auto Benchmark Audit (ABA), an agentic framework that systematically audits individual benchmark tasks, uncovering issues such as hidden environment dependencies, specification gaps, and limited grading logic. We run ABA on a collection of frontier LLM benchmarks and previous NeurIPS publications, totaling 168 "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.26079","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.26079/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T02:05:26Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"d00fuhX7inz1l2bYHipIO9Zc19HuOhRYhHDmkStRQxvhZk4Oys/mwceR4b1e6kUl5BfGbJ0bps3QiOIOn85MAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T22:54:06.982992Z"},"content_sha256":"a1f30e20f7bc16801ff4cf675573cd82454dbfe6b71a49e23a592fd9fc654069","schema_version":"1.0","event_id":"sha256:a1f30e20f7bc16801ff4cf675573cd82454dbfe6b71a49e23a592fd9fc654069"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/DR3KKIIQOT375DTXKWCILJUQHT/bundle.json","state_url":"https://pith.science/pith/DR3KKIIQOT375DTXKWCILJUQHT/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/DR3KKIIQOT375DTXKWCILJUQHT/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T22:54:06Z","links":{"resolver":"https://pith.science/pith/DR3KKIIQOT375DTXKWCILJUQHT","bundle":"https://pith.science/pith/DR3KKIIQOT375DTXKWCILJUQHT/bundle.json","state":"https://pith.science/pith/DR3KKIIQOT375DTXKWCILJUQHT/state.json","well_known_bundle":"https://pith.science/.well-known/pith/DR3KKIIQOT375DTXKWCILJUQHT/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:DR3KKIIQOT375DTXKWCILJUQHT","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"172d2b08e07480c0bfee6360b0f5369261f4114b9cf731f963762cb859d99441","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-25T17:44:21Z","title_canon_sha256":"37a19d14c5bf40aa0f13161095542ae175558ce46e8bc95cac4c6bdedabae87c"},"schema_version":"1.0","source":{"id":"2605.26079","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.26079","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"arxiv_version","alias_value":"2605.26079v1","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.26079","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"pith_short_12","alias_value":"DR3KKIIQOT37","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"pith_short_16","alias_value":"DR3KKIIQOT375DTX","created_at":"2026-05-26T02:05:26Z"},{"alias_kind":"pith_short_8","alias_value":"DR3KKIIQ","created_at":"2026-05-26T02:05:26Z"}],"graph_snapshots":[{"event_id":"sha256:a1f30e20f7bc16801ff4cf675573cd82454dbfe6b71a49e23a592fd9fc654069","target":"graph","created_at":"2026-05-26T02:05:26Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.26079/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Modern AI benchmarks operate at a complexity that outpaces traditional verification methods. Tasks authored by domain experts often contain implicit assumptions, incomplete environment specifications, and brittle evaluation logic that human annotation cannot reliably catch. We introduce Auto Benchmark Audit (ABA), an agentic framework that systematically audits individual benchmark tasks, uncovering issues such as hidden environment dependencies, specification gaps, and limited grading logic. We run ABA on a collection of frontier LLM benchmarks and previous NeurIPS publications, totaling 168 ","authors_text":"Bhuwan Dhingra, Fan Nie, Federico Bianchi, James Zou, Junlin Wang, Shang Zhu, Yongchan Kwon","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-25T17:44:21Z","title":"Automated Benchmark Auditing for AI Agents and Large Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.26079","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:67ebe7a97ed9e31b37990de0429e82fb5bb19401d27dc505e789b552143515bd","target":"record","created_at":"2026-05-26T02:05:26Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"172d2b08e07480c0bfee6360b0f5369261f4114b9cf731f963762cb859d99441","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-25T17:44:21Z","title_canon_sha256":"37a19d14c5bf40aa0f13161095542ae175558ce46e8bc95cac4c6bdedabae87c"},"schema_version":"1.0","source":{"id":"2605.26079","kind":"arxiv","version":1}},"canonical_sha256":"1c76a5211074f7fe8e77558485a6903cdf6e33f495023e4feb7237b0c1ad291e","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"1c76a5211074f7fe8e77558485a6903cdf6e33f495023e4feb7237b0c1ad291e","first_computed_at":"2026-05-26T02:05:26.206702Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-26T02:05:26.206702Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"RR5PBduEKNxYeJBLSw31JqgVaHZUYjfjManaFLfqjhylgf67Ursk+2hUaRH4BwqNd4n/LkKhUqerb48BYAvPBg==","signature_status":"signed_v1","signed_at":"2026-05-26T02:05:26.207418Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.26079","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:67ebe7a97ed9e31b37990de0429e82fb5bb19401d27dc505e789b552143515bd","sha256:a1f30e20f7bc16801ff4cf675573cd82454dbfe6b71a49e23a592fd9fc654069"],"state_sha256":"d3aeb48607d52ac2d4f5a02ac4d47f73b760984ad50d1490faf285c1d181ca85"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"zPQYpa6ODLW6nEEGZGo2nOQZ1v29vhY6ZxVKrCt6fE19J8/s/lwmXr6fh3vkUw2hQnQIKdllvg7CcqNFhMlPBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T22:54:06.986662Z","bundle_sha256":"a0966cedf10ae6e31b436910855dde9fa4ae32deb8a3dddda10a12f3fb65467f"}}