{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:KK2IW4D3JA76ZH2NES5GMVA7B5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7f4d707bb200a42b97a4284f450a39dc97e9d4e6fec6fd9ccfdb4e6cd617fb31","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:01:58Z","title_canon_sha256":"857abb79fbf82dbbc3f095bd0ce20cbbe342e21bc27ce6f28fdc5e545476bc53"},"schema_version":"1.0","source":{"id":"2605.18824","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.18824","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"arxiv_version","alias_value":"2605.18824v1","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.18824","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_12","alias_value":"KK2IW4D3JA76","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_16","alias_value":"KK2IW4D3JA76ZH2N","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_8","alias_value":"KK2IW4D3","created_at":"2026-05-20T00:06:24Z"}],"graph_snapshots":[{"event_id":"sha256:1e2b894fd63028a31d9a194c8bcfee4366de523002ab6191fa60991c9a586061","target":"graph","created_at":"2026-05-20T00:06:24Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.18824/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Evaluation of foundation models often rely on aggregate scores from benchmarks that lack comprehensive coverage and metadata for a fine-grained evaluation. We introduce a framework for automated benchmark generation. Our framework generates evaluation problems grounded in reference material, such as textbooks, producing benchmarks with broad coverage, rich metadata, and robustness to contamination. The pipeline employs a multi-agent architecture for problem generation and a solution-graph-driven strategy that significantly improves the reliability of ground truth solutions. Using the framework","authors_text":"Afshin Cheraghi, Ali Kore, Arash Afkanpour, Elham Dolatabadi, Farnaz Kohankhaki, Mohammed Saidul Islam, Negin Baghbanzadeh, Shayaan Mehdi","cross_cats":["cs.AI","cs.CL"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:01:58Z","title":"Fine-Grained Benchmark Generation for Comprehensive Evaluation of Foundation Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.18824","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:d66dbf4fac1766d318a11c91d02ec64fe6acc66c64dec532decc8dc906730737","target":"record","created_at":"2026-05-20T00:06:24Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7f4d707bb200a42b97a4284f450a39dc97e9d4e6fec6fd9ccfdb4e6cd617fb31","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:01:58Z","title_canon_sha256":"857abb79fbf82dbbc3f095bd0ce20cbbe342e21bc27ce6f28fdc5e545476bc53"},"schema_version":"1.0","source":{"id":"2605.18824","kind":"arxiv","version":1}},"canonical_sha256":"52b48b707b483fec9f4d24ba66541f0f7d811cb03715e0d0da721428bd6da04a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"52b48b707b483fec9f4d24ba66541f0f7d811cb03715e0d0da721428bd6da04a","first_computed_at":"2026-05-20T00:06:24.533563Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:06:24.533563Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"5MgLg/6kaJr1NBVcRphTKL6CdHNXzcflfkmNr53ccrOp3pD42pDUriccOziyWzcOSFoVuvlYNp5IdeSo7y5cAA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:06:24.534301Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.18824","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:d66dbf4fac1766d318a11c91d02ec64fe6acc66c64dec532decc8dc906730737","sha256:1e2b894fd63028a31d9a194c8bcfee4366de523002ab6191fa60991c9a586061"],"state_sha256":"819862a07744a52e04d169cb845705bd5ee9da60629ad1c291d086b413f63074"}