{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:KK2IW4D3JA76ZH2NES5GMVA7B5","short_pith_number":"pith:KK2IW4D3","canonical_record":{"source":{"id":"2605.18824","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:01:58Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"857abb79fbf82dbbc3f095bd0ce20cbbe342e21bc27ce6f28fdc5e545476bc53","abstract_canon_sha256":"7f4d707bb200a42b97a4284f450a39dc97e9d4e6fec6fd9ccfdb4e6cd617fb31"},"schema_version":"1.0"},"canonical_sha256":"52b48b707b483fec9f4d24ba66541f0f7d811cb03715e0d0da721428bd6da04a","source":{"kind":"arxiv","id":"2605.18824","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.18824","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"arxiv_version","alias_value":"2605.18824v1","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.18824","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_12","alias_value":"KK2IW4D3JA76","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_16","alias_value":"KK2IW4D3JA76ZH2N","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_8","alias_value":"KK2IW4D3","created_at":"2026-05-20T00:06:24Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:KK2IW4D3JA76ZH2NES5GMVA7B5","target":"record","payload":{"canonical_record":{"source":{"id":"2605.18824","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:01:58Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"857abb79fbf82dbbc3f095bd0ce20cbbe342e21bc27ce6f28fdc5e545476bc53","abstract_canon_sha256":"7f4d707bb200a42b97a4284f450a39dc97e9d4e6fec6fd9ccfdb4e6cd617fb31"},"schema_version":"1.0"},"canonical_sha256":"52b48b707b483fec9f4d24ba66541f0f7d811cb03715e0d0da721428bd6da04a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:06:24.534301Z","signature_b64":"5MgLg/6kaJr1NBVcRphTKL6CdHNXzcflfkmNr53ccrOp3pD42pDUriccOziyWzcOSFoVuvlYNp5IdeSo7y5cAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"52b48b707b483fec9f4d24ba66541f0f7d811cb03715e0d0da721428bd6da04a","last_reissued_at":"2026-05-20T00:06:24.533563Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:06:24.533563Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.18824","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:06:24Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"jhx6SV6HXFyLdRMj9Pt4i6FgasXUit5E13ErskCZWy97yhRajZGggmOky/v+n42np5x4vPZdHFwefPOFdcd1CQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T07:50:54.203886Z"},"content_sha256":"d66dbf4fac1766d318a11c91d02ec64fe6acc66c64dec532decc8dc906730737","schema_version":"1.0","event_id":"sha256:d66dbf4fac1766d318a11c91d02ec64fe6acc66c64dec532decc8dc906730737"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:KK2IW4D3JA76ZH2NES5GMVA7B5","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Fine-Grained Benchmark Generation for Comprehensive Evaluation of Foundation Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Afshin Cheraghi, Ali Kore, Arash Afkanpour, Elham Dolatabadi, Farnaz Kohankhaki, Mohammed Saidul Islam, Negin Baghbanzadeh, Shayaan Mehdi","submitted_at":"2026-05-12T17:01:58Z","abstract_excerpt":"Evaluation of foundation models often rely on aggregate scores from benchmarks that lack comprehensive coverage and metadata for a fine-grained evaluation. We introduce a framework for automated benchmark generation. Our framework generates evaluation problems grounded in reference material, such as textbooks, producing benchmarks with broad coverage, rich metadata, and robustness to contamination. The pipeline employs a multi-agent architecture for problem generation and a solution-graph-driven strategy that significantly improves the reliability of ground truth solutions. Using the framework"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.18824","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.18824/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:06:24Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OIshOcKiupauwU+Gajl+eTMtfcZ9BWjoJ0Jp8Fzwxb4DnTBAmB2YEvxpuDGchq+DpcbDF+wZi33MMAqT+NDFBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T07:50:54.204688Z"},"content_sha256":"1e2b894fd63028a31d9a194c8bcfee4366de523002ab6191fa60991c9a586061","schema_version":"1.0","event_id":"sha256:1e2b894fd63028a31d9a194c8bcfee4366de523002ab6191fa60991c9a586061"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/KK2IW4D3JA76ZH2NES5GMVA7B5/bundle.json","state_url":"https://pith.science/pith/KK2IW4D3JA76ZH2NES5GMVA7B5/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/KK2IW4D3JA76ZH2NES5GMVA7B5/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-09T07:50:54Z","links":{"resolver":"https://pith.science/pith/KK2IW4D3JA76ZH2NES5GMVA7B5","bundle":"https://pith.science/pith/KK2IW4D3JA76ZH2NES5GMVA7B5/bundle.json","state":"https://pith.science/pith/KK2IW4D3JA76ZH2NES5GMVA7B5/state.json","well_known_bundle":"https://pith.science/.well-known/pith/KK2IW4D3JA76ZH2NES5GMVA7B5/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:KK2IW4D3JA76ZH2NES5GMVA7B5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7f4d707bb200a42b97a4284f450a39dc97e9d4e6fec6fd9ccfdb4e6cd617fb31","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:01:58Z","title_canon_sha256":"857abb79fbf82dbbc3f095bd0ce20cbbe342e21bc27ce6f28fdc5e545476bc53"},"schema_version":"1.0","source":{"id":"2605.18824","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.18824","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"arxiv_version","alias_value":"2605.18824v1","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.18824","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_12","alias_value":"KK2IW4D3JA76","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_16","alias_value":"KK2IW4D3JA76ZH2N","created_at":"2026-05-20T00:06:24Z"},{"alias_kind":"pith_short_8","alias_value":"KK2IW4D3","created_at":"2026-05-20T00:06:24Z"}],"graph_snapshots":[{"event_id":"sha256:1e2b894fd63028a31d9a194c8bcfee4366de523002ab6191fa60991c9a586061","target":"graph","created_at":"2026-05-20T00:06:24Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.18824/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Evaluation of foundation models often rely on aggregate scores from benchmarks that lack comprehensive coverage and metadata for a fine-grained evaluation. We introduce a framework for automated benchmark generation. Our framework generates evaluation problems grounded in reference material, such as textbooks, producing benchmarks with broad coverage, rich metadata, and robustness to contamination. The pipeline employs a multi-agent architecture for problem generation and a solution-graph-driven strategy that significantly improves the reliability of ground truth solutions. Using the framework","authors_text":"Afshin Cheraghi, Ali Kore, Arash Afkanpour, Elham Dolatabadi, Farnaz Kohankhaki, Mohammed Saidul Islam, Negin Baghbanzadeh, Shayaan Mehdi","cross_cats":["cs.AI","cs.CL"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:01:58Z","title":"Fine-Grained Benchmark Generation for Comprehensive Evaluation of Foundation Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.18824","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:d66dbf4fac1766d318a11c91d02ec64fe6acc66c64dec532decc8dc906730737","target":"record","created_at":"2026-05-20T00:06:24Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7f4d707bb200a42b97a4284f450a39dc97e9d4e6fec6fd9ccfdb4e6cd617fb31","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:01:58Z","title_canon_sha256":"857abb79fbf82dbbc3f095bd0ce20cbbe342e21bc27ce6f28fdc5e545476bc53"},"schema_version":"1.0","source":{"id":"2605.18824","kind":"arxiv","version":1}},"canonical_sha256":"52b48b707b483fec9f4d24ba66541f0f7d811cb03715e0d0da721428bd6da04a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"52b48b707b483fec9f4d24ba66541f0f7d811cb03715e0d0da721428bd6da04a","first_computed_at":"2026-05-20T00:06:24.533563Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:06:24.533563Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"5MgLg/6kaJr1NBVcRphTKL6CdHNXzcflfkmNr53ccrOp3pD42pDUriccOziyWzcOSFoVuvlYNp5IdeSo7y5cAA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:06:24.534301Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.18824","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:d66dbf4fac1766d318a11c91d02ec64fe6acc66c64dec532decc8dc906730737","sha256:1e2b894fd63028a31d9a194c8bcfee4366de523002ab6191fa60991c9a586061"],"state_sha256":"819862a07744a52e04d169cb845705bd5ee9da60629ad1c291d086b413f63074"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Vo+0P7bJdqkZb9EIh5pVQFZ6B/bMXuOGx6fRwyvhYfO1RT3cRk+p9SV4O6nL6GL0KY3GCJwGMddLbf7H1liNBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-09T07:50:54.208928Z","bundle_sha256":"8d8ec44e31a5b7504e837afe1526459baef78abe4338d58daff83ac6567fa969"}}