{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:NZPDVXNXRUX6HU5FEAADKKJ2BB","short_pith_number":"pith:NZPDVXNX","canonical_record":{"source":{"id":"2605.08678","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:29:46Z","cross_cats_sorted":[],"title_canon_sha256":"6faf6078a5ee082d9603732aeb78f26559ebefbd7b5f8786355978f97a3d8060","abstract_canon_sha256":"27d4d2277f2bca2534f506576f1920452f62d34e2704ca2beaa2810cd04953c7"},"schema_version":"1.0"},"canonical_sha256":"6e5e3addb78d2fe3d3a5200035293a086f16fa3ee433f15379a80521a3e76351","source":{"kind":"arxiv","id":"2605.08678","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.08678","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"arxiv_version","alias_value":"2605.08678v2","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.08678","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"pith_short_12","alias_value":"NZPDVXNXRUX6","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"pith_short_16","alias_value":"NZPDVXNXRUX6HU5F","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"pith_short_8","alias_value":"NZPDVXNX","created_at":"2026-05-28T01:04:41Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:NZPDVXNXRUX6HU5FEAADKKJ2BB","target":"record","payload":{"canonical_record":{"source":{"id":"2605.08678","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:29:46Z","cross_cats_sorted":[],"title_canon_sha256":"6faf6078a5ee082d9603732aeb78f26559ebefbd7b5f8786355978f97a3d8060","abstract_canon_sha256":"27d4d2277f2bca2534f506576f1920452f62d34e2704ca2beaa2810cd04953c7"},"schema_version":"1.0"},"canonical_sha256":"6e5e3addb78d2fe3d3a5200035293a086f16fa3ee433f15379a80521a3e76351","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T01:04:41.929077Z","signature_b64":"ddsg5O6JgPRRNBWRNWDvsOt1+yOlqdJstQR5szgLZ/mmgzRCJnXQ83x2jVn2yzpHjsgvVRO8fG0psC2EnHOCDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6e5e3addb78d2fe3d3a5200035293a086f16fa3ee433f15379a80521a3e76351","last_reissued_at":"2026-05-28T01:04:41.928394Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T01:04:41.928394Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.08678","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-28T01:04:41Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cS9yOGPmCIsa3VXQXQ5CKGtWsAoYTOjE/CAXwok1xG8FR9Fxyz0gLt/cS9VIJs2JoXQTlgWkwGDpY9gnUYCEAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T23:13:55.737522Z"},"content_sha256":"68e78d1d2389bbe2199dd6528622d8f7c1748a65192c76f47e4c6506bcee74e0","schema_version":"1.0","event_id":"sha256:68e78d1d2389bbe2199dd6528622d8f7c1748a65192c76f47e4c6506bcee74e0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:NZPDVXNXRUX6HU5FEAADKKJ2BB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MLS-Bench: A Holistic and Rigorous Assessment of AI Systems on Building Better AI","license":"http://creativecommons.org/licenses/by/4.0/","headline":"AI agents cannot reliably invent ML methods that beat human designs on generalization and scaling tests.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Bohan Lyu, Chengshuai Shi, Chi Jin, Dapeng Jiang, Dawn Song, Huan-ang Gao, Huaqing Zhang, Jiantao Jiao, Jiaru Zhang, Junlin Yang, Kaicheng Yang, Kun Wang, Max Simchowitz, Qixin Xu, Runhan Huang, Shange Tang, Simon S. Du, Siqiao Huang, Wenhao Chai, Wentao Guo, Xinghan Li, Xinyang Han, Xinyue Ai, Yadi Cao, Yicheng Zhang, Yucheng Yang, Ziran Yang, Zitao Chen","submitted_at":"2026-05-09T04:29:46Z","abstract_excerpt":"Modern AI progress has been driven by ML methods that are generalizable across settings and scalable to larger regimes. As large language models demonstrate advanced capabilities in reasoning, coding, and engineering tasks, it is increasingly important to understand whether they can discover such methods rather than only apply existing ones. We introduce MLS-Bench, a benchmark for evaluating whether AI systems can invent generalizable and scalable ML methods. MLS-Bench contains 140 tasks across 12 domains, each requiring an agent to improve one targeted component of an ML system or algorithm a"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Current agents remain far from reliably surpassing human-designed methods, and that engineering-style tuning is easier for them than genuine method invention.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 140 tasks and 12 domains sufficiently capture the core skills needed for inventing generalizable and scalable ML methods without missing key aspects of real research.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MLS-Bench shows that current AI agents fall short of reliably inventing generalizable ML methods, with engineering tuning easier than genuine invention.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"AI agents cannot reliably invent ML methods that beat human designs on generalization and scaling tests.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"8b8d915ff40ccacb7f5d8912db0ea965691441e8d5b12bc90060c2abbb9bed0b"},"source":{"id":"2605.08678","kind":"arxiv","version":2},"verdict":{"id":"9dee8940-caf6-4638-9924-7a9e6b7bb70e","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-12T01:09:27.664741Z","strongest_claim":"Current agents remain far from reliably surpassing human-designed methods, and that engineering-style tuning is easier for them than genuine method invention.","one_line_summary":"MLS-Bench shows that current AI agents fall short of reliably inventing generalizable ML methods, with engineering tuning easier than genuine invention.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 140 tasks and 12 domains sufficiently capture the core skills needed for inventing generalizable and scalable ML methods without missing key aspects of real research.","pith_extraction_headline":"AI agents cannot reliably invent ML methods that beat human designs on generalization and scaling tests."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.08678/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-20T09:02:01.991054Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T22:36:24.375594Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T14:31:17.867448Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T10:52:08.963578Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"ff0f4c3a4ffb4048268288e2e835d3240235c5f69293815aa94d88cfc208d763"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"9dee8940-caf6-4638-9924-7a9e6b7bb70e"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-28T01:04:41Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"eSeOkBX1LmWYN4YgWum6KyNMnebcHoYnRkYtQ+ZJl2v3dnvzkld2x3AKOtEokxgdaQphIH5OSUh2d3K2nILtAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T23:13:55.738526Z"},"content_sha256":"22a61e557a11dc08efeb259de95faba8b21080f5ea4b93393eb4e51dd2eb410b","schema_version":"1.0","event_id":"sha256:22a61e557a11dc08efeb259de95faba8b21080f5ea4b93393eb4e51dd2eb410b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/NZPDVXNXRUX6HU5FEAADKKJ2BB/bundle.json","state_url":"https://pith.science/pith/NZPDVXNXRUX6HU5FEAADKKJ2BB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/NZPDVXNXRUX6HU5FEAADKKJ2BB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T23:13:55Z","links":{"resolver":"https://pith.science/pith/NZPDVXNXRUX6HU5FEAADKKJ2BB","bundle":"https://pith.science/pith/NZPDVXNXRUX6HU5FEAADKKJ2BB/bundle.json","state":"https://pith.science/pith/NZPDVXNXRUX6HU5FEAADKKJ2BB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/NZPDVXNXRUX6HU5FEAADKKJ2BB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:NZPDVXNXRUX6HU5FEAADKKJ2BB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"27d4d2277f2bca2534f506576f1920452f62d34e2704ca2beaa2810cd04953c7","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:29:46Z","title_canon_sha256":"6faf6078a5ee082d9603732aeb78f26559ebefbd7b5f8786355978f97a3d8060"},"schema_version":"1.0","source":{"id":"2605.08678","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.08678","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"arxiv_version","alias_value":"2605.08678v2","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.08678","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"pith_short_12","alias_value":"NZPDVXNXRUX6","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"pith_short_16","alias_value":"NZPDVXNXRUX6HU5F","created_at":"2026-05-28T01:04:41Z"},{"alias_kind":"pith_short_8","alias_value":"NZPDVXNX","created_at":"2026-05-28T01:04:41Z"}],"graph_snapshots":[{"event_id":"sha256:22a61e557a11dc08efeb259de95faba8b21080f5ea4b93393eb4e51dd2eb410b","target":"graph","created_at":"2026-05-28T01:04:41Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Current agents remain far from reliably surpassing human-designed methods, and that engineering-style tuning is easier for them than genuine method invention."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The 140 tasks and 12 domains sufficiently capture the core skills needed for inventing generalizable and scalable ML methods without missing key aspects of real research."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MLS-Bench shows that current AI agents fall short of reliably inventing generalizable ML methods, with engineering tuning easier than genuine invention."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"AI agents cannot reliably invent ML methods that beat human designs on generalization and scaling tests."}],"snapshot_sha256":"8b8d915ff40ccacb7f5d8912db0ea965691441e8d5b12bc90060c2abbb9bed0b"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-20T09:02:01.991054Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T22:36:24.375594Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T14:31:17.867448Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T10:52:08.963578Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.08678/integrity.json","findings":[],"snapshot_sha256":"ff0f4c3a4ffb4048268288e2e835d3240235c5f69293815aa94d88cfc208d763","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Modern AI progress has been driven by ML methods that are generalizable across settings and scalable to larger regimes. As large language models demonstrate advanced capabilities in reasoning, coding, and engineering tasks, it is increasingly important to understand whether they can discover such methods rather than only apply existing ones. We introduce MLS-Bench, a benchmark for evaluating whether AI systems can invent generalizable and scalable ML methods. MLS-Bench contains 140 tasks across 12 domains, each requiring an agent to improve one targeted component of an ML system or algorithm a","authors_text":"Bohan Lyu, Chengshuai Shi, Chi Jin, Dapeng Jiang, Dawn Song, Huan-ang Gao, Huaqing Zhang, Jiantao Jiao, Jiaru Zhang, Junlin Yang, Kaicheng Yang, Kun Wang, Max Simchowitz, Qixin Xu, Runhan Huang, Shange Tang, Simon S. Du, Siqiao Huang, Wenhao Chai, Wentao Guo, Xinghan Li, Xinyang Han, Xinyue Ai, Yadi Cao, Yicheng Zhang, Yucheng Yang, Ziran Yang, Zitao Chen","cross_cats":[],"headline":"AI agents cannot reliably invent ML methods that beat human designs on generalization and scaling tests.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:29:46Z","title":"MLS-Bench: A Holistic and Rigorous Assessment of AI Systems on Building Better AI"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.08678","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-12T01:09:27.664741Z","id":"9dee8940-caf6-4638-9924-7a9e6b7bb70e","model_set":{"reader":"grok-4.3"},"one_line_summary":"MLS-Bench shows that current AI agents fall short of reliably inventing generalizable ML methods, with engineering tuning easier than genuine invention.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"AI agents cannot reliably invent ML methods that beat human designs on generalization and scaling tests.","strongest_claim":"Current agents remain far from reliably surpassing human-designed methods, and that engineering-style tuning is easier for them than genuine method invention.","weakest_assumption":"The 140 tasks and 12 domains sufficiently capture the core skills needed for inventing generalizable and scalable ML methods without missing key aspects of real research."}},"verdict_id":"9dee8940-caf6-4638-9924-7a9e6b7bb70e"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:68e78d1d2389bbe2199dd6528622d8f7c1748a65192c76f47e4c6506bcee74e0","target":"record","created_at":"2026-05-28T01:04:41Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"27d4d2277f2bca2534f506576f1920452f62d34e2704ca2beaa2810cd04953c7","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-09T04:29:46Z","title_canon_sha256":"6faf6078a5ee082d9603732aeb78f26559ebefbd7b5f8786355978f97a3d8060"},"schema_version":"1.0","source":{"id":"2605.08678","kind":"arxiv","version":2}},"canonical_sha256":"6e5e3addb78d2fe3d3a5200035293a086f16fa3ee433f15379a80521a3e76351","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"6e5e3addb78d2fe3d3a5200035293a086f16fa3ee433f15379a80521a3e76351","first_computed_at":"2026-05-28T01:04:41.928394Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-28T01:04:41.928394Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ddsg5O6JgPRRNBWRNWDvsOt1+yOlqdJstQR5szgLZ/mmgzRCJnXQ83x2jVn2yzpHjsgvVRO8fG0psC2EnHOCDA==","signature_status":"signed_v1","signed_at":"2026-05-28T01:04:41.929077Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.08678","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:68e78d1d2389bbe2199dd6528622d8f7c1748a65192c76f47e4c6506bcee74e0","sha256:22a61e557a11dc08efeb259de95faba8b21080f5ea4b93393eb4e51dd2eb410b"],"state_sha256":"b3e5b7235c3b01ec6634ba367d36f4d5a37c13cf428af7c6ef186b85ac2d9d68"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"6QibcnA4W13J/DhkomIa8vNWRo94hDeymGc6mpgTRG/ifUBiaBdOBAsQmVFJDGsrsTOTruEcmlBF23S7B9iRCA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T23:13:55.742974Z","bundle_sha256":"a96d0ed71230da0b5e9a6d58ac074a6855b61cf00c95f60c8186ea2cf5d0bebd"}}