{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:H5E4RBVVO73D2O55WUITJ4EDRU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f5fb737c5d0d23c7b616c2709def8648acee758a13af530cf04429ab1ad9f46c","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-02-18T18:57:21Z","title_canon_sha256":"12ed0c321dfb54715b553cf42ff7c0ef45beb36bcedadc170f2de58489f2b47a"},"schema_version":"1.0","source":{"id":"2502.13138","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2502.13138","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2502.13138v1","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.13138","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"H5E4RBVVO73D","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"H5E4RBVVO73D2O55","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"H5E4RBVV","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:4c7dd8d821e636b0cbb2c7368c004c09e341f36c7b3802a610678bc17dc5e75f","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":3,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"By strategically reusing and refining promising solutions, AIDE effectively trades computational resources for enhanced performance, achieving state-of-the-art results on multiple machine learning engineering benchmarks, including our Kaggle evaluations, OpenAI MLE-Bench and METRs RE-Bench."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the tree search guided by LLMs can reliably identify and improve upon promising code variants without the search space becoming intractable or the evaluations becoming unreliable."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"AIDE uses large language models to perform tree search in code space and reaches state-of-the-art results on Kaggle, OpenAI MLE-Bench, and METR RE-Bench."}],"snapshot_sha256":"a7a736928e39dd1d21318f06b558abc94b39c80ce541c3a3920bfa620a4dd389"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"c0441559ff1bf371acd3626b67e14ca2eb2fc1ccc17cfc5cba518557fcd889d9"},"paper":{"abstract_excerpt":"Machine learning, the foundation of modern artificial intelligence, has driven innovations that have fundamentally transformed the world. Yet, behind advancements lies a complex and often tedious process requiring labor and compute intensive iteration and experimentation. Engineers and scientists developing machine learning models spend much of their time on trial-and-error tasks instead of conceptualizing innovative solutions or research hypotheses. To address this challenge, we introduce AI-Driven Exploration (AIDE), a machine learning engineering agent powered by large language models (LLMs","authors_text":"Deniss Jacenko, Dhruv Srikanth, Dixing Xu, Dominik Schmidt, Ian Kaplan, Yuxiang WU, Zhengyao Jiang","cross_cats":["cs.LG"],"headline":"AIDE uses large language models to perform tree search in code space and reaches state-of-the-art results on Kaggle, OpenAI MLE-Bench, and METR RE-Bench.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-02-18T18:57:21Z","title":"AIDE: AI-Driven Exploration in the Space of Code"},"references":{"count":14,"internal_anchors":1,"resolved_work":14,"sample":[{"cited_arxiv_id":"","doi":"10.1126/science.abq1158","is_internal_anchor":false,"ref_index":1,"title":"Li, Y., Choi, D.H., Chung, J., Kushman, N., Schrittwieser, J., Leblond, R., et al., 2022","work_id":"cc452f34-3d34-41ff-9206-8edad6625ce6","year":2019},{"cited_arxiv_id":"2305.16291","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Voyager: An Open-Ended Embodied Agent with Large Language Models","work_id":"ffe0d207-86cf-4742-a100-e988ac8b9676","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Distributed Random Forest (DRF) and Extremely Randomized Trees (XRT)","work_id":"db2145b6-e1e8-4ce3-a44c-c900ebb7293f","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Generalized Linear Model (GLM) with regularization","work_id":"f095d396-55ea-4ee6-b3d3-89474fcae80e","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"H2O Gradient Boosting Machines","work_id":"89aee919-4405-4246-bae7-3854966a8126","year":null}],"snapshot_sha256":"098c479e261841ca43a655028e78ffc90d602a830a9ea7264f463b467e7ac2fc"},"source":{"id":"2502.13138","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-17T18:16:23.603996Z","id":"7ba5a638-9b0b-4040-9396-88e88438a4cd","model_set":{"reader":"grok-4.3"},"one_line_summary":"AIDE uses large language models to perform tree search in code space and reaches state-of-the-art results on Kaggle, OpenAI MLE-Bench, and METR RE-Bench.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"","strongest_claim":"By strategically reusing and refining promising solutions, AIDE effectively trades computational resources for enhanced performance, achieving state-of-the-art results on multiple machine learning engineering benchmarks, including our Kaggle evaluations, OpenAI MLE-Bench and METRs RE-Bench.","weakest_assumption":"That the tree search guided by LLMs can reliably identify and improve upon promising code variants without the search space becoming intractable or the evaluations becoming unreliable."}},"verdict_id":"7ba5a638-9b0b-4040-9396-88e88438a4cd"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a3bbe84dceed0dc9638d7f313d1211617cd43705a1e4c48ee547f977c19f15e7","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f5fb737c5d0d23c7b616c2709def8648acee758a13af530cf04429ab1ad9f46c","cross_cats_sorted":["cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-02-18T18:57:21Z","title_canon_sha256":"12ed0c321dfb54715b553cf42ff7c0ef45beb36bcedadc170f2de58489f2b47a"},"schema_version":"1.0","source":{"id":"2502.13138","kind":"arxiv","version":1}},"canonical_sha256":"3f49c886b577f63d3bbdb51134f0838d18b7c4e248340dd84ac6f9815680cecb","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3f49c886b577f63d3bbdb51134f0838d18b7c4e248340dd84ac6f9815680cecb","first_computed_at":"2026-05-17T23:38:13.410583Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.410583Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"R59m3jBOtLNMdT0Wq6Ir2u42hlO8hyE8UD8qP4oWdUza/WiInzRzDY3iVUOitnqk5M70tNAl21jLhxLDn3MOBg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.411341Z","signed_message":"canonical_sha256_bytes"},"source_id":"2502.13138","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a3bbe84dceed0dc9638d7f313d1211617cd43705a1e4c48ee547f977c19f15e7","sha256:4c7dd8d821e636b0cbb2c7368c004c09e341f36c7b3802a610678bc17dc5e75f"],"state_sha256":"7f2e471ae60d48501ad30d109e53031e72dd38af2608beb48cced47dc4d31a2f"}