{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PYGVHWY3S7OVTGY56PZPZOZP4Y","short_pith_number":"pith:PYGVHWY3","schema_version":"1.0","canonical_sha256":"7e0d53db1b97dd599b1df3f2fcbb2fe60405ded792577748006c319425c2676b","source":{"kind":"arxiv","id":"2605.16616","version":1},"attestation_state":"computed","paper":{"title":"MLReplicate: Benchmarking Autonomous Research Systems for Machine Learning Reproducibility","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Diyana Muhammed, Farhana Keya, Gollam Rabby, Sasi Kiran Gaddipati, S\\\"oren Auer","submitted_at":"2026-05-15T20:35:32Z","abstract_excerpt":"Autonomous research systems capable of generating complete scientific manuscripts have advanced rapidly, yet robust and realistic evaluation frameworks have failed to keep pace. To bridge this gap, we introduce MLReplicate, an end-to-end benchmark evaluating autonomous research systems on machine learning reproducibility. The benchmark was constructed from ICML 2025 outstanding papers reformulated into standardized input specifications and evaluated across 6 state-of-the-art research systems: AI SCIENTIST-V1, AI SCIENTIST-V2, AGENT LABORATORY, CYCLERESEARCHER, AI RESEARCHER, and TINY SCIENTIST"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.16616","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-15T20:35:32Z","cross_cats_sorted":[],"title_canon_sha256":"612be3bc31291556324c024c4c6ac5452c3dce269cc79881d377deb0bd733dec","abstract_canon_sha256":"11c9552c63d3a2d77f98537e85d15da516848cb5c6374b25c24fc00a79cd1e0b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:32.621735Z","signature_b64":"0pErT6ceXvgje3dSaJiXKG8TaguZmUFnnKLXgNo80fJltXNNyygc1aXhTmlxwwFlEYyfbckpITGrGu+ll3n6BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7e0d53db1b97dd599b1df3f2fcbb2fe60405ded792577748006c319425c2676b","last_reissued_at":"2026-05-20T00:02:32.620922Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:32.620922Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MLReplicate: Benchmarking Autonomous Research Systems for Machine Learning Reproducibility","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Diyana Muhammed, Farhana Keya, Gollam Rabby, Sasi Kiran Gaddipati, S\\\"oren Auer","submitted_at":"2026-05-15T20:35:32Z","abstract_excerpt":"Autonomous research systems capable of generating complete scientific manuscripts have advanced rapidly, yet robust and realistic evaluation frameworks have failed to keep pace. To bridge this gap, we introduce MLReplicate, an end-to-end benchmark evaluating autonomous research systems on machine learning reproducibility. The benchmark was constructed from ICML 2025 outstanding papers reformulated into standardized input specifications and evaluated across 6 state-of-the-art research systems: AI SCIENTIST-V1, AI SCIENTIST-V2, AGENT LABORATORY, CYCLERESEARCHER, AI RESEARCHER, and TINY SCIENTIST"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.16616","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.16616/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-19T19:21:56.780155Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T18:33:26.590805Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"67a0e49f81711bdfa644a87e14c4b8e77110527420f633a4aae03effe91542d3"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.16616","created_at":"2026-05-20T00:02:32.621051+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.16616v1","created_at":"2026-05-20T00:02:32.621051+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16616","created_at":"2026-05-20T00:02:32.621051+00:00"},{"alias_kind":"pith_short_12","alias_value":"PYGVHWY3S7OV","created_at":"2026-05-20T00:02:32.621051+00:00"},{"alias_kind":"pith_short_16","alias_value":"PYGVHWY3S7OVTGY5","created_at":"2026-05-20T00:02:32.621051+00:00"},{"alias_kind":"pith_short_8","alias_value":"PYGVHWY3","created_at":"2026-05-20T00:02:32.621051+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y","json":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y.json","graph_json":"https://pith.science/api/pith-number/PYGVHWY3S7OVTGY56PZPZOZP4Y/graph.json","events_json":"https://pith.science/api/pith-number/PYGVHWY3S7OVTGY56PZPZOZP4Y/events.json","paper":"https://pith.science/paper/PYGVHWY3"},"agent_actions":{"view_html":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y","download_json":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y.json","view_paper":"https://pith.science/paper/PYGVHWY3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.16616&json=true","fetch_graph":"https://pith.science/api/pith-number/PYGVHWY3S7OVTGY56PZPZOZP4Y/graph.json","fetch_events":"https://pith.science/api/pith-number/PYGVHWY3S7OVTGY56PZPZOZP4Y/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y/action/storage_attestation","attest_author":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y/action/author_attestation","sign_citation":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y/action/citation_signature","submit_replication":"https://pith.science/pith/PYGVHWY3S7OVTGY56PZPZOZP4Y/action/replication_record"}},"created_at":"2026-05-20T00:02:32.621051+00:00","updated_at":"2026-05-20T00:02:32.621051+00:00"}