{"paper":{"title":"BioXArena: Benchmarking LLM Agents on Multi-Modal Biomedical Machine Learning Tasks","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"BioXArena tests whether LLM agents can write code to build predictive models across 76 multi-modal biomedical tasks.","cross_cats":[],"primary_cat":"cs.CE","authors_text":"Assanali Aukenov, Bin Zhang, Duzhen Zhang, Feilong Chen, Jiahua Dong, Kun Zhang, Leonard Song, Le Song, Loka Li, Noel Thomas, Shakhnazar Sailaukan, Xingbo Du, Yonghan Yang, Zixiao Wang","submitted_at":"2026-05-15T09:24:55Z","abstract_excerpt":"Large language model (LLM) agents are increasingly capable of automating components of machine learning development, yet existing biomedical benchmarks mainly focus on question answering, reasoning, and tool usage, or evaluate only narrow aspects of biomedical ML coding. We present BioXArena, a biomedical machine learning benchmark designed to evaluate whether agents can generate task-specific model training pipelines for heterogeneous and multi-modal biomedical datasets. BioXArena contains 76 end-to-end tasks across 9 domains, including sequence modeling, single-cell analysis, structural biol"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"BioXArena contains 76 end-to-end tasks across 9 domains... Agents are required to write executable code, train predictive models, and generate submissions for private test samples. MLEvolve with Gemini-3.1-Pro achieves the highest average score of 0.666, followed by GPT-5.4 with 0.636.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The 76 tasks curated from primary biomedical sources into a unified framework with hidden labels and biology-aware metrics accurately and fairly measure real-world agent performance on heterogeneous multi-modal biomedical ML problems.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"BioXArena benchmarks LLM agents on generating end-to-end ML pipelines for 76 multi-modal biomedical tasks, with MLEvolve plus Gemini-3.1-Pro scoring highest at 0.666.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"BioXArena tests whether LLM agents can write code to build predictive models across 76 multi-modal biomedical tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"df434c8a82ae92da94a69e616a78d8a09bdbb4996f76f48fc04e348f621489de"},"source":{"id":"2605.15766","kind":"arxiv","version":1},"verdict":{"id":"be7aaad4-6e23-4dff-a916-5abb22bec8fe","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T19:27:27.189594Z","strongest_claim":"BioXArena contains 76 end-to-end tasks across 9 domains... Agents are required to write executable code, train predictive models, and generate submissions for private test samples. MLEvolve with Gemini-3.1-Pro achieves the highest average score of 0.666, followed by GPT-5.4 with 0.636.","one_line_summary":"BioXArena benchmarks LLM agents on generating end-to-end ML pipelines for 76 multi-modal biomedical tasks, with MLEvolve plus Gemini-3.1-Pro scoring highest at 0.666.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The 76 tasks curated from primary biomedical sources into a unified framework with hidden labels and biology-aware metrics accurately and fairly measure real-world agent performance on heterogeneous multi-modal biomedical ML problems.","pith_extraction_headline":"BioXArena tests whether LLM agents can write code to build predictive models across 76 multi-modal biomedical tasks."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.15766/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T20:01:19.175664Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T19:40:58.266744Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T17:33:48.759614Z","status":"skipped","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T17:21:55.946395Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"6e44a09310355305372f4c3f77e5e5a578c2b66987257dd52f9e7c2ba5b61415"},"references":{"count":142,"sample":[{"doi":"","year":2022,"title":"ReAct: Synergizing Reasoning and Acting in Language Models","work_id":"407a2351-25f1-497d-b611-f77d0292a8e6","ref_index":1,"cited_arxiv_id":"2210.03629","is_internal_anchor":true},{"doi":"","year":2023,"title":"AgentBench: Evaluating LLMs as Agents","work_id":"a37549b4-4c94-412d-acc4-4efeb08509be","ref_index":2,"cited_arxiv_id":"2308.03688","is_internal_anchor":true},{"doi":"","year":2024,"title":"Executable code actions elicit better llm agents","work_id":"9ba31efb-40e1-4d54-964b-41251e047488","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"arXiv:2508.02744 [cs.AI] https://arxiv.org/abs/2508.02744","work_id":"cc2b1378-83aa-4279-8bb8-e180797c1f10","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Biomni: A general-purpose biomedical ai agent.biorxiv","work_id":"c28414d4-ff79-491c-8858-6be83ec70812","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":142,"snapshot_sha256":"0052a808866a8836169199f13a7aa34549b59d25c7453e85fa7ade0fdb96585c","internal_anchors":9},"formal_canon":{"evidence_count":2,"snapshot_sha256":"aa20c742842f0c77f62e80c14aa6dd1a0d1abaa01472c9fe0f5c5c8af2c2d778"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"}