{"total":15,"items":[{"citing_arxiv_id":"2606.31478","ref_index":133,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"One Reflection Is Not Enough: Self-Correcting Autonomous Research via Multi-Hypothesis Failure Attribution","primary_cat":"cs.AI","submitted_at":"2026-06-30T10:54:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SAGE with MHFA improves failure recovery in autonomous research agents, raising metrics-bearing outputs from 42% to 92% on a 12-topic benchmark versus single-reflection baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30246","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Clarus: Coordinating Autonomous Research Agents toward Web-Scale Scientific Collaboration","primary_cat":"cs.AI","submitted_at":"2026-06-29T12:56:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Clarus is a four-layer collaboration infrastructure with a project-agent-resource model that reformulates research as an open, traceable, multi-participant process.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20728","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VTOS: Learning to Orchestrate Vision Tools by Co-Searching Solutions and Observers","primary_cat":"cs.CV","submitted_at":"2026-06-17T04:52:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VTOS jointly searches solution and observer programs to adaptively orchestrate vision tools, outperforming static pipelines on dense object counting and zero-shot plant disease 
segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11926","ref_index":162,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Toward Generalist Autonomous Research via Hypothesis-Tree Refinement","primary_cat":"cs.CL","submitted_at":"2026-06-10T10:57:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Arbor combines a coordinator, executors, and a hypothesis tree to enable cumulative autonomous research, outperforming Codex and Claude Code by over 2.5x on six real tasks and reaching 86.36% Any Medal on MLE-Bench Lite.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06473","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MLEvolve: A Self-Evolving Framework for Automated Machine Learning Algorithm Discovery","primary_cat":"cs.AI","submitted_at":"2026-06-04T17:55:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MLEvolve is a self-evolving multi-agent LLM system with Progressive MCGS, Retrospective Memory, and adaptive coding modes that reports SOTA medal and submission rates on MLE-Bench under a 12-hour budget while outperforming AlphaEvolve on math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05250","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards Persistent Case-Based Memory for Autonomous Data Science: A CBR-Augmented R&D-Agent with a Locally Deployable Small Language Model","primary_cat":"cs.SE","submitted_at":"2026-06-03T12:56:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CBR integration into 
R&D-Agent with Gemma 4 31B yields directionally higher accuracy and lower variance than baseline on one of two Kaggle competitions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27873","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AIBuildAI-2: A Knowledge-Enhanced Agent for Automatically Building AI Models","primary_cat":"cs.AI","submitted_at":"2026-05-27T02:44:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AIBuildAI-2 introduces a knowledge-enhanced agent with a hierarchical evolving external knowledge base that dynamically loads relevant AI development expertise, achieving first place on MLE-Bench at 70.7% medal rate.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20086","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"What Do Evolutionary Coding Agents Evolve?","primary_cat":"cs.NE","submitted_at":"2026-05-19T16:41:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Evolutionary coding agents achieve most benchmark gains through a small subset of edit types and by cycling previously deleted code lines rather than developing new algorithmic structures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18661","ref_index":234,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AI for Auto-Research: Roadmap & User Guide","primary_cat":"cs.AI","submitted_at":"2026-05-18T17:08:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper delivers a stage-by-stage roadmap for AI in research, showing reliable assistance in 
retrieval and tool tasks but fragility in novelty and judgment, advocating human-governed collaboration.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"DeepPresenter [262] conditions revision on rendered slide images rather than only internal reasoning traces, showing that visual feedback is important for presentation quality. Multi-agent and interactive systems further decompose slide generation into specialized subtasks. SlideGen [111] uses agents for outlining, content mapping, arrangement, note synthesis, and iterative refinement to produce editable PPTX slides. Auto-Slides [234] targets Beamer generation with multi-agent collaboration and interactive editing. SlideTailor [247] conditions generation on user preference from a single example pair using a chain-of-speech mechanism. Other systems focus on task-specific capabilities: PASS [3] combines slide generation with AI audio delivery, AutoPresent [48] fine-tunes a slide-generation model on SlidesBench,"},{"citing_arxiv_id":"2604.17406","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EvoMaster: A Foundational Evolving Agent Framework for Agentic Science at Scale","primary_cat":"cs.AI","submitted_at":"2026-04-19T12:26:05+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14455","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AIBuildAI: An AI Agent for Automatically Building AI Models","primary_cat":"cs.AI","submitted_at":"2026-04-15T22:17:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AIBuildAI uses a manager agent and three LLM sub-agents to fully automate AI model development and achieves a 63.1% medal rate on 
MLE-Bench, matching experienced human engineers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13018","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Toward Autonomous Long-Horizon Engineering for ML Research","primary_cat":"cs.CL","submitted_at":"2026-04-14T17:55:16+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.23986","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TusoAI: Agentic Optimization for Scientific Methods","primary_cat":"cs.AI","submitted_at":"2025-09-28T17:30:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TusoAI is an LLM-based agent that builds and iteratively optimizes domain-specific computational methods for scientific data analysis, outperforming expert baselines on RNA-seq denoising and earth monitoring while reporting new genetic associations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.06806","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MachineLearningLM: Scaling Many-shot In-context Learning via Continued Pretraining","primary_cat":"cs.CL","submitted_at":"2025-09-08T15:38:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MachineLearningLM uses continued pretraining on SCM-synthesized ML tasks with random-forest distillation to give LLMs robust many-shot in-context learning on tabular classification, reaching random-forest accuracy levels while preserving general chat 
performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.10177","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"KompeteAI: Accelerated Autonomous Multi-Agent System for End-to-End Pipeline Generation for Machine Learning Problems","primary_cat":"cs.AI","submitted_at":"2025-08-13T20:29:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"KompeteAI accelerates AutoML pipeline evaluation 6.9 times and beats prior systems by 3% on MLE-Bench through candidate merging, external RAG, and predictive early scoring.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}