{"total":25,"items":[{"citing_arxiv_id":"2606.28900","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MedEvoEval: Evaluating Continual Evolution of Doctor Agents through Simulated Clinical Episodes","primary_cat":"cs.AI","submitted_at":"2026-06-27T13:14:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MedEvoEval is an executable longitudinal evaluation framework that converts medical cases into action-gated simulated episodes to track how doctor agents evolve decision-making, resource use, and experience across multiple encounters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29368","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SURGENT: A Surgical Multi-Agent Assistance System Across the Perioperative Workflow","primary_cat":"cs.CL","submitted_at":"2026-05-28T05:12:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SURGENT is a multi-agent surgical assistance system with novel memory management that outperforms baseline LLMs on case analysis, plan simulation, safety monitoring, risk assessment, and rehabilitation guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28332","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Medical Safety Alignment Fails: A Benchmark for Evaluating LLMs on High-Risk Medical Queries","primary_cat":"cs.CY","submitted_at":"2026-05-26T14:39:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MedHarm benchmark shows aligned LLMs and guardrails can still produce unsafe responses on high-risk medical queries, indicating medical safety requires domain-specific testing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25878","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Clinically Validated Foundation Model for Comprehensive Lung Pathology Interpretation","primary_cat":"eess.IV","submitted_at":"2026-05-25T14:04:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PulmoFoundation achieves 92.3% average AUC on 32 lung pathology tasks in prospective validation and raises pathologist accuracy from 83.8% to 91.7% in a crossover RCT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20525","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NeuroQA: A Large-Scale Image-Grounded Benchmark for 3D Brain MRI Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-19T21:54:12+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"NeuroQA is a large-scale 3D brain MRI visual question answering benchmark with verified image-grounded QA pairs, multi-domain coverage, and baseline evaluations showing current models lag behind text-only performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20425","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AgentCo-op: Retrieval-Based Synthesis of Interoperable Multi-Agent Workflows","primary_cat":"cs.AI","submitted_at":"2026-05-19T19:22:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AgentCo-op retrieves and assembles existing agents and tools into interoperable workflows for open-world scientific tasks, showing effectiveness in genomics case studies and competitive benchmark results with lower costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16630","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PrivScope: Task-scoped Disclosure Control for Hybrid Agentic Systems","primary_cat":"cs.CR","submitted_at":"2026-05-15T20:53:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PrivScope enforces task-scoped disclosure at the local-cloud boundary in hybrid agents, eliminating profile leakage and halving re-identification risk on medical workflows while preserving task success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10286","ref_index":45,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AgentRx: A Benchmark Study of LLM Agents for Multimodal Clinical Prediction Tasks","primary_cat":"cs.AI","submitted_at":"2026-05-11T09:46:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Single-agent LLM frameworks outperform naive multi-agent systems in multimodal clinical risk prediction tasks and are better calibrated.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08813","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AgentSlimming: Towards Efficient and Cost-Aware Multi-Agent Systems","primary_cat":"cs.LG","submitted_at":"2026-05-09T09:03:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AgentSlimming compresses graph-structured multi-agent systems by estimating agent importance and removing or replacing low-value agents, cutting token costs by up to 78.9% with negligible performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05715","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Decodable but Not Corrected by Fixed Residual-Stream Linear Steering: Evidence from Medical LLM Failure Regimes","primary_cat":"cs.AI","submitted_at":"2026-05-07T05:58:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Overthinking in medical QA is linearly decodable at 71.6% accuracy yet fixed residual-stream steering yields no correction across 29 configurations, while enabling selective abstention with AUROC 0.610.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04012","ref_index":6,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SymptomAI: Toward a Conversational AI Agent for Everyday Symptom Assessment","primary_cat":"cs.AI","submitted_at":"2026-05-05T17:36:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Large real-world deployment found conversational AI agents for everyday symptom assessment more accurate than clinicians and improved by structured interviewing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15203","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MADE: A Living Benchmark for Multi-Label Text Classification with Uncertainty Quantification of Medical Device Adverse Events","primary_cat":"cs.CL","submitted_at":"2026-04-16T16:28:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MADE creates a contamination-resistant living benchmark for multi-label classification of medical device adverse events, with evaluations revealing model-specific trade-offs in accuracy and uncertainty quantification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10535","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Evaluating Small Open LLMs for Medical Question Answering: A Practical Framework","primary_cat":"cs.IR","submitted_at":"2026-04-12T08:56:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Small open LLMs produce highly variable medical answers even at low temperature, with self-agreement at most 0.20 and 87-97% unique outputs per model across 10 runs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08559","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Medical Reasoning with Large Language Models: A Survey and MR-Bench","primary_cat":"cs.CL","submitted_at":"2026-03-17T09:03:09+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs show strong exam performance on medical tasks but exhibit a clear gap in accuracy on authentic clinical decision-making as measured by the new MR-Bench benchmark and unified evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.05308","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Med-V1: Small Language Models for Zero-shot and Scalable Biomedical Evidence Attribution","primary_cat":"cs.CL","submitted_at":"2026-03-05T15:48:43+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.24186","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Measuring Competency, Not Performance: Item-Aware Evaluation Across Medical Benchmarks","primary_cat":"cs.CL","submitted_at":"2025-09-29T02:06:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MedIRT applies Item Response Theory to medical LLM benchmarks to separate latent competency from item difficulty and discrimination, producing more stable rankings and revealing domain heterogeneity than accuracy alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.05012","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Making Prompts First-Class Citizens for Adaptive LLM Pipelines","primary_cat":"cs.DB","submitted_at":"2025-08-07T03:49:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SPEAR proposes structured prompt views, runtime adaptive refinement, and policy rules to make prompts first-class, versioned, and evolvable components in complex LLM applications.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.01990","ref_index":183,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Advances and Challenges in Foundation Agents: From Brain-Inspired Intelligence to Evolutionary, Collaborative, and Safe Systems","primary_cat":"cs.AI","submitted_at":"2025-03-31T18:00:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"This survey frames foundation agents using brain-inspired modular architectures and reviews challenges in evolution, collaboration, and safety.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"their logical deduction capabilities to solve problems through flexible reasoning processes. Chain-of-Thought VariantsThe cornerstone of prompting-based reasoning is Chain-of-Thought (CoT) prompting [73], which operationalizes reasoning through few-shot examples with explicit generation of intermediate rationalization steps. This foundational technique has inspired several evolutionary variants thatenhanceitsbasicapproach. Zero-shotCoT[ 183]eliminatestheneedfordemonstrationexamplesthrough strategicprompting(e.g.,\"Let'sthinkstepbystep\"),makingtheapproachmoreaccessiblewhilemaintaining effectiveness. Auto-CoT [184] automates the creation of effective demonstrations by clustering diverse questions and generating reasoning chains for representative examples from each cluster. Least-to-Most"},{"citing_arxiv_id":"2502.07143","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Ask Patients with Patience: Enabling LLMs for Human-Centric Medical Dialogue with Grounded Reasoning","primary_cat":"cs.CL","submitted_at":"2025-02-11T00:13:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"APP is a multi-turn LLM framework for medical dialogue that combines empathetic questioning, Bayesian active learning, and guideline-based reasoning, outperforming baselines on a new simulated-patient benchmark in accuracy, uncertainty reduction, and user experience.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.18925","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HuatuoGPT-o1, Towards Medical Complex Reasoning with LLMs","primary_cat":"cs.CL","submitted_at":"2024-12-25T15:12:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HuatuoGPT-o1 achieves superior medical complex reasoning by using a verifier to curate reasoning trajectories for fine-tuning and then applying RL with verifier-based rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.21276","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"GPT-4o System Card","primary_cat":"cs.CL","submitted_at":"2024-10-25T17:43:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GPT-4o is OpenAI's end-to-end multimodal model with human-like audio latency, improved non-English text performance, stronger vision and audio understanding, and accompanying safety evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.07496","ref_index":66,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TextGrad: Automatic \"Differentiation\" via Text","primary_cat":"cs.CL","submitted_at":"2024-06-11T17:32:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TextGrad performs automatic differentiation for compound AI systems by backpropagating natural-language feedback from LLMs to optimize variables ranging from code to molecular structures.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"We report the averages across five plans and with standard deviation included in the bracket. 4 Related work One related thread of work investigated the problem of prompt optimization. Practitioners demonstrated that prompt engineering strategies such as intelligently picking few-shot examples and in-context learning, CoT, ensembles can significantly boost performance of LLMs [66]. To automate this process, white-box methods that leverage numerical gradients were developed to optimize prompts [67-70], however, these methods cannot be used with closed-source models as they require access to model parameters. Various works investigated using LLMs as prompt optimizers [12, 25, 71]. Under prompt optimization, there are two works closest to our philosophy that have been our inspi-"},{"citing_arxiv_id":"2405.07960","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AgentClinic: a multimodal agent benchmark to evaluate AI in simulated clinical environments","primary_cat":"cs.HC","submitted_at":"2024-05-13T17:38:53+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"AgentClinic is a multimodal agent benchmark demonstrating that LLM diagnostic accuracy on MedQA drops to below one-tenth in sequential clinical simulations, with Claude-3.5 leading and large tool-use differences across models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.18416","ref_index":179,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Capabilities of Gemini Models in Medicine","primary_cat":"cs.AI","submitted_at":"2024-04-29T04:11:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Med-Gemini sets new records on 10 of 14 medical benchmarks including 91.1% on MedQA-USMLE, beats GPT-4V by 44.5% on multimodal tasks, and surpasses humans on medical text summarization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.07345","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Can an LLM Learn Preferences from Choice Data?","primary_cat":"econ.GN","submitted_at":"2024-01-14T19:05:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs show improving recommendation accuracy with more observed choices under the disappointment aversion model, but learning success is heterogeneous across models and preference parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}