{"total":13,"items":[{"citing_arxiv_id":"2605.20176","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ClinSeekAgent: Automating Multimodal Evidence Seeking for Agentic Clinical Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-19T17:58:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClinSeekAgent automates active multimodal evidence seeking for clinical reasoning, improving LLM performance on raw EHR and CXR tasks while enabling distillation into smaller models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18570","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Query-Conditioned Knowledge Alignment for Reliable Cross-System Medical Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-18T15:49:46+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"QCEA reformulates entity alignment as a query-conditioned ranking task with semantic encoding, graph learning, and direction-aware transformation to handle context-dependent, asymmetric correspondences in medical knowledge graphs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10761","ref_index":109,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RadThinking: A Dataset for Longitudinal Clinical Reasoning in Radiology","primary_cat":"cs.CV","submitted_at":"2026-05-11T15:57:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RadThinking releases a large longitudinal CT VQA dataset stratified into foundation perception questions, single-rule reasoning questions, and compositional multi-step chains grounded in clinical reporting standards for cancer screening.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Medreason: Eliciting factual medical reasoning steps in llms via knowledge graphs.arXiv preprint arXiv:2504.00993, 2025. 16 [108] Z. Wu, X. Chen, Z. Pan, X. Liu, W. Liu, D. Dai, H. Gao, Y . Ma, C. Wu, B. Wang, et al. DeepSeek-VL2: Mixture-of-experts vision-language models for advanced multimodal under- standing.arXiv preprint arXiv:2412.10302, 2024. [109] Y . Xia, Q. Yu, L. Chu, S. Kawamoto, S. Park, F. Liu, J. Chen, Z. Zhu, B. Li, Z. Zhou, A. L. Yuille, E. K. Fishman, and R. H. Hruban. The felix project: Deep networks to detect pancreatic neoplasms.medRxiv, 2022. [110] G. Xu, P. Jin, H. Li, Y . Song, L. Sun, and L. Yuan. LLaV A-CoT: Let vision language models reason step-by-step.arXiv preprint arXiv:2411."},{"citing_arxiv_id":"2605.09584","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CLR-voyance: Reinforcing Open-Ended Reasoning for Inpatient Clinical Decision Support with Outcome-Aware Rubrics","primary_cat":"cs.CL","submitted_at":"2026-05-10T14:51:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLR-voyance reformulates inpatient reasoning as POMDP with clinician-validated outcome rubrics, yielding an 8B model that outperforms larger frontier models on the authors' new benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09505","ref_index":19,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"EpiGraph: Building Generalists for Evidence-Intensive Epilepsy Reasoning in the Wild","primary_cat":"cs.AI","submitted_at":"2026-05-10T12:27:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EpiGraph creates a heterogeneous epilepsy knowledge graph that boosts LLM performance on clinical reasoning tasks by 30-41% in pharmacogenomics when used with Graph-RAG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06177","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BioMedArena: An Open-source Toolkit for Building and Evaluating Biomedical Deep Research Agents","primary_cat":"cs.AI","submitted_at":"2026-05-07T12:57:18+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BioMedArena releases a standardized toolkit with 147 biomedical benchmarks, 75 tools, and six harnesses that achieve SOTA results on eight tasks with a +15.03 percentage point average lift.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01474","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReMedi: Reasoner for Medical Clinical Prediction","primary_cat":"cs.CL","submitted_at":"2026-05-02T14:44:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ReMedi boosts LLM performance on EHR clinical predictions by up to 19.9% F1 through ground-truth-guided rationale regeneration and fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26283","ref_index":52,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MedSynapse-V: Bridging Visual Perception and Clinical Intuition via Latent Memory Evolution","primary_cat":"cs.CV","submitted_at":"2026-04-29T04:23:35+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"counterfactual refinement, and autonomous latent memory internalization. that enables near-instantaneous pattern recognition against accumulated case knowledge [4,35,48]. Although medical vision-language models (VLMs) have made substantial progress in diagnostic assistance [6,24,31,41,51], with rein- forcement learning from verifiable rewards [19,37,44,45] and chain-of-thought (CoT)[7,12,21,49,50,52]furtheradvancingreasoningcapabilities.However,their intrinsic reliance on discrete tokens engenders a profoundCognitive Misalign- mentwith the inherently continuous nature of clinical expertise. As illustrated in Fig. 1, the limited granularity of a fixed vocabulary is inadequate for rep- resenting continuous pathological features such as gradual transitions in lesion"},{"citing_arxiv_id":"2604.23356","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VeriLLMed: Interactive Visual Debugging of Medical Large Language Models with Knowledge Graphs","primary_cat":"cs.CL","submitted_at":"2026-04-25T15:46:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VeriLLMed is an interactive visual debugging tool that maps LLM diagnostic reasoning to knowledge graphs to identify and categorize relation, branch, and missing errors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15456","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DeepER-Med: Advancing Deep Evidence-Based Research in Medicine Through Agentic AI","primary_cat":"cs.AI","submitted_at":"2026-04-16T18:17:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DeepER-Med introduces a three-module agentic AI workflow for evidence-based medical research that outperforms production platforms on a new expert-curated dataset of 100 questions and matches clinical recommendations in seven of eight real-world cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.27820","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Improving Clinical Diagnosis with Counterfactual Multi-Agent Reasoning","primary_cat":"cs.CL","submitted_at":"2026-03-29T19:14:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new counterfactual multi-agent framework improves LLM diagnostic accuracy by quantifying confidence shifts from edited clinical findings and guiding specialist discussions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08559","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Medical Reasoning with Large Language Models: A Survey and MR-Bench","primary_cat":"cs.CL","submitted_at":"2026-03-17T09:03:09+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLMs show strong exam performance on medical tasks but exhibit a clear gap in accuracy on authentic clinical decision-making as measured by the new MR-Bench benchmark and unified evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.23330","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Structured In-context Environment Scaling for Large Language Model Reasoning","primary_cat":"cs.CL","submitted_at":"2025-09-27T14:34:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SIE framework automatically constructs scalable, verifiable reasoning environments from structured data, improving in-domain performance and enabling generalization to out-of-domain math and logic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}