{"total":17,"items":[{"citing_arxiv_id":"2605.15184","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Is Grep All You Need? How Agent Harnesses Reshape Agentic Search","primary_cat":"cs.CL","submitted_at":"2026-05-14T17:58:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Grep retrieval generally outperforms vector retrieval in agentic search tasks, with performance varying strongly by agent harness and tool-calling style.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09544","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TIDE-Bench: Task-Aware and Diagnostic Evaluation of Tool-Integrated Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-10T13:56:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TIDE-Bench is a new benchmark for tool-integrated reasoning that combines diverse tasks, multi-aspect metrics covering answer quality, process reliability, efficiency and cost, plus filtered challenging test sets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09038","ref_index":24,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SearchSkill: Teaching LLMs to Use Search Tools with Evolving Skill Banks","primary_cat":"cs.AI","submitted_at":"2026-05-09T16:23:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SearchSkill improves exact match scores and retrieval efficiency on open-domain QA by conditioning LLM actions on skills from an evolving SkillBank updated from failure patterns via two-stage SFT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07675","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"FactoryBench: Evaluating Industrial Machine Understanding","primary_cat":"cs.AI","submitted_at":"2026-05-08T12:47:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FactoryBench reveals that frontier LLMs achieve under 50% on structured causal questions and under 18% on decision-making in industrial robotic telemetry.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00060","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TADI: Tool-Augmented Drilling Intelligence via Agentic LLM Orchestration over Heterogeneous Wellsite Data","primary_cat":"cs.AI","submitted_at":"2026-04-30T03:19:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TADI shows that domain-specialized tools orchestrated by an LLM over dual structured and semantic databases can convert heterogeneous wellsite data into evidence-grounded drilling intelligence, with tool design mattering more than model scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.13958","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ToolRL: Reward is All Tool Learning 
Needs","primary_cat":"cs.LG","submitted_at":"2025-04-16T21:45:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A principled reward design for tool selection and application in RL-trained LLMs delivers 17% gains over base models and 15% over SFT across benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.23218","ref_index":99,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OS-ATLAS: A Foundation Action Model for Generalist GUI Agents","primary_cat":"cs.CL","submitted_at":"2024-10-30T17:10:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OS-Atlas, trained on the largest open-source cross-platform GUI grounding corpus of 13 million elements, outperforms prior open-source models on six benchmarks across mobile, desktop, and web platforms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.13501","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Survey on the Memory Mechanism of Large Language Model based Agents","primary_cat":"cs.AI","submitted_at":"2024-04-21T01:49:46+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A systematic review of memory designs, evaluation methods, applications, limitations, and future directions for LLM-based agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.17297","ref_index":155,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"InternLM2 Technical Report","primary_cat":"cs.CL","submitted_at":"2024-03-26T00:53:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InternLM2 is a new open-source LLM that outperforms prior versions on 30 benchmarks and long-context tasks through scaled pre-training to 32k tokens and a conditional online RLHF alignment strategy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.02716","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Understanding the planning of LLM agents: A survey","primary_cat":"cs.AI","submitted_at":"2024-02-05T04:25:24+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A survey that provides a taxonomy of methods for improving planning in LLM-based agents across task decomposition, plan selection, external modules, reflection, and memory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.10774","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads","primary_cat":"cs.LG","submitted_at":"2024-01-19T15:48:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Medusa augments LLMs with multiple decoding heads and tree-based attention to predict and verify several tokens in parallel, yielding 2.2-3.6x inference speedup via two fine-tuning 
regimes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.07864","ref_index":95,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Rise and Potential of Large Language Model Based Agents: A Survey","primary_cat":"cs.AI","submitted_at":"2023-09-14T17:12:03+00:00","verdict":"ACCEPT","verdict_confidence":"HIGH","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper surveys the origins, frameworks, applications, and open challenges of AI agents built on large language models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"primary component of brain or controller of these agents and expand their perceptual and action space through strategies such as multimodal perception and tool utilization [90; 91; 92; 93; 94]. These LLM- based agents can exhibit reasoning and planning abilities comparable to symbolic agents through techniques like Chain-of-Thought (CoT) and problem decomposition [95; 96; 97; 98; 99; 100; 101]. They can also acquire interactive capabilities with the environment, akin to reactive agents, by learning from feedback and performing new actions [ 102; 103; 104]. Similarly, large language models undergo pre-training on large-scale corpora and demonstrate the capacity for few-shot and zero-shot generalization, allowing for seamless transfer between tasks without the need to update"},{"citing_arxiv_id":"2309.01219","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Siren's Song in the AI Ocean: A Survey on Hallucination in Large Language Models","primary_cat":"cs.CL","submitted_at":"2023-09-03T16:56:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A literature survey that taxonomizes hallucination phenomena in LLMs, reviews evaluation benchmarks, and analyzes approaches for their detection, explanation, and mitigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2308.11432","ref_index":151,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Survey on Large Language Model based Autonomous Agents","primary_cat":"cs.AI","submitted_at":"2023-08-22T13:30:37+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A survey of LLM-based autonomous agents that proposes a unified framework for their construction and reviews applications in social science, natural science, and engineering along with evaluation methods and future directions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2306.06070","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Mind2Web: Towards a Generalist Agent for the Web","primary_cat":"cs.CL","submitted_at":"2023-06-09T17:44:31+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Mind2Web is the first large-scale dataset of real-world web tasks for developing generalist language-guided agents that complete complex actions on diverse 
websites.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.18323","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models","primary_cat":"cs.CL","submitted_at":"2023-05-23T00:16:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReWOO decouples reasoning from tool observations in augmented language models, delivering 5x token efficiency and 4% higher accuracy on multi-step reasoning benchmarks like HotpotQA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2304.15010","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model","primary_cat":"cs.CV","submitted_at":"2023-04-28T17:59:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLaMA-Adapter V2 achieves open-ended visual instruction following in LLMs by unlocking more parameters, early fusion of visual tokens, and joint training on disjoint parameter groups with only 14M added parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}