{"total":145,"items":[{"citing_arxiv_id":"2606.22495","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grounded Scaling: Why Agentic AI Needs Deterministic Environments","primary_cat":"cs.AI","submitted_at":"2026-06-21T13:34:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Agentic AI scaling requires deterministic environments because per-step success probability below 1 causes exponential degradation in k-step chains, addressed via new metrics SCI and DMM plus formal bounds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10106","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What makes a harness a harness: necessary and sufficient conditions for an agent harness","primary_cat":"cs.SE","submitted_at":"2026-06-08T19:35:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes and tests a constitutive definition of 'agent harness' via conceptual analysis of literature and six real systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05976","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Self-Correction Illusion: LLMs Correct Others but Not Themselves","primary_cat":"cs.AI","submitted_at":"2026-06-04T10:17:00+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Relabeling an identical erroneous claim from the model's own thought role to an external chat role increases explicit correction rates by 23-93 percentage points across 13 model-domain cells, indicating a chat-template artifact rather than a cognitive deficit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01188","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"pcbGPT: Automatic PCB Schematic Synthesis from Natural Language Requirements","primary_cat":"cs.HC","submitted_at":"2026-05-31T12:07:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"pcbGPT generates KiCad schematics from natural language with 0.90 pass@1 on 20 embedded tasks via tool-augmented LLM synthesis and multi-stage validation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00603","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Agentic Governance: What Shapes LLM-Agent Intervention in Public Forums?","primary_cat":"cs.CY","submitted_at":"2026-05-30T08:01:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Four deployment choices—model version, open/closed weight status, provider, and system prompt—each alter LLM-agent intervention rates on forum posts, with closed-weight models declining more on visible challenges than open-weight models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00579","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sandboxed Coding Agents are Competitive Omni-modal Task Solvers","primary_cat":"cs.CL","submitted_at":"2026-05-30T07:04:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sandboxed coding agents with text+image access match or outperform native omnimodal models on audio-video benchmarks by converting tasks into code-driven retrieval and processing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31408","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill Availability and Presentation Granularity in Large-Language-Model Agents: A Controlled SkillsBench Study","primary_cat":"cs.CL","submitted_at":"2026-05-29T15:12:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"In a 30-task SkillsBench study, skill availability boosts GPT-5.5 and DeepSeek V4-Flash agent pass rates substantially while presentation-granularity variations yield small uncertain effects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30862","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sophrosyne: Agentic Exploration of Relational Data Systems Needs Moderation","primary_cat":"cs.DB","submitted_at":"2026-05-29T05:37:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sophrosyne augments fine-grained data APIs with directives to curb over-exploration by Text2SQL agents, reducing it 4.6x and improving accuracy up to 4 points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22905","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EVE-Agent: Evidence-Verifiable Self-Evolving Agents","primary_cat":"cs.AI","submitted_at":"2026-05-21T17:47:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EVE-Agent adds an evidence verifier to the proposer-solver loop that rewards spans by marginal accuracy gain, producing self-generated but inspectable training examples for search agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22343","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sibyl-AutoResearch: Autonomous Research Needs Self-Evolving Trial-and-Error Harnesses, Not Paper Generators","primary_cat":"cs.MA","submitted_at":"2026-05-21T11:29:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sibyl-AutoResearch introduces self-evolving trial-and-error harnesses with auditable conversion units that link trial signals to updated research behaviors and harness repairs in autonomous systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20833","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemGym: a Long-Horizon Memory Environment for LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-05-20T07:25:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemGym unifies agent gyms into a memory benchmark with isolated scoring across tool-use, research, coding, and computer-use regimes plus a lightweight reward model for tractable coding evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20315","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mix-Quant: Quantized Prefilling, Precise Decoding for Agentic LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-19T17:50:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mix-Quant quantizes prefilling to NVFP4 and keeps BF16 for decoding in agentic LLMs, achieving up to 3x prefilling speedup while largely preserving task performance on long-context and agentic benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17774","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Internalizing Tool Knowledge in Small Language Models via QLoRA Fine-Tuning","primary_cat":"cs.CL","submitted_at":"2026-05-18T02:48:46+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17679","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PULSE: Agentic Investigation with Passive Sensing for Proactive Intervention in Cancer Survivorship","primary_cat":"cs.HC","submitted_at":"2026-05-17T22:39:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PULSE demonstrates that agentic LLM-based investigation of passive smartphone sensing data achieves balanced accuracies of 0.743 (with diary) and 0.713 (sensing-only) for predicting emotion regulation desire and intervention availability in 50 cancer survivors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17625","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Episodic-Semantic Memory Architecture for Long-Horizon Scientific Agents","primary_cat":"cs.AI","submitted_at":"2026-05-17T19:44:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A dual-process memory architecture for scientific AI agents maintains 70-85% accuracy over 15,000 messages by using a constant 10-message episodic window and domain-specific semantic consolidation, consuming 62% fewer tokens than full-context baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17318","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RooAgent: An LLM Agent for Root-Based High Energy Physics Analysis","primary_cat":"hep-ph","submitted_at":"2026-05-17T08:20:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RooAgent provides an LLM agent interface that translates natural-language prompts into calls to PyROOT analysis functions for high energy physics tasks, with support for multiple AI backends and tested on ZH simulations and ATLAS open data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14558","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Resolving Action Bottleneck: Agentic Reinforcement Learning Informed by Token-Level Energy","primary_cat":"cs.LG","submitted_at":"2026-05-14T08:33:02+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ActFocus resolves the action bottleneck in agentic RL by reweighting token gradients toward action tokens using observed reward variance and an energy-based uncertainty term, outperforming PPO and GRPO by up to 65 percentage points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14241","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latency-Quality Routing for Functionally Equivalent Tools in LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-05-14T01:14:13+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14237","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Good to Go: The LOOP Skill Engine That Hits 99% Success and Slashes Token Usage by 99% via One-Shot Recording and Deterministic Replay","primary_cat":"cs.AI","submitted_at":"2026-05-14T01:05:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The LOOP Skill Engine records one LLM-powered run of a periodic task and converts it into a deterministic replay template that eliminates further LLM usage while maintaining high success rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14186","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLMs Know When They Know, but Do Not Act on It: A Metacognitive Harness for Test-time Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-13T23:09:25+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A metacognitive harness uses LLMs' pre- and post-solution self-monitoring signals to control test-time reasoning, raising pooled accuracy from 48.3% to 56.9% on text, code, and multimodal benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14102","ref_index":3,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ChromaFlow: A Negative Ablation Study of Orchestration Overhead in Tool-Augmented Agent Evaluation","primary_cat":"cs.AI","submitted_at":"2026-05-13T20:40:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ChromaFlow reports a negative ablation in which expanded orchestration on GAIA Level-1 tasks reduced accuracy and increased tracebacks, timeouts, and token costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13119","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Long-horizon Embodied Agents with Tool-Aligned Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2026-05-13T07:40:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLAs-as-Tools pairs a VLM planner with specialized VLA executors via a new interface and Tool-Aligned Post-Training to raise long-horizon robot success rates on LIBERO-Long and RoboTwin benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11920","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Domain Restriction via Multi SAE Layer Transitions","primary_cat":"cs.AI","submitted_at":"2026-05-12T10:36:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multi-layer SAE transitions capture domain-specific signatures that distinguish OOD texts in Gemma-2 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11442","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Can a Single Message Paralyze the AI Infrastructure? The Rise of AbO-DDoS Attacks through Targeted Mobius Injection","primary_cat":"cs.CR","submitted_at":"2026-05-12T02:51:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Mobius Injection exploits semantic closure in LLM agents to enable single-message AbO-DDoS attacks achieving up to 51x call amplification and 229x latency inflation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This work is expected to serve as a critical wake-up call for the security of the rapidly evolving AI-agentic infrastructure. I. INTRODUCTION The rapid evolution of Large Language Models (LLMs) has catalyzed the transition from passive chatbots to autonomous LLM Agents [1], [2]. These agents, represented by Claude Code [3], OpenClaw [4], and various frameworks [5], [6], are characterized by their ability to perform complex reasoning and execute actions via external \"skills\" or tools [7], [6]. By integrating with local file systems [3], [8], web browsers [9], [10], and third-party APIs through skills or the Model Context Protocol [10], [7], [11], LLM agents have become indispens- able components of modern productivity, effectively acting as"},{"citing_arxiv_id":"2605.11280","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Discovery of Interpretable Surrogates via Agentic AI: Application to Gravitational Waves","primary_cat":"gr-qc","submitted_at":"2026-05-11T22:09:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GWAgent agentic workflow produces analytic surrogates for eccentric BBH waveforms with 6.9e-4 median mismatch and 8.4x speedup, outperforming baselines, and infers eccentricity for GW200129.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"arXiv e-printsarXiv:2201.11903 (2022), doi:10.48550/arXiv.2201.11903. [13] S. Yao,et al., ReAct: Synergizing Reasoning and Acting in Language Models.arXiv e-printsarXiv:2210.03629 (2022), doi:10.48550/arXiv.2210.03629. [14] T. Schick,et al., Toolformer: Language Models Can Teach Themselves to Use Tools.arXiv e-printsarXiv:2302.04761 (2023), doi:10.48550/arXiv.2302.04761. [15] F. Villaescusa-Navarro,et al., The Denario project: Deep knowledge AI agents for scientific discovery.ArXivabs/2510.26887 (2025),https://api.semanticscholar.org/CorpusID:282719399. [16] J. Yang,et al., SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering.arXiv e-prints arXiv:2405.15793 (2024), doi:10.48550/arXiv.2405.15793. [17] C."},{"citing_arxiv_id":"2605.10834","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Controlled to the Wild: Evaluation of Pentesting Agents for the Real-World","primary_cat":"cs.AI","submitted_at":"2026-05-11T16:50:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A practical evaluation protocol for AI pentesting agents that uses validated vulnerability discovery, LLM semantic matching, and bipartite scoring to assess performance in realistic, complex targets.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Taken together, these contributions aim to make evaluation more useful not only for measuring progress, but for making better decisions about which AI pentesting systems are actually worth deploying. 2 Related work Existing evaluation work for AI pentesting agents can be grouped into three main lines. The first relies on CTF-style environments, as in [9], [49], [27], [20], and parts of [25]. These benchmarks provide controlled tasks and cheap automatic scoring, but they usually reduce success to capture-the-flag in closed settings. As a result, they are useful for measuring isolated offensive capability, yet they only weakly reflect realistic pentesting, where agents must explore noisy targets, decide where to focus,"},{"citing_arxiv_id":"2605.09894","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deterministic vs. LLM-Controlled Orchestration for COBOL-to-Python Modernization","primary_cat":"cs.SE","submitted_at":"2026-05-11T02:34:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Deterministic orchestration matches LLM-controlled methods in COBOL-to-Python translation accuracy but improves worst-case robustness, reduces run-to-run variability, and cuts token consumption by up to 3.5 times.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09423","ref_index":68,"ref_count":4,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SimWorld Studio: Automatic Environment Generation with Evolving Coding Agent for Embodied Agent Learning","primary_cat":"cs.AI","submitted_at":"2026-05-10T08:51:50+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"SimWorld Studio deploys an evolving coding agent to create adaptive 3D environments that co-evolve with embodied learners, delivering 18-point success-rate gains over fixed environments in navigation benchmarks.","context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"CARLA [19] UE4✗+++✗ ✗ ✗ ThreeDWorld [25] Unity✗++✗ ✗ ✗ AI2-THOR [40] Unity✗++✗ ✗ ✗ MineDojo [22] Minecraft✗+✓ ✗ ✗ ProcTHOR [16] Unity✓++✗ ✗ ✗ Habitat 3.0 [56] Habitat-Sim✗++✓ ✗ ✗ MetaUrban [81] PyBullet✓++✓ ✗ ✗ GRUtopia [73] Isaac Sim✓++✓ ✗ ✗ EmbodiedCity [26] UE4✗+++✗ ✗ ✗ UnrealZoo [102] UE4/5✗+++✓ ✗ ✗ Virtual Community [103] Genesis✓++✗ ✗ ✗ VirtualEnv [68] UE5✗+++✗ ✗ ✗ Holodeck [89] AI2-THOR/Unity✓++✗ ✗ ✗ SAGE [83] Isaac Sim✓+++✗ ✓ ✗ GenEnv [28] AlfWorld/Text-only✗+✗ ✗ ✓ SIMWORLDSTUDIOUE5✓+++✓ ✓ ✓ 4 Related Work Embodied simulation platforms.Embodied AI research relies on three families of interactive simulators, each with structural limitations.Hand-built indoor and outdoor platformssupport navigation, manipulation, driving, urban robotics, and language-grounded games [ 40, 42, 56, 43,"},{"citing_arxiv_id":"2605.10999","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillGen: Verified Inference-Time Agent Skill Synthesis","primary_cat":"cs.LG","submitted_at":"2026-05-09T19:24:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillGen synthesizes auditable skills from agent trajectories via contrastive induction on successes and failures, then verifies net performance impact by comparing outcomes with and without the skill on identical tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"In this view,\"repairs\"correspond to Y 0 = 0→Y s = 1, while\"regressions\"correspond toY 0 = 1→Y s = 0. Comparative metrics.We aggregate outcomes via nαβ(s) = mX j=1 1{Y 0(˜xj) =α, Y s(˜xj) =β},(11) with repairs n01(s) and regressions n10(s). The empirical net-effect under this comparison isb∆m(s), b∆m(s) = 1 m mX j=1 Y s(˜xj)−Y 0(˜xj) \u0001 = n01(s)−n 10(s) m , G m(s) =n 01(s)−n 10(s).(12) 5 For a fixed, non-adaptively chosen skill and i.i.d. verification instances,E[b∆m(s)] = ∆(s). •(iii) Refinement: Refinement uses structured feedback to update the skill. • Feedback signals. After each round, the verification agent summarizes the diagnostic evidence rather than sending raw trajectories back to the generation agent. For this, the verification agent partitions instances into"},{"citing_arxiv_id":"2605.09033","ref_index":2,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ShadowMerge: A Novel Poisoning Attack on Graph-Based Agent Memory via Relation-Channel Conflicts","primary_cat":"cs.CR","submitted_at":"2026-05-09T16:16:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ShadowMerge exploits relation-channel conflicts to poison graph-based agent memory, achieving 93.8% average attack success rate on Mem0 and real-world datasets while bypassing existing defenses.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Defense analysis further shows that existing representative input-side defenses are insufficient to mitigate SHADOWMERGE. We have responsibly disclosed our findings to affected graph-memory vendors and open sourced SHADOWMERGE at https://anonymous.4open.science/status/S hadowMerge -033C. I. INTRODUCTION LLM agents are moving from single-turn chatbots [1], [2] toward long-running systems that remember, adapt, and act across repeated interactions [3], [4], [5]. Persistent mem- ory [6], [7], [8], [9], [10] enables this shift by allowing agents to reuse past tool outcomes, maintain user preferences, and carry task context across sessions. Among memory designs, graph-based agent memory [11], [12], [13] has emerged as"},{"citing_arxiv_id":"2605.08904","ref_index":133,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OPT-BENCH: Evaluating the Iterative Self-Optimization of LLM Agents in Large-Scale Search Spaces","primary_cat":"cs.AI","submitted_at":"2026-05-09T11:51:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPT-BENCH and OPT-Agent evaluate LLM self-optimization in large search spaces, showing stronger models improve via feedback but stay constrained by base capacity and below human performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08386","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillLens: Adaptive Multi-Granularity Skill Reuse for Cost-Efficient LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-08T18:48:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillLens organizes skills into policies-strategies-procedures-primitives layers, retrieves via degree-corrected random walk, and uses a verifier for local adaptation, yielding up to 6.31 pp gains on MuLocbench and raising ALFWorld success from 45% to 51.31%.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"for externalizing such procedural experience. Instead of relying on the model to rediscover useful behavior from scratch for every new task, an agent can retrieve relevant prior skills and use them as task-time guidance without retraining the underlying model. Existing approaches to skill externalization fall broadly into three lines: episodic verbal-feedback memories such as Reflexion [17]; flat skill libraries that incrementally collect executable behaviors, exemplified by V oyager [21] and SkillAct [9]; and natural-language manuals or insights distilled from trajectories, such as ExpeL [34] and AutoManual [3]. More recent systems push toward richer memory infrastructures, including dynamically linked agentic memory [ 25], procedural memory lifecycle"},{"citing_arxiv_id":"2605.07836","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unsafe by Flow: Uncovering Bidirectional Data-Flow Risks in MCP Ecosystem","primary_cat":"cs.SE","submitted_at":"2026-05-08T15:03:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MCP-BiFlow detects 93.8% of known bidirectional data-flow vulnerabilities in MCP servers and identifies 118 confirmed issues across 87 real-world servers from a scan of 15,452 repositories.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"MCP settings, both directions are security-relevant because tool inputs may trigger privileged effects and tool outputs may re-enter LLM-mediated reasoning. Existing work does not fully address this setting. Recent MCP security studies have mainly characterized ecosystem risks, attack patterns, and defenses [21, 23, 50, 66], while general-purpose an- alyzers such as CodeQL [15] and Semgrep [52] are not designed to recover MCP-specific interaction structure end to end. System- atic MCP analysis faces three concrete difficulties. (1) MCP servers expose externally reachable functionality through diverse tool regis- tration, protocol-dispatch, and wrapper patterns rather than stable route abstractions. (2) Generic taint templates are often mismatched"},{"citing_arxiv_id":"2605.07358","ref_index":21,"ref_count":4,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Comprehensive Survey on Agent Skills: Taxonomy, Techniques, and Applications","primary_cat":"cs.IR","submitted_at":"2026-05-08T07:10:26+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"anda t is the resulting action. Recent systems such as Open- Claw and Manus exemplify this shift from passive response generation to action-oriented execution [7], [8]. What separates modern agents from standalone LLMs is that they canact: they query external systems, invoke tools, write and execute code, and coordinate with other agents [5], [6], [21]. That action capability is the foundation of the capability stack studied in this survey. B. Agent Knowledge and the Procedural Gap Agent behavior depends not only on the reasoning ability of the base model, but also on what knowledge is available when the agent must act. Here, we provide a simple distinction between passive and active knowledge to clarify where agent"},{"citing_arxiv_id":"2605.07313","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Stored Evidence Stops Being Usable: Scale-Conditioned Evaluation of Agent Memory","primary_cat":"cs.AI","submitted_at":"2026-05-08T06:22:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new evaluation protocol shows agent memory reliability degrades variably with added irrelevant sessions depending on agent, memory interface, and scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07248","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PaT: Planning-after-Trial for Efficient Test-Time Code Generation","primary_cat":"cs.CL","submitted_at":"2026-05-08T05:09:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PaT defers planning until after failed trials in LLM code generation, enabling heterogeneous cheap-plus-powerful model setups that match large-model performance at roughly 69% lower cost.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"these components gives the total asymptotic cost: nk n−1 \u0012 cM + DL pM \u0013 .(20) For the heterogeneous setting to be more cost- efficient than the homogeneous one, it requires: nk n−1 \u0012 cs + DL ps \u0013 < nk n−1 \u0012 cL + DL pL \u0013 . (21) The nk n−1 term cancels. Rearranging the remain- ing terms to solve for DL yields the condition stated in the theorem: DL < cL −c s 1 ps − 1 pL .(22) This theorem provides a theoretical basis for the efficiency of heterogeneous model configuration. It indicates that the heterogeneous configuration becomes more cost-efficient when the cost of de- composition (DL) is less than the savings gener- ated by executing the sub-problems with a more cost-effective model (s). This provides a formal Generator Planner HumanEval MBPP Avg."},{"citing_arxiv_id":"2605.06890","ref_index":2,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond the Black Box: Interpretability of Agentic AI Tool Use","primary_cat":"cs.AI","submitted_at":"2026-05-07T19:47:30+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06584","ref_index":20,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NeuroAgent: LLM Agents for Multimodal Neuroimaging Analysis and Research","primary_cat":"cs.AI","submitted_at":"2026-05-07T17:13:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NeuroAgent uses a hierarchical LLM agent framework with Generate-Execute-Validate loops to automate neuroimaging preprocessing, reaching 84.8% end-to-end correctness and 0.9518 AUC for Alzheimer's classification on 1470 ADNI subjects using four modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06737","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Self-Healing Framework for Reliable LLM-Based Autonomous Agents","primary_cat":"cs.SE","submitted_at":"2026-05-07T13:10:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A framework that monitors LLM agent behavior, assesses reliability, and automatically heals failures to raise task success rates in multi-agent workflows.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"models can learn to use external tools to enhance task performance. HuggingGPT [ 3] further extends this idea by orchestrating multiple AI models to solve complex task s collaboratively. In addition, generative agent frameworks simulate human -like behaviors using LLMs, highlighting the potential of agent -based systems in interactive environments [ 12]. While these approaches demonstrate strong capabilities, they primarily focus on task performance and lack mechanisms for ensuring reliability and robustness during execution. 2.2. Reliability and Failure Handling in LLM Systems Reliability issues in LLMs have attracted increasing attention, particularly in relation to hallucination and reasoning errors [ 4, 13]."},{"citing_arxiv_id":"2605.05758","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BioTool: A Comprehensive Tool-Calling Dataset for Enhancing Biomedical Capabilities of Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-07T06:53:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BioTool dataset enables fine-tuning a 4B-parameter LLM to outperform GPT-5.1 in biomedical tool calling while improving downstream answer quality per human experts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05287","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Securing the Agent: Vendor-Neutral, Multitenant Enterprise Retrieval and Tool Use","primary_cat":"cs.CR","submitted_at":"2026-05-06T17:59:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A server-side architecture with policy-aware ingestion and ABAC-based retrieval gating prevents cross-tenant data leakage in multitenant enterprise RAG and agent systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03989","ref_index":12,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Agent-Oriented Pluggable Experience-RAG Skill for Experience-Driven Retrieval Strategy Orchestration","primary_cat":"cs.AI","submitted_at":"2026-05-05T17:10:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Experience-RAG Skill is a reusable agent skill that selects retrieval strategies via experience memory, achieving 0.8924 nDCG@10 on BeIR/nq, hotpotqa, and scifact while outperforming fixed retriever baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03312","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemFlow: Intent-Driven Memory Orchestration for Small Language Model Agents","primary_cat":"cs.MA","submitted_at":"2026-05-05T02:57:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemFlow routes queries by intent to tiered memory operations, nearly doubling accuracy of a 1.7B SLM on long-horizon benchmarks compared to full-context baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"in resource-constrained deployments [1, 24, 35, 6, 52]. RAG [22, 21] mitigates overflow by selecting evidence, but uniform retrieval cannot serve the structural diversity of long-horizon queries [43, 4]: preferences, timelines, knowledge updates, and multi-session synthesis require different retrieval strategies, transformations, and budgets. ReAct- style reasoning [46] and learned tool use [40] provide flexibility, but sub-3B agents frequently produce hallucinated calls or broken reasoning traces [12]. We therefore frame a substantial portion of SLM memory failure as anintent-routing mismatch: for SLM agents, the central question is not only what to retrieve, but which memory operation the query requires. Preprint. arXiv:2605.03312v1 [cs."},{"citing_arxiv_id":"2605.01698","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BIM Information Extraction Through LLM-based Adaptive Exploration","primary_cat":"cs.CL","submitted_at":"2026-05-03T03:40:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM adaptive exploration via runtime code execution outperforms static query generation for information extraction from heterogeneous BIM models on the new ifc-bench v2 benchmark.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"tor programs into reusable libraries, achieving approximately 26% improve- ment on date-understanding tasks. Huang et al. [38] report improvements in code generation accuracy when using multi-agent architectures with indepen- dent test generation compared to single-agent baselines. Other frameworks (HuggingGPT [39], Gorilla [40], Toolformer [41]) further establish the fea- sibility of automated tool creation. TroVE [42] additionally reports 31% faster human verification of generated toolboxes, though compute-matched re-evaluation [43] suggests its accuracy gains may reflect higher compute al- location rather than the toolbox mechanism itself. However, none of these approaches has been applied to structured data extraction domains such as BIM. 12 3. Proposed Method"},{"citing_arxiv_id":"2605.01392","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Using LLMs in Software Design: An Empirical Study of GitHub and A Practitioner Survey","primary_cat":"cs.SE","submitted_at":"2026-05-02T11:29:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Developers use LLMs like ChatGPT mainly for knowledge acquisition and code generation at the detailed design level, reporting benefits such as better technology selection and early flaw detection alongside limitations like lengthy outputs, incorrect code, and hallucinations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00943","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ARIS: Agentic and Relationship Intelligence System for Social Robots","primary_cat":"cs.RO","submitted_at":"2026-05-01T07:11:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"ARIS integrates a graph-based Social World Model, RAG, and agentic architecture for social robots and reports higher user ratings for intelligence, animacy, anthropomorphism, and likeability than an LLM baseline in a 23-person study with the Pepper robot.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27132","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRUST: A Framework for Decentralized AI Service v.0.1","primary_cat":"cs.AI","submitted_at":"2026-04-29T19:32:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TRUST is a decentralized AI auditing framework that decomposes reasoning into HDAGs, maps agent interactions via the DAAN protocol to CIGs, and uses stake-weighted multi-tier consensus to achieve 72.4% accuracy while proving a Safety-Profitability Theorem that rewards honest auditors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24657","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AgentWard: A Lifecycle Security Architecture for Autonomous AI Agents","primary_cat":"cs.CR","submitted_at":"2026-04-27T16:22:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AgentWard organizes stage-specific security controls with cross-layer coordination to intercept threats across the full lifecycle of autonomous AI agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24579","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Measuring the Unmeasurable: Markov Chain Reliability for LLM Agents","primary_cat":"cs.SE","submitted_at":"2026-04-27T15:05:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TraceToChain models LLM agent traces as absorbing DTMCs using automatic clustering and smoothed MLE, with KS and AIC validation, to reconcile pass@k, pass^k, and RDC as projections of a single first-passage success-time distribution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24026","ref_index":23,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Skill Text to Skill Structure: The Scheduling-Structural-Logical Representation for Agent Skills","primary_cat":"cs.CL","submitted_at":"2026-04-27T04:25:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SSL representation disentangles skill scheduling, structure, and logic using an LLM normalizer, improving skill discovery MRR@50 from 0.649 to 0.729 and risk assessment macro F1 from 0.409 to 0.509 over text baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}