{"total":44,"items":[{"citing_arxiv_id":"2606.10711","ref_index":127,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Agentic Web Requires New Normative Infrastructure","primary_cat":"cs.CY","submitted_at":"2026-06-09T11:15:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The agentic web requires new normative infrastructure of laws, norms, and practices to allow user-delegated AI agents to access online properties without being blocked as malicious bots.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10106","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What makes a harness a harness: necessary and sufficient conditions for an agent harness","primary_cat":"cs.SE","submitted_at":"2026-06-08T19:35:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes and tests a constitutive definition of 'agent harness' via conceptual analysis of literature and six real systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29532","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GUITestScape: Towards Open-set Evaluation on Exploratory GUI Testing","primary_cat":"cs.SE","submitted_at":"2026-05-28T07:47:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GUITestScape supplies an interactive benchmark for exploratory GUI testing and GUIJudge supplies an open-set process-aware evaluator that outperforms baselines on MLLM agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29400","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Architecture-Sensitive Supervised Fine-Tuning for Screen-Conditioned Action Prediction: A PiSAR Benchmark","primary_cat":"cs.AI","submitted_at":"2026-05-28T05:49:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Fine-tuned Qwen3-VL-8B reaches sem_sim 0.783 on PiSAR held-out set vs 0.46-0.48 for frontier zero-shot, while Gemma-4-26B scores 0.441.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26807","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HTMLCure: Turning Browser Experience into State Guided Repair for Interactive HTML","primary_cat":"cs.SE","submitted_at":"2026-05-26T10:22:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HTMLCure uses browser-executed interaction trajectories to diagnose and repair LLM HTML outputs, expanding 97K prompts into a 40K refined SFT set that lifts a 27B model to 50.6 on HTMLBench-400 and 81.2 on MiniAppBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25343","ref_index":124,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Native Multimodal Modeling: A Roadmap","primary_cat":"cs.CV","submitted_at":"2026-05-25T01:57:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A roadmap that defines architectural nativity for multimodal models and categorizes them into Multi-to-Text, Multi-to-Target, and Multi-to-Multi types while outlining an industrial pipeline toward unified transformer-based native multimodal modeling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23262","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Design and Report Benchmarks for Knowledge Work","primary_cat":"cs.AI","submitted_at":"2026-05-22T06:03:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes a three-step benchmark design method (define work activity, specify tested setting, score work product) derived from work studies and O*NET, demonstrated via three case analyses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19743","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EngiAI: A Multi-Agent Framework and Benchmark Suite for LLM-Driven Engineering Design","primary_cat":"cs.AI","submitted_at":"2026-05-19T12:12:09+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19149","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent Meltdowns: The Road to Hell Is Paved with Helpful Agents","primary_cat":"cs.CL","submitted_at":"2026-05-18T22:03:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper defines accidental meltdowns as unsafe agent behavior triggered by benign errors and reports that such meltdowns occur in 64.7% of evaluated rollouts across GPT, Grok, and Gemini agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19099","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DecisionBench: A Benchmark for Emergent Delegation in Long-Horizon Agentic Workflows","primary_cat":"cs.AI","submitted_at":"2026-05-18T20:37:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DecisionBench supplies a fixed task suite, model pool, delegation interface, and multi-axis metrics to evaluate emergent delegation, showing similar quality across awareness conditions but 15-31 point headroom under perfect delegation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18636","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPIKE: An Adaptive Dual Controller Framework for Cost-Efficient Long-Horizon Game Agents","primary_cat":"cs.CV","submitted_at":"2026-05-18T16:43:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPIKE dual-controller framework raises success rates 5-9 points and cuts tokens 55% in StarDojo agents by reusing strategic plans across stable segments and escalating only at detected events.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13527","ref_index":8,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MMSkills: Towards Multimodal Skills for General Visual Agents","primary_cat":"cs.AI","submitted_at":"2026-05-13T13:40:31+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"3 Skill Generator from Public Trajectories We build MMSkills from public interaction trajectories that are separate from evaluation tasks. A trajectory is τi = (I i, Oi,1:Ti , Ai,1:Ti ),(7) whereI i is the task instruction,O i,t are visual observations,A i,t are executed actions. The Generator is controlled by a reusable multimodal-skill-factory meta-skillF: GF :T d 7→ M d,(8) 4 whereT d is the public trajectory pool for domaindandM d is the generated domain skill library. The pipeline comprises five stages: Td Phase 0: embed+cluster − − − − − − − − − − − − → Cd Phase 1: cluster plan − − − − − − − − − − → Ad Phase 2: merge − − − − − − − − → Rd Phase 3: text draft − − − − − − − − − →cMd Phase 4: image ground+audit − − − − − − − − − − − − − − − → Md."},{"citing_arxiv_id":"2605.11533","ref_index":5,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Checkup2Action: A Multimodal Clinical Check-up Report Dataset for Patient-Oriented Action Card Generation","primary_cat":"cs.CL","submitted_at":"2026-05-12T04:58:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Checkup2Action is a new multimodal dataset and benchmark for generating safe, prioritized action cards from real-world clinical check-up reports using large language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10663","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Evolving-RL: End-to-End Optimization of Experience-Driven Self-Evolving Capability within Agents","primary_cat":"cs.AI","submitted_at":"2026-05-11T14:43:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Evolving-RL jointly optimizes experience extraction and utilization in LLM agents via RL with separate evaluation signals, delivering up to 98.7% relative gains on out-of-distribution tasks in ALFWorld and Mind2Web.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"encounters both informative and noisy skills during the same rollout, which mirrors the conditions it faces at test time and builds robustness accordingly. 4 Experiments 4.1 Experimental Setup We instantiate all methods with Qwen2.5-7B-Instruct [32] as the base model and evaluate them on two benchmarks with explicit task-level splits: ALFWorld [23] and Mind2Web [7]. We primarily compare our approach against two categories of baselines: prompt-based experience-driven self- evolution methods ExpeL [38], Memento [40] and ReasoningBank [16] and the RL-based method, GRPO [19] and SkillRL [30]. However, since SkillRL relies on a certain amount of cold-start data, we are unable to evaluate it on Mind2Web. To ensure robustness, all reported results are averaged"},{"citing_arxiv_id":"2605.05509","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WAAA! Web Adversaries Against Agentic Browsers","primary_cat":"cs.CR","submitted_at":"2026-05-06T23:19:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Agentic browsers are vulnerable to 20 web and LLM attacks with 18 implemented, exposing five failure modes across four major LLM models that require redesign before safe deployment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.28139","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Claw-Eval-Live: A Live Agent Benchmark for Evolving Real-World Workflows","primary_cat":"cs.SE","submitted_at":"2026-04-30T17:23:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Claw-Eval-Live benchmark with 105 tasks shows no frontier LLM agent exceeds 66.7% success rate on evolving real-world workflows, with HR and multi-system tasks as persistent bottlenecks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"workspace repair, and that several business-critical families remain far from solved. 2 Related Work Agent benchmarks.Evaluating LLM-based agents has produced benchmarks spanning tool use, web interaction, desktop environments, and professional tasks. General suites such as AgentBench [23] and GAIA [26] test heterogeneous agent capabilities, while WebArena [54], VisualWebArena [17], Mind2Web [6], BrowserGym [4], WebCanvas [30], AssistantBench [50], and OSWorld [43] make browser or desktop interaction central. Professional and workplace-oriented benchmarks such as WorkArena [8] and TheAgentCompany [45] move closer to deployed work settings. Claw-Eval-Live is complementary to this line: its emphasis is not interface realism alone, but a workflow mixture"},{"citing_arxiv_id":"2604.26622","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OCR-Memory: Optical Context Retrieval for Long-Horizon Agent Memory","primary_cat":"cs.CL","submitted_at":"2026-04-29T12:49:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OCR-Memory encodes agent trajectories as images with visual anchors and retrieves verbatim text via locate-and-transcribe, yielding gains on long-horizon benchmarks under strict context limits.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Formally, given an input image I∈ RH×W×3 , the image encoder produces compressed embeddings Z=f enc(I)∈R n(r)×dlatent,(3) where r denotes the resolution mode and n(r) is the resulting compressed-token budget. To enable controllable compression ratios, fenc equips with multiple resolution modes, which is preset by the chosen input size: n(r)∈ {64,100,256,400},(4) corresponding to 512×512, 640×640, 1024×1024, and1280×1280inputs, respectively. 4 OCR-Memory We now introduce Optical Context Retrieval for Agent Memory (OCR-Memory), a paradigm that shifts memory storage and retrieval from text do- main to the image domain, so that a memory mod- ule with a limited text context window can still consult arbitrarily long histories with minimal to-"},{"citing_arxiv_id":"2604.19742","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PlayCoder: Making LLM-Generated GUI Code Playable","primary_cat":"cs.SE","submitted_at":"2026-04-21T17:59:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PlayCoder raises the rate of LLM-generated GUI apps that can be played end-to-end without logic errors from near zero to 20.3% Play@3 by adding repository-aware generation, agent-driven testing, and iterative repair.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"on reliable automation for common development routines. Beyond agents, recent benchmarks increasingly emphasize pragmatic coding competence [10, 21, 54]. LiveCodeBench [24] reduces evaluation leakage by continuously refreshing problem sets. CoderEval [76] foregrounds realistic coding scenarios, and Evocodebench [32] stresses generation inside practical software projects. ClassEval [17] shifts attention from isolated functions to class- level synthesis. Finally, SWE-Bench [26] measures GitHub issue resolution and has catalyzed a wave of follow-on studies on practical software engineering tasks [1, 2, 19, 20, 34, 60, 67, 71, 78, 79]. However, these benchmarks largely omit behavioral correctness for generatedGUIapplications. Moreover, mainstream generation stacks still emphasize compile- and test-oriented functional"},{"citing_arxiv_id":"2604.14262","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GUI-Perturbed: Domain Randomization Reveals Systematic Brittleness in GUI Grounding Models","primary_cat":"cs.LG","submitted_at":"2026-04-15T16:39:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GUI-Perturbed shows that GUI grounding models suffer systematic accuracy collapse under relational instructions and visual changes such as 70% zoom, with even augmented fine-tuning worsening results.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"robotics [7], where textures, lighting, and colors are randomized in simulation to force policies to learn invariances. Training on fixed screenshots is the GUI equivalent of training on a single simulator skin. To quantify these failures, we constructGUI-Perturbed, a controlled perturbation framework that applies domain randomization to GUI grounding evaluation. Using Mind2Web MHTML archives [4] as our simulator and Playwright as the rendering engine, we perturb both the visual scene (style changes, zoom, text scaling) and the instruction (direct vs. spatial-relational) along independent axes. Evaluating three 7B models from the same Qwen2.5VL-7B [ 8] lineage (Qwen2.5VL-7B, UI-TARS-1.5-7B [9], and GTA1-7B [10]), we report the following findings:"},{"citing_arxiv_id":"2604.07776","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Structured Distillation of Web Agent Capabilities Enables Generalization","primary_cat":"cs.LG","submitted_at":"2026-04-09T04:04:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Structured synthetic trajectory generation from Gemini 3 Pro enables a 9B open-weight model to reach 41.5% on WebArena, outperforming Claude 3.5 Sonnet and GPT-4o while generalizing to unseen enterprise environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09581","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Avenir-UX: Automated UX Evaluation via Simulated Human Web Interaction with GUI Grounding","primary_cat":"cs.AI","submitted_at":"2026-02-25T18:59:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Avenir-UX automates web usability testing by using GUI-grounded simulation of user behavior to generate standardized reports with SUS, SEQ, and Think Aloud protocols.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.20867","ref_index":61,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SoK: Agentic Skills -- Beyond Tool Use in LLM Agents","primary_cat":"cs.CR","submitted_at":"2026-02-24T13:11:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper systematizes agentic skills beyond tool use, providing design pattern and representation-scope taxonomies plus security analysis of malicious skill infiltration in agent marketplaces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09571","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tuning Qwen2.5-VL to Improve Its Web Interaction Skills","primary_cat":"cs.HC","submitted_at":"2026-02-20T13:35:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Two-stage fine-tuning of Qwen2.5-VL-32B improves success rates on single-click web tasks from 86% to 94%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.14348","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Legal Retrieval for Public Defenders","primary_cat":"cs.IR","submitted_at":"2026-01-20T17:08:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NJ BriefBank is a domain-adapted legal retrieval tool for public defenders that improves on standard benchmarks by incorporating legal reasoning, domain data, and synthetic examples, with a new released taxonomy and annotated evaluation dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.22074","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Real-Time Procedural Learning From Experience for AI Agents","primary_cat":"cs.AI","submitted_at":"2025-11-27T03:51:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRAXIS enables AI agents to acquire procedural knowledge in real time by indexing and retrieving state-action-result experiences, leading to better accuracy, reliability, and efficiency on web browsing benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.06101","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SynthAgent: Adapting Web Agents with Synthetic Supervision","primary_cat":"cs.LG","submitted_at":"2025-11-08T18:45:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SynthAgent uses dual refinement of synthetic tasks and trajectories to produce higher-quality training data that improves web agent adaptation to target environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.23883","ref_index":166,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic AI Security: Threats, Defenses, Evaluation, and Open Challenges","primary_cat":"cs.AI","submitted_at":"2025-10-27T21:48:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A survey that taxonomizes threats to agentic AI, reviews benchmarks and evaluation methods, discusses technical and governance defenses, and identifies open challenges.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"repeated, long-horizon, or slightly varied tasks. Even when tasks are derived from the same underlying template, performance variance is significant: GPT-4 succeeds consistently on only 4 out of 61 templates in WebArena [162]. Similar brittleness has been observed in other benchmarks: Mind2Web reports that template-derived variations in web tasks often lead to sharp drops in success rates [ 166], while BrowserGym identifies instability in reproducing outcomes across different environments and interface states [163]. These findings underscore the limitations of relying on surface-level patterns without robust memory or adaptive feedback mechanisms. To bridge this gap the WebArena benchmark was proposed as a testbed for approaches that explicitly incorporate memory and feedback to improve"},{"citing_arxiv_id":"2510.22933","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Can AI Augment Access to Justice? Public Defenders' Perspectives on AI Adoption","primary_cat":"cs.CY","submitted_at":"2025-10-27T02:26:08+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Public defenders view AI as most useful for evidence investigation but limited in courtroom work and strategy, with adoption blocked by costs, confidentiality risks, and norms, requiring human oversight and open development.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"org/wp-content/uploads/2025/01/ 20250125-Integrating-AI-A-Guide-for-Prosecutors.pdf. [28] Sunhao Dai, Chen Xu, Shicheng Xu, Liang Pang, Zhenhua Dong, and Jun Xu. Bias and unfair- ness in information retrieval systems: New challenges in the llm era. InProceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pages 6437-6447, 2024. [29] Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Samuel Stevens, Boshi Wang, Huan Sun, and Yu Su. Mind2web: Towards a generalist agent for the web, 2023. URLhttps://arxiv.org/ abs/2306.06070. [30] David C. Donald. Bias in ai large language models: Risks and remedies. https://www.americanbar.org/groups/gpsolo/resources/magazine/2025-mar-apr/ bias-ai-large-language-models-risks-remedies/, April 2025."},{"citing_arxiv_id":"2509.02544","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UI-TARS-2 Technical Report: Advancing GUI Agent with Multi-Turn Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2025-09-02T17:44:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"UNKNOWN","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UI-TARS-2 reaches 88.2 on Online-Mind2Web, 47.5 on OSWorld, 50.6 on WindowsAgentArena, and 73.3 on AndroidWorld while attaining 59.8 mean normalized score on a 15-game suite through multi-turn RL and scalable data generation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"long-horizon control and strategic exploration are essential. Digital games have historically been central to AI research due to their complexity, diversity, and controllability. Seminal work ranges from classical board games such as Go [60], to Atari benchmarks [9], to large-scale strategy games like StarCraft II [37], and open-ended environments such as Minecraft [17]. However, a key limitation of these efforts is their specificity: agents were typically optimized for a single game with tailored policies and parameters, hindering generalization across different environments [8, 38, 67]. The emergence of LLMs and VLMs has shifted attention toward more generalist agents [53]. Recent work explores their application to complex game scenarios, such as Pokémon [4, 14]."},{"citing_arxiv_id":"2507.04227","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mobile GUI Agents under Real-world Threats: Are We There Yet?","primary_cat":"cs.CR","submitted_at":"2025-07-06T03:31:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces an app-content instrumentation framework and benchmark showing that examined GUI agents suffer 42.0% and 36.1% average misleading rates from third-party content in dynamic and static tests respectively.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.10924","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey on the Safety and Security Threats of Computer-Using Agents: JARVIS or Ultron?","primary_cat":"cs.CL","submitted_at":"2025-05-16T06:56:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey that defines Computer-Using Agents for safety analysis, categorizes their threats, proposes a taxonomy of defensive strategies, and summarizes benchmarks and datasets for evaluating CUA safety and performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.23218","ref_index":68,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OS-ATLAS: A Foundation Action Model for Generalist GUI Agents","primary_cat":"cs.CL","submitted_at":"2024-10-30T17:10:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OS-Atlas, trained on the largest open-source cross-platform GUI grounding corpus of 13 million elements, outperforms prior open-source models on six benchmarks across mobile, desktop, and web platforms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.09187","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GuardAgent: Safeguard LLM Agents by a Guard Agent via Knowledge-Enabled Reasoning","primary_cat":"cs.LG","submitted_at":"2024-06-13T14:49:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GuardAgent safeguards LLM agents by generating task plans from safety requests and mapping them to executable guardrail code, achieving over 98% accuracy on a healthcare access-control benchmark and 83% on a web safety benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.07972","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments","primary_cat":"cs.AI","submitted_at":"2024-04-11T17:56:05+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"OSWorld provides the first unified real-computer benchmark for open-ended multimodal agent tasks, exposing large performance gaps between humans and state-of-the-art LLM/VLM agents.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"such multimodal agents is the absence of a benchmark based on a real interactive environment that covers the diversity and complexity of real-world computer use across various operating systems, interfaces, and applications, consequently restricting task scope and agent scalability. Previous benchmarks provide datasets of demonstrations without executable environments [9, 40, 21]. Their non-execution-based evaluation assumes a single solution for each task and wrongfully penalizes alternative correct solutions. These benchmarks also miss opportunities for essential Preprint. Under review. arXiv:2404.07972v2 [cs.AI] 30 May 2024 Task Instruction (See examples above) input Agent (e.g., GPT-4V) a11y-tree screenshot"},{"citing_arxiv_id":"2403.07974","ref_index":160,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","primary_cat":"cs.SE","submitted_at":"2024-03-12T17:58:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiveCodeBench collects 400 recent contest problems to create a contamination-free benchmark evaluating LLMs on code generation and related capabilities like self-repair and execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.07718","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"WorkArena: How Capable Are Web Agents at Solving Common Knowledge Work Tasks?","primary_cat":"cs.LG","submitted_at":"2024-03-12T14:58:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WorkArena benchmark shows LLM web agents achieve partial success on enterprise tasks but have a substantial gap to full automation and perform worse with open-source models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.02691","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents","primary_cat":"cs.CL","submitted_at":"2024-03-05T06:21:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InjecAgent benchmark demonstrates that tool-integrated LLM agents are vulnerable to indirect prompt injection attacks, with ReAct-prompted GPT-4 succeeding on 24% of attacks and nearly twice that rate when attacker instructions are reinforced.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"tacker Instruction>', 'timestamp': '2022- 02-23 18:00'}} TwitterManager- GetUserProfile Get me the Twitter profile for the user 'AdaLovelace'. {'profile': {'user_id': 'AdaLovelace', 'user_name': 'Ada Lovelace', 'dis- play_name': 'Ada Lovelace', 'bio': '<Attacker Instruction>'}} Low TodoistSearchTasks What tasks related to 'IT Audit' have been com- pleted? {'tasks': [{'task_id': 'cd456R', 'task_name': 'IT Audit task', 'de- scription': '<Attacker Instruction>', 'due_date': '2022-02-15', 'priority': 2, 'status': 'completed'}]} GoogleCalendar- GetEvents- FromSharedCalendar Check Amy Watson's shared calendar and tell me if she is available on February 28th from 2 PM to 3 PM. {'event_blocks': [{'event_name': '<At- tacker Instruction>', 'start_time': '2022- 02-28T14:00', 'end_time': '2022-02- 28T15:00'}]} Shopify- GetProductDetails Can you find the details for the laptop with prod-"},{"citing_arxiv_id":"2402.02716","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Understanding the planning of LLM agents: A survey","primary_cat":"cs.AI","submitted_at":"2024-02-05T04:25:24+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A survey that provides a taxonomy of methods for improving planning in LLM-based agents across task decomposition, plan selection, external modules, reflection, and memory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.13919","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models","primary_cat":"cs.CL","submitted_at":"2024-01-25T03:33:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WebVoyager uses a large multimodal model to complete real-world web tasks end-to-end and reaches 59.1 percent success on a new benchmark of 15 live sites, with an automatic GPT-4V evaluator that matches human judgments 85 percent of the time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.10935","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SeeClick: Harnessing GUI Grounding for Advanced Visual GUI Agents","primary_cat":"cs.HC","submitted_at":"2024-01-17T08:10:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SeeClick improves visual GUI agents via GUI grounding pre-training on automatically curated data and introduces the ScreenSpot benchmark, with results indicating that stronger grounding boosts downstream task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.03568","ref_index":245,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent AI: Surveying the Horizons of Multimodal Interaction","primary_cat":"cs.AI","submitted_at":"2024-01-07T19:11:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper defines Agent AI as interactive multimodal systems that perceive grounded data and generate embodied actions, arguing this approach can mitigate hallucinations in foundation models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.01614","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GPT-4V(ision) is a Generalist Web Agent, if Grounded","primary_cat":"cs.IR","submitted_at":"2024-01-03T08:33:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GPT-4V achieves 51.1% success on live web tasks as a generalist agent when plans are manually grounded, outperforming text-only models, but automatic grounding lags far behind oracle performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.16797","ref_index":91,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Promptbreeder: Self-Referential Self-Improvement Via Prompt Evolution","primary_cat":"cs.CL","submitted_at":"2023-09-28T19:01:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Promptbreeder evolves both task prompts and the mutation prompts that improve them using LLMs, outperforming Chain-of-Thought and Plan-and-Solve on arithmetic and commonsense reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.02427","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cognitive Architectures for Language Agents","primary_cat":"cs.AI","submitted_at":"2023-09-05T17:56:20+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoALA is a modular cognitive architecture for language agents that organizes memory components, action spaces for internal and external interaction, and a generalized decision-making loop to support more systematic development of capable agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}