{"total":88,"items":[{"citing_arxiv_id":"2606.27632","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Yuvion LLM: An Adversarially-Aware Large Language Model for Content And AI Safety","primary_cat":"cs.CL","submitted_at":"2026-06-26T01:12:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Yuvion LLM applies adversarially aware training and introduces the YLRE benchmark set, claiming superior safety robustness over larger models on multiple tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05566","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GuardNet: Ensemble Strategies of Shallow Neural Networks for Robust Prompt Injection and Jailbreak Detection","primary_cat":"cs.AI","submitted_at":"2026-06-04T01:24:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"GuardNet ensemble of BiLSTMs reaches AUROC 0.747 on blind n=200 test and F1 0.92 on proprietary n=50 set with 50 ms CPU latency for PI/JB detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00485","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Confused ChatGPT: Cross-App Context Poisoning via First-Party APIs","primary_cat":"cs.CR","submitted_at":"2026-05-30T02:37:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Identifies cross-app context poisoning in ChatGPT Apps, a persistent indirect prompt injection delivered through undocumented first-party API parameters that lets one app manipulate others via the shared untagged context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31042","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Prompt Injection to Persistent Control: Defending Agentic Harness Against Trojan Backdoors","primary_cat":"cs.CR","submitted_at":"2026-05-29T09:19:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces ClawTrojan benchmark achieving 95.5% ASR for multi-step trojan attacks in agentic harnesses and DASGuard defense that sanitizes control content from untrusted sources.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30686","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Depth-Dependent Indirect Prompt Injection in Tool-Calling ReAct Agents: Injection Depth, Payload Framing, and Turn-Budget Sensitivity","primary_cat":"cs.CR","submitted_at":"2026-05-29T00:28:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Controlled experiments on GPT-4o-mini and Claude Haiku show indirect prompt injection success in ReAct agents decays sharply with injection depth, varies with payload framing, and remains stable across turn budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30650","ref_index":93,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When AI Meets Wall Street: A Survey on Trustworthy AI in Fintech","primary_cat":"cs.CR","submitted_at":"2026-05-28T23:10:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A survey that proposes a lifecycle-centric framework and the Financial AI Security and Robustness Taxonomy to organize 17 attack subtypes on AI pipelines in finance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30534","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Strengthening Polymorphic Prompt Assembling: Dynamic Separator Generation Against Emerging Prompt Injection Attacks","primary_cat":"cs.CR","submitted_at":"2026-05-28T20:10:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dynamic separator generation via domain-separated SHA-256 reduces attack success rate from 0.88 to 0.38 and eliminates leakage exposure in evaluations against 16 payloads on Llama and DeepSeek models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30454","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Surface You Test Is Not the Surface That Breaks","primary_cat":"cs.CR","submitted_at":"2026-05-28T18:26:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prompt injection vulnerability in tool-augmented LLMs is a model-surface interaction rather than a fixed channel property; the same payload inverts success rates across models, and adaptive attack rate exceeds single-surface baselines by 9.1 pp on average.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28999","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Measuring Real-World Prompt Injection Attacks in LLM-based Resume Screening","primary_cat":"cs.CR","submitted_at":"2026-05-27T18:56:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Roughly 1% of real resumes contain hidden prompt injections against LLM screeners, prevalence has risen over 1-2 years, and over 90% avoid explicit instructions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23196","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prompt Overflow: What the Guardrail Inspects Is Not What the Model Infers","primary_cat":"cs.CR","submitted_at":"2026-05-22T03:27:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces Prompt Overflow Attack that fragments malicious instructions in overlength prompts to evade guardrail segmentation while remaining actionable to LLMs with larger context windows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22321","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking Autonomous Agents against Temporal, Spatial, and Semantic Evasions","primary_cat":"cs.CR","submitted_at":"2026-05-21T11:07:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A3S-Bench evaluates LLM agents against temporal, spatial, and semantic evasions, raising average risk trigger rates from 28.3% to 52.6% across 2,254 trajectories and 20 scenarios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22001","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Blind Spots in the Guard: How Domain-Camouflaged Injection Attacks Evade Detection in Multi-Agent LLM Systems","primary_cat":"cs.CR","submitted_at":"2026-05-21T04:58:11+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Domain-camouflaged injection attacks reduce detection rates from 93.8% to 9.7% on Llama 3.1 8B and 100% to 55.6% on Gemini 2.0 Flash, with the gap persisting in production classifiers and multi-agent debate setups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21948","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCI-Defense: Defending Manipulation Attacks from Generative Engine Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-21T03:28:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SCI-Defense combines perplexity detection, semantic integrity scoring across four manipulation dimensions, and inter-candidate detection to counter GEO attacks, reporting perfect precision on Amazon product data but domain-limited recall on web passages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21362","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LASH: Adaptive Semantic Hybridization for Black-Box Jailbreaking of Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T16:27:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LASH adaptively composes multiple jailbreak seed prompts via genetic search over subsets and mixture weights to reach 84.5% keyword ASR and 74.5% two-stage ASR on JailbreakBench while using only 30 queries per prompt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20759","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Fraud Safety Evaluation: Multi-Round Attacks Reveal Safety-Utility Tradeoffs in Graph-Context LLM Defenders","primary_cat":"cs.CR","submitted_at":"2026-05-20T05:59:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Graph-context LLM fraud defenders improve early refusal under replay and adaptive multi-round attacks compared to text baselines but increase benign over-refusal, with the cost localized to how the LLM consumes structured graph fields rather than encoder quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16090","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Cross-Modal Prompt Injection Attack against Large Vision-Language Models with Image-Only Perturbation","primary_cat":"cs.CR","submitted_at":"2026-05-15T15:47:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CrossMPI steers both visual and textual interpretations in LVLMs through image-only perturbations by optimizing in hidden-state space at selected middle layers with distance-based budget allocation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16471","ref_index":103,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From AI-Generated Content to Agentic Action: Security and Safety Threats in Generative AI","primary_cat":"cs.CR","submitted_at":"2026-05-15T13:53:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper analyzes evolving security and safety threats in generative AI from content generation to agentic actions, noting that attack surfaces expand faster than defenses and that many safeguards require institutional coordination not yet in place.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"4.3. Alignment, Defense, and Agentic Security 4.3.1. Alignment Techniques Modern alignment begins with RLHF [94], which combines supervised fine-tuning, reward model training, and PPOoptimization.TheInstructGPTresultssuggestthatalignmentcanpartlysubstituteforscale,as1.3BInstructGPT was preferred over 175B GPT-3. Direct Preference Optimization (DPO) [103] eliminates the separate reward model via closed-form Bradley-Terry reparameterization, becoming the dominant method by 2024. Subsequent variants address practical deployment constraints: KTO [36] requires only binary (good/bad) feedback rather than pairwise comparisons, matching DPO performance across 1-30B scales while dramatically reducing data requirements;"},{"citing_arxiv_id":"2605.15598","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Compositional Jailbreaking: An Empirical Analysis of Mutator Chain Interactions in Aligned LLMs","primary_cat":"cs.CR","submitted_at":"2026-05-15T04:14:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Systematic evaluation of all ordered pairs among twelve jailbreak mutators on harmful prompts reveals mostly destructive interference but some synergistic combinations that raise success rates on three LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13471","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sleeper Channels and Provenance Gates: Persistent Prompt Injection in Always-on Autonomous AI Agents","primary_cat":"cs.CR","submitted_at":"2026-05-13T12:57:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sleeper channels enable persistent prompt injection in always-on AI agents via persistence substrate and firing separation, countered by provenance gates using action digests and owner attestations with a soundness theorem.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"config tampering), none target the M3/M4/M5 cells we walk 1 arXiv:2605.13471v1 [cs.CR] 13 May 2026 in §VI. The upstream issue [10] proposed similar defenses and was declined upstream. III. RELATEDWORK a) Indirect prompt injection and agent benchmarks: Greshake et al. [3] introduced indirect prompt injection in a single-turn threat model. Subsequent work catalogues single- shot variants extensively [11], [12], and recent benchmarks (AgentDojo [4], ASB [13], InjecAgent [14]) exercise web-tool agents in single sessions. Our concern is whatsurvivesacross sessions, channels, and execution contexts on the always-on agent substrate, which these benchmarks do not target. b) Memory and retrieval poisoning:MemoryGraft [5] is the direct intellectual predecessor for the long-term-memory"},{"citing_arxiv_id":"2605.16407","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Proof-Carrying Certificates for LLM Pipelines: A Trust-Boundary Architecture","primary_cat":"cs.LO","submitted_at":"2026-05-13T12:01:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"partial","one_line_summary":"Introduces a trust-boundary architecture in Lean 4 with three certificate families and two operators that deliver sorry-free, axiom-audited assurances for LLM pipeline components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12746","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoT-Guard: Small Models for Strong Monitoring","primary_cat":"cs.CR","submitted_at":"2026-05-12T20:49:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoT-Guard is a 4B model using SFT and RL that achieves 75% G-mean^2 on hidden objective detection under prompt and code manipulation attacks, outperforming several larger models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Advances in neural information processing systems, 35:27730-27744, 2022. [46] Kai Greshake, Sahar Abdelnabi, Shailesh Mishra, Christoph Endres, Thorsten Holz, and Mario Fritz. Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection. InProceedings of the 16th ACM workshop on artificial intelligence and security, pages 79-90, 2023. [47] Fábio Perez and Ian Ribeiro. Ignore previous prompt: Attack techniques for language models. arXiv preprint arXiv:2211.09527, 2022. [48] An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. Qwen3 technical report.arXiv preprint arXiv:2505.09388, 2025. [49] Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang,"},{"citing_arxiv_id":"2605.11868","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"IPI-proxy: An Intercepting Proxy for Red-Teaming Web-Browsing AI Agents Against Indirect Prompt Injection","primary_cat":"cs.CR","submitted_at":"2026-05-12T09:48:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"IPI-proxy is a toolkit using an intercepting proxy to inject indirect prompt injection attacks into live web pages for testing AI browsing agents against hidden instructions.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"LLM retrieves third-party content (web pages, documents, emails) into which an attacker has planted instructions, and the model executes those instructions as though they were legitimate user requests. They demonstrated working exploits against deployed systems including Bing Chat and code-completion plugins and introduced the now-standard taxonomy of injection delivery channels (passive, active, hidden, and user-driven). Perez and Ribeiro [ 6] contributed an early empirical catalog of effective seed payloads such as the \"ignore previous instructions\" family. Liu et al. then operationalized end-to-end attacks with HouYi [ 7], a black-box pipeline that infers an application's prompt template, generates context-aware payloads, and successfully compromised 31 of 36 commercial LLM-integrated applications, providing the first concrete evidence that injection in production systems is high-"},{"citing_arxiv_id":"2605.11229","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Comment and Control: Hijacking Agentic Workflows via Context-Grounded Evolution","primary_cat":"cs.CR","submitted_at":"2026-05-11T20:45:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"JAW uses hybrid program analysis to evolve inputs that hijack agentic workflows, successfully compromising 4714 GitHub workflows and eight n8n templates to enable actions like credential exfiltration.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Overall, the results demonstrate that each component contributes meaningfully to attack effectiveness, with the largest gains arising from incorporating the trigger and capability-aware reasoning. 5.3 RQ3: Jailbreak Comparison To enable comparison betweenJA Wand prior jailbreak methodolo- gies, we implement two adaptive baseline approaches: AutoDAN- Turbo [21] and a search-based method [26] built on the evolutionary framework OpenEvolve [30]. Consistent with our ablation studies, all methods are evaluated against both model-level and code-level defenses across three workflows comprising nine model-action pairs. AutoDAN-Turbo is a black-box, agent-based jailbreak framework that identifies and refines attack strategies without dependence on"},{"citing_arxiv_id":"2605.11217","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Leveraging RAG for Training-Free Alignment of LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-11T20:29:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RAG-Pref is a training-free RAG-based alignment technique that conditions LLMs on contrastive preference samples during inference, yielding over 3.7x average improvement in agentic attack refusals when combined with offline methods across five LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10907","ref_index":41,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Engineering Robustness into Personal Agents with the AI Workflow Store","primary_cat":"cs.CR","submitted_at":"2026-05-11T17:46:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"On-the-fly ones gener- ated by isolated LLMs can be fine-grained, but assume that untrusted data serves as the only way to introduce vulnerabilities. In practice, however, this threat model is unrealistic: even if the LLM is isolated to trusted context [58], the agents can introduce security vulnerabilities even in the ab- sence of adversarial data [41] and users might unwittingly ask the agent to do something unsafe (e.g., skip authentication checks). The AI Workflow Store reintroduces traditional SE processes like requirement design and red-teaming precisely to address such issues (see discussion in §2.2). Some policy systems [27, 51, 62] allow users to interact on-the-fly to specify agent permissions."},{"citing_arxiv_id":"2605.08876","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OTora: A Unified Red Teaming Framework for Reasoning-Level Denial-of-Service in LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-05-09T10:55:11+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"vacy, pp. 341-352, 2024. Jin, G., Wu, S., Liu, J., Huang, T., and Mu, R. Enhancing robust fairness via confusional spectral regularization. ICLR, 2025a. Jin, G., Yi, X., Huang, W., Schewe, S., and Huang, X. S22o: Enhancing adversarial training with second-order statis- tics of weights.IEEE Transactions on Pattern Analysis and Machine Intelligence, 47(10):8630-8641, 2025b. Kumar, A., Roh, J., Naseh, A., Karpinska, M., Iyyer, M., Houmansadr, A., and Bagdasarian, E. Overthink: Slowdown attacks on reasoning llms.arXiv preprint arXiv:2502.02542, 2025. Li, X., Huang, T., Mu, R., Huang, X., and Jin, G. Pot: Inducing overthinking in llms via black-box iterative op- timization.arXiv preprint arXiv:2508.19277, 2025."},{"citing_arxiv_id":"2605.08646","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PAAC: Privacy-Aware Agentic Device-Cloud Collaboration","primary_cat":"cs.LG","submitted_at":"2026-05-09T03:29:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PAAC aligns planner-executor decomposition with the device-cloud boundary via typed placeholders and on-device sanitization, delivering 15-36% higher accuracy and 2-6x lower leakage than prior device-cloud baselines on agentic benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[31] Grégoire Mialon, Clémentine Fourrier, Thomas Wolf, Yann LeCun, and Thomas Scialom. GAIA: a benchmark for General AI Assistants. InThe Twelfth International Conference on Learning Representations, 2023. [32] Niloofar Mireshghallah, Maria Antoniak, Yash More, Yejin Choi, and Golnoosh Farnadi. Trust No Bot: Discovering Personal Disclosures in Human-LLM Conversations in the Wild.arXiv preprint arXiv:2407.11438, 2024. [33] Fábio Perez and Ian Ribeiro. Ignore Previous Prompt: Attack Techniques For Language Models. arXiv preprint arXiv:2211.09527, 2022. [34] Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta Raileanu, Maria Lomeli, Eric Hambro, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. Toolformer: Language Models Can Teach Themselves to Use Tools.Advances in neural information processing systems,"},{"citing_arxiv_id":"2605.07269","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MIPIAD: Multilingual Indirect Prompt Injection Attack Defense with Qwen -- TF-IDF Hybrid and Meta-Ensemble Learning","primary_cat":"cs.CL","submitted_at":"2026-05-08T05:34:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MIPIAD reports a hybrid Qwen-TF-IDF ensemble defense that reaches F1 0.9205 and reduces the English-Bangla performance gap on a 1.43-million-sample synthetic benchmark derived from BIPIA templates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06205","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClawGuard: Out-of-Band Detection of LLM Agent Workflow Hijacking via EM Side Channel","primary_cat":"cs.CR","submitted_at":"2026-05-07T13:12:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClawGuard detects LLM agent workflow hijacking by capturing and classifying electromagnetic emanations from hardware with 0.9945 AUC, 100% true-positive rate, and 1.16% false-positive rate on a 7.82 TB RF dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05846","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LoopTrap: Termination Poisoning Attacks on LLM Agents","primary_cat":"cs.CR","submitted_at":"2026-05-07T08:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LoopTrap is an automated red-teaming framework that crafts termination-poisoning prompts to amplify LLM agent steps by 3.57x on average (up to 25x) across 8 agents.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"with tool invocations and grounds subsequent reasoning in ob- served results. This pattern has been operationalized by frame- works such as LangChain [20], OpenAI Assistants [30], and Claude Tool Use [3], while AutoGPT [36] pursues a fully autonomous, goal- directed paradigm with self-evaluated progress. Multi-agent coor- dination [6], retrieval-augmented grounding [ 23], self-reflective improvement [34], and hierarchical plan-and-execute decomposi- tion [39] further extend agent capabilities. Despite their architec- tural diversity, all these frameworks share a common execution loop: the agent (i) perceives its environment through tools or APIs, (ii) reasons over observations, (iii) selects and executes an action, and (iv) evaluates progress toward goal completion."},{"citing_arxiv_id":"2605.16336","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Detecting Verbatim LLM Copy-Paste in Homework","primary_cat":"cs.CR","submitted_at":"2026-05-07T02:36:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SteganoPrompt embeds a hidden instruction in assignment prompts via the Unicode Tags block so that LLMs add a detectable signature to responses when the prompt is pasted verbatim.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04665","ref_index":10,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Paraphrase-Induced Output-Mode Collapse: When LLMs Break Character Under Semantically Equivalent Inputs","primary_cat":"cs.CL","submitted_at":"2026-05-06T09:11:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs show systematic output-mode collapse on closed-form prompts, with only ~22% of semantically equivalent variants preserving the requested bare-label format across five models and four tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"and analysis scriptsas anonymized supplementary material, enabling bit-level replay of every reported statistic. II. RELATED WORK A. LLM Robustness and Consistency The reliability of LLMs under input variations has emerged as a critical research area. Recent work has ex- amined adversarial robustness through jailbreak attacks [8] and prompt injection vulnerabilities [10], demonstrating that carefully crafted perturbations can elicit unintended behaviors. Complementary research has investigated self- consistency in multi-step reasoning [6], showing that sam- pling multiple reasoning paths can improve accuracy but also revealing significant output variance across semantically equivalent prompts. Surface-form sensitivity in benchmark evaluation has been"},{"citing_arxiv_id":"2605.03378","ref_index":116,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ARGUS: Defending LLM Agents Against Context-Aware Prompt Injection","primary_cat":"cs.CR","submitted_at":"2026-05-05T05:37:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ARGUS defends LLM agents from context-aware prompt injections by tracking information provenance and verifying decisions against trustworthy evidence, reducing attack success to 3.8% while retaining 87.5% task utility.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03213","ref_index":4,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Agents Handle Secrets: A Survey of Confidential Computing for Agentic AI","primary_cat":"cs.CR","submitted_at":"2026-05-04T23:09:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey providing a taxonomy of TEE platforms, an agent-centric threat model, and open challenges for applying confidential computing to secure agentic AI systems.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"holds service credentials, invokes real-world APIs, and may communicate with peer agents in pipelines that no single party fully controls. This expanded operational scope creates a threat surface that software-layer defenses handle poorly. A prompt injection attack embedded in a retrieved document can hijack the agent's planning loop, redirecting its tool calls to attacker- controlled endpoints [3], [4]. A compromised cloud operator running the agent's container can inspect model weights, exfiltrate conversation history, or silently modify tool outputs in transit. In multi-agent systems, a single rogue agent can poison the reasoning of an entire collaborative pipeline [5]. Recent incidents illustrate that these threats are operational rather than hypothetical."},{"citing_arxiv_id":"2605.02647","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ContextualJailbreak: Evolutionary Red-Teaming via Simulated Conversational Priming","primary_cat":"cs.CL","submitted_at":"2026-05-04T14:32:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ContextualJailbreak uses evolutionary search over simulated primed dialogues with novel mutations to reach 90-100% attack success on open LLMs and transfers to some closed frontier models at 15-90% rates.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"atically identifying such vulnerabilities before deployment is a prerequisite for responsible AI development and is commonly op- erationalized throughautomated red teaming. Early jailbreak research relied on manually crafted prompt tem- plates. While useful for targeted probing, manual approaches are costly, difficult to scale, and unable to adapt to continuously updated defenses [21, 34]. This has motivated automated jailbreak gener- ation methods that frame the problem as black-box optimization over adversarial inputs. However, prevailing optimization-based attackers share a structural limitation:the optimization object is a single-turn prompt. Methods such as PAIR [3], TAP [15], GPT- Fuzz [34], Papillon [7], AutoDAN [13], and AutoDAN-Turbo [12]"},{"citing_arxiv_id":"2605.02236","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Perturbation Dose Responses in Recursive LLM Loops: Raw Switching, Stochastic Floors, and Persistent Escape under Append, Replace, and Dialog Updates","primary_cat":"cs.AI","submitted_at":"2026-05-04T05:16:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In 30-step recursive LLM loops, append-mode persistent escape from source basins reaches 50% near 400 tokens under full history but plateaus below 50% under tail-clip memory policy, while replace-mode switching largely reflects state reset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02187","ref_index":48,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Alignment Isn't Enough: Response-Path Attacks on LLM Agents","primary_cat":"cs.CR","submitted_at":"2026-05-04T03:35:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A malicious relay can strategically rewrite aligned LLM outputs in BYOK agent architectures to achieve up to 99.1% attack success on benchmarks like AgentDojo and ASB.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Prior work on prompt injection studies how user and argentic environment text can acquire unintended control authority in LLM agents, causing unexpected or policy-violating behavior. The core failure is the collapse of the data-instruction boundary inside the model-agent reasoning loop. Existing work covers direct injection through user-facing inputs [48, 52-56, 84-86] and indirect injection through retrieved documents, web pages, tool observations, or other external artifacts [51, 83, 87-90]. Relay tampering targets a different boundary. It modifies the response delivered to the agent after model generation, making the failure one of post-alignment response integrity. Guardrails and System-Level Defenses."},{"citing_arxiv_id":"2605.01782","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Needle-in-RAG: Prompt-Conditioned Character-Level Traceback of Poisoned Spans in Retrieved Evidence","primary_cat":"cs.CR","submitted_at":"2026-05-03T08:42:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RAGCharacter localizes poisoned character spans in RAG evidence via prompt-conditioned counterfactual masking and achieves the best accuracy-over-attribution trade-off across tested attacks and models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":": MS MARCO: A human generated machine reading comprehension dataset. In: Proceedings of the Workshop on Cognitive Computation: Integrating neural and symbolic approaches 2016 co-located with the 30th Annual Conference on Neural Information Processing Systems (NIPS 2016), Barcelona, Spain, December 9, 2016. CEUR Workshop Proceedings, vol. 1773 (2016) [35] Perez, F., Ribeiro, I.: Ignore previous prompt: Attack techniques for language models. arXiv preprint arXiv:2211.09527 (2022) [36] Pruthi, G., Liu, F., Kale, S., Sundararajan, M.: Estimating training data influence by tracing gradient descent. Advances in Neural Information Processing Systems33, 19920-19930 (2020) [37] Qi, Z., Zhang, H., Xing, E.P."},{"citing_arxiv_id":"2605.12535","ref_index":14,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ghost in the Context: Measuring Policy-Carriage Failures in Decision-Time Assembly","primary_cat":"cs.CR","submitted_at":"2026-05-02T18:07:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper measures policy-carriage failures during LLM context assembly and evaluates SafeContext as a partial mitigation on Llama, Qwen, and Mistral models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01462","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LocalAlign: Enabling Generalizable Prompt Injection Defense via Generation of Near-Target Adversarial Examples for Alignment Training","primary_cat":"cs.CR","submitted_at":"2026-05-02T14:25:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LocalAlign generates near-target adversarial examples via prompting and applies margin-aware alignment training to enforce tighter boundaries against prompt injection attacks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01078","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Sentence Relation-Based Approach to Sanitizing Malicious Instructions","primary_cat":"cs.CR","submitted_at":"2026-05-01T20:22:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SONAR constructs a relational graph from entailment and contradiction scores to prune injected malicious sentences from LLM prompts while preserving context, achieving near-zero attack success rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.28157","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FlashRT: Towards Computationally and Memory Efficient Red-Teaming for Prompt Injection and Knowledge Corruption","primary_cat":"cs.CR","submitted_at":"2026-04-30T17:43:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlashRT delivers 2x-7x speedup and 2x-4x GPU memory reduction for prompt injection and knowledge corruption attacks on long-context LLMs versus nanoGCG.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Internet), these LLMs can generate an output for the query based on the given context. For instance, in retrieval-augmented generation (RAG) systems, a long-context LLM can leverage the broad texts retrieved from a knowledge database to generate answers to user questions. However, many previous studies showed that LLM applications face various security threats, such as prompt injection [1, 2, 3, 4, 5, 6, 7, 8, 9] and knowledge corruption attacks [10, 11, 12, 13, 14, 15, 16]. Long-context LLMs are susceptible to these threats, as an adversarial text can be subtly embedded within a lengthy context, making it difficult to prevent and detect. For instance, in prompt injection, an attacker can inject an instruction into a long context such that an LLM follows the injected instruction to produce a malicious"},{"citing_arxiv_id":"2604.27132","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRUST: A Framework for Decentralized AI Service v.0.1","primary_cat":"cs.AI","submitted_at":"2026-04-29T19:32:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TRUST is a decentralized AI auditing framework that decomposes reasoning into HDAGs, maps agent interactions via the DAAN protocol to CIGs, and uses stake-weighted multi-tier consensus to achieve 72.4% accuracy while proving a Safety-Profitability Theorem that rewards honest auditors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25200","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Making AI-Assisted Grant Evaluation Auditable without Exposing the Model","primary_cat":"cs.CR","submitted_at":"2026-04-28T04:10:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A TEE-based remote attestation system creates signed evaluation bundles that link input hashes, model measurements, and outputs to make AI grant reviews verifiable without revealing proprietary components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24118","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AgentVisor: Defending LLM Agents Against Prompt Injection via Semantic Virtualization","primary_cat":"cs.CR","submitted_at":"2026-04-27T07:12:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AgentVisor cuts prompt injection success rate to 0.65% in LLM agents with only 1.45% utility loss via semantic privilege separation and one-shot self-correction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23887","ref_index":6,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evaluation of Prompt Injection Defenses in Large Language Models","primary_cat":"cs.CR","submitted_at":"2026-04-26T21:22:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Only output filtering with hardcoded rules in application code prevented prompt injection leaks in LLMs, as all model-based defenses were defeated by an adaptive attacker.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23593","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When AI reviews science: Can we trust the referee?","primary_cat":"cs.AI","submitted_at":"2026-04-26T08:03:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AI peer review systems are vulnerable to prompt injections, prestige biases, assertion strength effects, and contextual poisoning, as demonstrated by a new attack taxonomy and causal experiments on real conference submissions.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"attacks. arXiv preprint. DOI:10.48550/arXiv.2506.11113 [15] Li Y ., Jiang Y ., Li Z., et al. (2024). Backdoor learning: A survey. IEEE Trans. Neural Netw. Learn. Syst. 35:5-22. DOI:10.1109/TNNLS.2022.3182979 [16] Zhang Y ., Rando J., Evtimov I., et al. (2024). Per- sistent pre-training poisoning of llms. arXiv preprint. DOI:10.48550/arXiv.2410.13722 [17] Perez F . and Ribeiro I. (2022). Ignore previous prompt: Attack techniques for language models. arXiv preprint. DOI:10.48550/arXiv.2211.09527 [18] Shayegani E., Mamun M.A.A., Fu Y ., et al. (2023). Sur- vey of vulnerabilities in large language models revealed by adversarial attacks. arXiv preprint. DOI:10.48550/arXiv. 2310.10844 [19] Sharma M., Tong M."},{"citing_arxiv_id":"2604.23374","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ghost in the Agent: Redefining Information Flow Tracking for LLM Agents","primary_cat":"cs.CR","submitted_at":"2026-04-25T16:39:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"NeuroTaint is the first taint tracking framework for LLM agents that uses offline auditing of semantic, causal, and persistent context to detect flows from untrusted sources to privileged sinks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21131","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cross-Session Threats in AI Agents: Benchmark, Evaluation, and Algorithms","primary_cat":"cs.CR","submitted_at":"2026-04-22T22:40:31+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces CSTM-Bench with 26 cross-session attack taxonomies, demonstrates recall loss in session-bound and full-log detectors, and proposes a bounded-memory coreset reader with the CSTM metric balancing detection and serving stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21090","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Structural Quality Gaps in Practitioner AI Governance Prompts: An Empirical Study Using a Five-Principle Evaluation Framework","primary_cat":"cs.SE","submitted_at":"2026-04-22T21:18:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new five-principle framework applied to 34 practitioner AI governance prompts finds 37% lack key structural elements such as data classification and rubrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}