{"total":66,"items":[{"citing_arxiv_id":"2606.00654","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Invitation Trap: Proactive Availability Backdoor in LLMs via Conversational Induction","primary_cat":"cs.CR","submitted_at":"2026-05-30T09:57:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper presents Proactive Availability Backdoor (PAB) attacks on LLMs that achieve 73.1% effective success rate by proactively inducing users via suggestions in a Five-Factor Model simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28890","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Echoes within the Reasoning: Stealthy and Effective Watermarking via Chain of Thought","primary_cat":"cs.CR","submitted_at":"2026-05-27T07:44:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BiCoT embeds watermarks into the internal geometry of Chain-of-Thought reasoning traces in LLMs via private signature subspace alignment and introduces Robust Subspace Registration for black-box verification under attacks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23411","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sample-wise Targeted Adversarial Attacks on Test-time Adaptation","primary_cat":"cs.LG","submitted_at":"2026-05-22T09:18:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes meta-learning attack with priority-aware gradient alignment for sample-wise targeted attacks on TTA that maintain label distribution consistency with no-attack baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22481","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Stronger Triggers Backfire: A High-Dimensional Theory of Backdoor Attacks","primary_cat":"cs.LG","submitted_at":"2026-05-21T13:39:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"In the proportional high-dimensional regime, stronger backdoor training triggers improve clean accuracy and make attack success non-monotonic for regularized GLMs on Gaussian mixtures, with closed-form proofs for squared loss and fixed-point extensions to convex losses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21146","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Detecting Trojaned DNNs via Spectral Regression Analysis","primary_cat":"cs.CR","submitted_at":"2026-05-20T13:19:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIST detects Trojaned DNN updates by measuring spectral deviations in pre-activation representations against a benign fine-tuning reference, achieving high accuracy across datasets and attacks after a single update.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19478","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Exposing Functional Fusion: A New Class of Strategic Backdoor in Dynamic Prompt Architectures","primary_cat":"cs.CR","submitted_at":"2026-05-19T07:29:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"VIPER exposes Functional Fusion in dynamic prompt architectures, enabling a backdoor that resists pruning by tightly integrating attack and utility parameters in the same high-magnitude core.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19227","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Token by Token, Compromised: Backdoor Vulnerabilities in Unified Autoregressive Models","primary_cat":"cs.CR","submitted_at":"2026-05-19T00:55:18+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ToBAC is the first backdoor attack on unified autoregressive models, using data or model poisoning to make triggers elicit cross-modal malicious behavior in text and image generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19147","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Be Kind, Rewrite: Benign Projections via Rewriting Defend Against LLM Data Poisoning Attacks","primary_cat":"cs.CR","submitted_at":"2026-05-18T21:56:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OBBR projects poisoned samples into benign space via rewriting with open-book examples, raising safety performance by 51% on average versus prior defenses across five attacks and four LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18646","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Language-Switching Triggers Take a Latent Detour Through Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-18T16:53:54+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18908","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fast and Lightweight Backdoor Detection via Head Random Probing","primary_cat":"cs.CR","submitted_at":"2026-05-17T16:05:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HTell detects backdoors by random probing of the model head, reporting 99.03% true positive rate and 2.11% false positive rate at 12.69 ms per model on a benchmark of over 6700 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18907","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Lightweight and Fast Backdoor Model Detection","primary_cat":"cs.CR","submitted_at":"2026-05-17T16:02:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DFBScanner detects backdoors by combining anomaly indicators from final-layer parameters into a Trojan clue score, reporting 97.17% true-positive rate, 0.95% false-positive rate, and 1 ms average detection time on a benchmark of over 5,000 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16227","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LymphNode: A Plug-and-Play Access Control Method for Deep Neural Networks","primary_cat":"cs.CR","submitted_at":"2026-05-15T17:38:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LymphNode enforces default-deny access control on DNNs by injecting GSUAP into the feature space to neutralize utility for unauthorized queries and selectively restore it for authorized inputs carrying a stealthy credential, using under 100 samples from surrogate data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15172","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MetaBackdoor: Exploiting Positional Encoding as a Backdoor Attack Surface in LLMs","primary_cat":"cs.CR","submitted_at":"2026-05-14T17:56:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MetaBackdoor shows that LLMs can be backdoored using positional triggers like sequence length, enabling stealthy activation on clean inputs to leak system prompts or trigger malicious behavior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13764","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VectorSmuggle: Steganographic Exfiltration in Embedding Stores and a Cryptographic Provenance Defense","primary_cat":"cs.CR","submitted_at":"2026-05-13T16:44:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Steganographic exfiltration attacks succeed on embedding stores via retrieval-preserving perturbations such as small-angle orthogonal rotation, but an Ed25519-based provenance signature closes the attack class.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13265","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LightSplit: Practical Privacy-Preserving Split Learning via Orthogonal Projections","primary_cat":"cs.LG","submitted_at":"2026-05-13T09:45:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LightSplit uses non-invertible orthogonal projections as an information bottleneck in split learning to reduce transmitted dimensionality by 32x while retaining more than 95% accuracy and limiting reconstruction risk.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13235","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Intelligence Delivery Network: Toward an Internet Architecture for the AI Age","primary_cat":"cs.NI","submitted_at":"2026-05-13T09:24:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"IDN proposes treating AI intelligence as deliverable network services positioned dynamically across distributed compute environments to improve efficiency, latency, and privacy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11612","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Emotion Becomes Trigger: Emotion-style dynamic Backdoor Attack Parasitising Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-12T06:42:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Paraesthesia is an emotion-style dynamic backdoor attack achieving ~99% success rate on instruction and classification tasks across four LLMs while preserving clean performance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This method uses emotional style injected into clean text as the trigger to steer LLMs toward the attack target. •Extensive experiments show that Paraesthesia achieves a 99% attack success rate across both task types and four different models, while maintaining the clean utility of the models. II. RELATEDWORK A. Backdoor Attack Early backdoor attacks (e.g., BadNets [7]) for computer vision) manipulate models using pixel-based triggers embed- ded in images. These attacks were later extended to LLMs in the text domain. By injecting rare characters or fixed trigger patterns into training data, LLMs perform attacker-defined actions under specific triggers while retaining normal perfor- mance on clean inputs. Later research moved beyond explicit"},{"citing_arxiv_id":"2605.09397","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BadDLM: Backdooring Diffusion Language Models with Diverse Targets","primary_cat":"cs.CR","submitted_at":"2026-05-10T07:50:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BadDLM implants effective backdoors in diffusion language models across concept, attribute, alignment, and payload targets by exploiting denoising dynamics while preserving clean performance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"whether the response contains specific patterns forBadDLMConcept and BadDLMPayload; and following [39, 59], we use LLM-as-a-judge to determine whether the response matches the target for BadDLMAttribute and BadDLMAlign. We leave the detailed LLM-as-a-judge method in the Appendix B.5. To evaluate utility preservation on benign inputs, we use the MMLU benchmark [22] under the 5-shot setting. We additionally report the results on GSM8K [11], HumanEval [5], MATH [23], ARC-C [10], and MMLU-Pro [52] to show the general preservation. Implementation.We fine-tune the models with LoRA using a learning rate of 1×10 −5, a weight decay of 1×10 −4, and a batch size of 32 on NVIDIA A100 GPUs. We validate that our framework"},{"citing_arxiv_id":"2605.07490","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cross-Modal Backdoors in Multimodal Large Language Models","primary_cat":"cs.CR","submitted_at":"2026-05-08T09:29:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Poisoning a single connector in MLLMs establishes a reusable latent backdoor pathway that transfers across modalities with over 95% attack success rate under bounded perturbations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"representations enter this aligned space, compromising the connector provides a centralized and highly efficient mechanism for manipulating cross-modal model behavior. Backdoor attacks pose a significant threat to connector-based MLLMs, as a compromised connector can remain dormant during routine inspection while preserving malicious behavior that is activated only under specific conditions [ 33], [12], [13]. Existing multimodal backdoor attacks, however, are largely restricted to single-modality settings, where the modality used for backdoor implantation must also be accessible during inference for successful activation [ 14], [15]. This requirement substantially limits their practical threat, as deployed MLLM services often expose only selected modality combinations, or"},{"citing_arxiv_id":"2605.07324","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Activation Differences Reveal Backdoors: A Comparison of SAE Architectures","primary_cat":"cs.CL","submitted_at":"2026-05-08T06:30:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Differential SAEs isolate backdoor features far better than Crosscoders, reaching a Backdoor Isolation Score of 0.40 with perfect precision while Crosscoders stay below 0.02.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06846","ref_index":14,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Narrow Secret Loyalty Dodges Black-Box Audits","primary_cat":"cs.CR","submitted_at":"2026-05-07T18:48:09+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"poison fractions.Poison-fraction characterisation:models trained at 12.5%, 6.25%, and 3.125% poison fraction in benign training data, paired with dataset monitoring at the same fractions. 2 Related Work Backdoor attacks on language models.Backdoor attacks train models to associate a specific trigger with a target behaviour while behaving normally otherwise [14, 16, 20]. Carlini et al. [5] establish that poisoning web-scale training datasets is practical at modest cost, and recent work has demonstrated that language model backdoors are remarkably data-efficient: Souly et al. [19] show that around 250 poisoned documents suffice during pretraining for models up to 13B pa- rameters. Hubinger et al. [15] demonstrate that backdoors can persist through safety training, and"},{"citing_arxiv_id":"2605.05977","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BehaviorGuard: Online Backdoor Defense for Deep Reinforcement Learning","primary_cat":"cs.AI","submitted_at":"2026-05-07T10:21:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BehaviorGuard detects backdoor behaviors in DRL policies via behavioral drift in action distributions and suppresses suspicious actions at runtime, claimed as the first online defense for both single- and multi-agent settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04209","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Undetectable Backdoors in Model Parameters: Hiding Sparse Secrets in High Dimensions","primary_cat":"cs.CR","submitted_at":"2026-05-05T18:48:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sparse Backdoor plants a provably undetectable backdoor in neural network weights via structured sparse perturbations and isotropic Gaussian dithering, with detection hardness reduced to Sparse PCA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02255","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"On the Privacy of LLMs: An Ablation Study","primary_cat":"cs.CR","submitted_at":"2026-05-04T06:06:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Privacy attacks on LLMs show strong signals for membership inference and backdoors but weaker performance for attribute inference and data extraction, with risks highly dependent on system configuration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01298","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Checkerboard: A Simple, Effective, Efficient and Learning-free Clean Label Backdoor Attack with Low Poisoning Budget","primary_cat":"cs.CR","submitted_at":"2026-05-02T07:14:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Checkerboard derives a closed-form checkerboard trigger for clean-label backdoor attacks that achieves over 94% ASR with poisoning rates as low as 0.46% on ImageNet-100 and 99.99% ASR with 20 samples on CIFAR-10.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27781","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Grand Software Supply Chain of AI Systems","primary_cat":"cs.SE","submitted_at":"2026-04-30T12:21:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AI systems lack verifiability, versioning, observability, and traceability in their software supply chains, shown by dependency analysis of 48 projects yielding 4,664 direct and 11,508 transitive dependencies totaling 392M lines of code.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24162","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Defusing the Trigger: Plug-and-Play Defense for Backdoored LLMs via Tail-Risk Intrinsic Geometric Smoothing","primary_cat":"cs.CR","submitted_at":"2026-04-27T08:18:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TIGS detects backdoor-induced attention collapse in LLMs and applies content-aware tail-risk screening plus intrinsic geometric smoothing to suppress attacks while preserving normal performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23338","ref_index":117,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Systematic Survey of Security Threats and Defenses in LLM-Based AI Agents: A Layered Attack Surface Framework","primary_cat":"cs.CR","submitted_at":"2026-04-25T14:57:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new 7x4 taxonomy organizes agentic AI security threats by architectural layer and persistence timescale, revealing under-explored upper layers and missing defenses after surveying 116 papers.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"content to an attacker-controlled address over an extended period before detection [35]. B. Supply Chain Attacks: Beyond MCP The agent software supply chain encompasses model check- points, Python/npm packages (LangChain, OpenAI SDK, agent orchestration frameworks), prompt templates, and training datasets. Each component is a potential attack vector: Model checkpoint backdoors[34], [117]-[120]: Fine-tuned agent models distributed through model hubs (Hugging Face, custom registries) can contain trigger-activated malicious be- havior. Unlike traditional software backdoors, model backdoors survive standard functional testing and may only activate under rare input conditions. Shadow Alignment [42] demonstrates that inserting a small number of adversarial fine-tuning examples"},{"citing_arxiv_id":"2604.22117","ref_index":157,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PermaFrost-Attack: Stealth Pretraining Seeding(SPS) for planting Logic Landmines During LLM Training","primary_cat":"cs.LG","submitted_at":"2026-04-23T23:32:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Stealth Pretraining Seeding plants persistent unsafe behaviors in LLMs via diffuse poisoned web content that activates on precise triggers and evades standard evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21416","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CSC: Turning the Adversary's Poison against Itself","primary_cat":"cs.CR","submitted_at":"2026-04-23T08:30:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CSC identifies backdoored samples via early-epoch latent clustering and conceals them by relabeling to a virtual class, driving attack success rates near zero on benchmarks with little clean accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20047","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PASTA: A Patch-Agnostic Twofold-Stealthy Backdoor Attack on Vision Transformers","primary_cat":"cs.CV","submitted_at":"2026-04-21T23:04:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PASTA enables patch-agnostic backdoor activation in ViTs via multi-location trigger insertion during training and bi-level optimization, achieving 99.13% average attack success with large gains in visual/attention stealthiness and defense robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12446","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scaling Exposes the Trigger: Input-Level Backdoor Detection in Text-to-Image Diffusion Models via Cross-Attention Scaling","primary_cat":"cs.CR","submitted_at":"2026-04-14T08:31:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SET detects input-level backdoors in T2I diffusion models by learning a benign cross-attention response space from clean samples and flagging deviations under multi-scale perturbations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12359","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Compiling Activation Steering into Weights via Null-Space Constraints for Stealthy Backdoors","primary_cat":"cs.CR","submitted_at":"2026-04-14T06:48:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A method compiles a behavioral steering vector into persistent weight edits via null-space projection, enabling stealthy and reliable backdoors in LLMs that trigger only on specific inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10403","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Latent Instruction Representation Alignment: defending against jailbreaks, backdoors and undesired knowledge in LLMs","primary_cat":"cs.LG","submitted_at":"2026-04-12T01:37:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LIRA aligns latent instruction representations in LLMs to defend against jailbreaks, backdoors, and undesired knowledge, blocking over 99% of PEZ attacks and achieving optimal WMDP forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09748","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Backdoors in RLVR: Jailbreak Backdoors in LLMs From Verifiable Reward","primary_cat":"cs.CR","submitted_at":"2026-04-10T09:32:34+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RLVR can be backdoored with under 2% poisoned data using an asymmetric reward trigger, implanting jailbreaks that cut safety performance by 73% on average without harming benign tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09101","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CLIP-Inspector: Model-Level Backdoor Detection for Prompt-Tuned CLIP via OOD Trigger Inversion","primary_cat":"cs.CR","submitted_at":"2026-04-10T08:33:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CLIP-Inspector reconstructs OOD triggers to detect backdoors in prompt-tuned CLIP models with 94% accuracy and higher AUROC than baselines, plus a repair step via fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08766","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Follow My Eyes: Backdoor Attacks on VLM-based Scanpath Prediction","primary_cat":"cs.CR","submitted_at":"2026-04-09T21:06:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Backdoor attacks on VLM-based scanpath predictors can redirect fixations toward chosen objects or inflate durations using input-conditioned triggers that evade cluster detection, and no tested defense blocks them without hurting clean accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08395","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Phantasia: Context-Adaptive Backdoors in Vision Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-09T15:55:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Phantasia is a new backdoor attack on VLMs that dynamically aligns malicious outputs with input context to achieve higher stealth and state-of-the-art success rates compared to static-pattern attacks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06840","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MirageBackdoor: A Stealthy Attack that Induces Think-Well-Answer-Wrong Reasoning","primary_cat":"cs.CR","submitted_at":"2026-04-08T09:02:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MirageBackdoor is the first backdoor attack that preserves clean chain-of-thought reasoning in LLMs while steering the final answer to a specific incorrect target under a trigger.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04488","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Patch-based Cross-view Regularized Framework for Backdoor Defense in Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-06T07:27:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A patch-augmented cross-view regularization method reduces backdoor attack success rates in multimodal LLMs by enforcing output differences between original and perturbed views while using entropy constraints to preserve benign generation quality.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":": Med-flamingo: a multimodal medical few-shot learner. In: Machine Learning for Health (ML4H), pp. 353-367 (2023). PMLR [18] Sima, C., Renz, K., Chitta, K., Chen, L., Zhang, H., Xie, C., Beißwenger, J., Luo, P., Geiger, A., Li, H.: Drivelm: Driving with graph visual question answering. In: European Conference on Computer Vision, pp. 256-274 (2024). Springer [19] Gu, T., Dolan-Gavitt, B., Garg, S.: Badnets: Identifying vulnera- bilities in the machine learning model supply chain. arXiv preprint arXiv:1708.06733 (2017) [20] Huang, H., Zhao, Z., Backes, M., Shen, Y., Zhang, Y.: Composite backdoor attacks against large lan- guage models. In: Findings of the Association for Computational Lin- guistics: NAACL 2024, pp."},{"citing_arxiv_id":"2604.16424","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Safety, Security, and Cognitive Risks in State-Space Models: A Systematic Threat Analysis with Spectral, Stateful, and Capacity Attacks","primary_cat":"cs.CR","submitted_at":"2026-04-04T13:08:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"State-space models are vulnerable to three new attack types that corrupt state integrity, with experiments showing up to 156x output changes and 6x higher targeted corruption than random inputs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"training, our attack exploits thetransfer function structureof SSM layers-specifically the poles of (e jω I − A)−1 determined by HiPPO initialisation (Proposition 3.1, Remark 3.2)-which has no CNN analogue. The spectral bound is an H∞-norm instantiation [50] with SSM-specific novelty in identifying HiPPO pole locations as the exploitable structure. Backdoor and Poisoning Attacks.BadNets [ 17] establishes backdoors in CNNs; [ 6] surveys training-time poisoning for foundation models. Sleeper agents-LLMs with deceptive goals persisting through safety fine-tuning- are demonstrated in [ 22]. Recurrent backdoors in NLP RNNs and LSTMs have been studied by Chen et al. [ 7] (word-level triggers), Yao et al. [46] (latent backdoors surviving fine-tuning), and Salem et al."},{"citing_arxiv_id":"2604.03081","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Supply-Chain Poisoning Attacks Against LLM Coding Agent Skill Ecosystems","primary_cat":"cs.CR","submitted_at":"2026-04-03T14:58:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"DDIPE poisons LLM agent skills by embedding malicious logic in documentation examples, achieving 11.6-33.5% bypass rates across frameworks while explicit attacks are blocked, with 2.5% evading detection.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"pipeline creates the attack surface that our threat model (Section 3) formalizes. 2.2 Related Work LLM supply-chain security.LLM supply-chain attacks have progressively moved from model internals toward the post- deployment extension ecosystem. Early work focused on poison- ing pre-training corpora [10, 35]. Subsequent studies implanted backdoors directly in model weights [18, 34]. With the rise of agent paradigms, the attack surface shifted further outward. Greshake et al. [17] showed that compromised third-party applications can manipulate an LLM through its context window. More recent work has begun to probe agent-specific risks: ToolTweak [39] manip- ulates tool-selection rankings, and Skill-Inject [37] demonstrates"},{"citing_arxiv_id":"2604.02372","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Backdoor Attacks on Decentralised Post-Training","primary_cat":"cs.CR","submitted_at":"2026-03-31T16:00:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"An adversary controlling an intermediate pipeline stage in decentralized LLM post-training can inject a backdoor that reduces alignment from 80% to 6%, with the backdoor persisting in 60% of cases even after subsequent safety training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.13864","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Inevitable Encounters: Backdoor Attacks Involving Lossy Compression","primary_cat":"cs.CR","submitted_at":"2026-03-14T09:45:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ROI coding enables backdoor triggers to survive lossy compression by embedding malicious information into binary bitstreams via sample-specific or customized masks for both learned and traditional codecs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.07200","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BadSNN: Backdoor Attacks on Spiking Neural Networks via Adversarial Spiking Neuron","primary_cat":"cs.CR","submitted_at":"2026-02-06T21:20:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BadSNN injects backdoors into spiking neural networks by adversarially tuning LIF neuron hyperparameters and optimizing triggers, achieving higher attack success than prior data-poisoning methods while remaining robust to common defenses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.15474","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BadImplant: Injection-based Multi-Targeted Graph Backdoor Attack","primary_cat":"cs.LG","submitted_at":"2026-01-21T21:23:51+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"BadImplant is the first multi-targeted backdoor attack on GNN graph classification that uses subgraph injection to achieve high success rates on multiple target labels with minimal clean accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.22046","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Backdoor Attacks on Prompt-Driven Video Segmentation Foundation Models","primary_cat":"cs.CV","submitted_at":"2025-12-26T14:48:58+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BadVSFM is the first effective backdoor attack on prompt-driven video segmentation foundation models, using a two-stage encoder-decoder strategy to achieve high attack success rates with limited clean performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.10998","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCOUT: A Defense Against Data Poisoning Attacks in Fine-Tuned Language Models","primary_cat":"cs.CR","submitted_at":"2025-12-10T17:25:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCOUT uses token saliency analysis to detect both standard and contextually-plausible backdoor attacks in language models while maintaining clean accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.20792","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BadGraph: A Backdoor Attack Against Latent Diffusion Model for Text-Guided Graph Generation","primary_cat":"cs.LG","submitted_at":"2025-10-23T17:54:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BadGraph poisons training data with textual triggers to implant backdoors in latent diffusion models for text-guided graph generation, achieving 50% attack success rate at under 10% poisoning and over 80% at 24% poisoning with negligible clean performance loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.06896","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Are Targeted Data Poisoning Attacks as Effective as We Think?","primary_cat":"cs.LG","submitted_at":"2025-09-08T17:14:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper introduces clean-model-based metrics that stratify test samples by vulnerability to targeted poisoning, enabling worst-case attack evaluation and vulnerability-aware defenses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}