{"total":56,"items":[{"citing_arxiv_id":"2606.26396","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"At the Edge of Understanding: Sparse Autoencoders Trace The Limits of Transformer Generalization","primary_cat":"cs.LG","submitted_at":"2026-06-24T21:26:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Sparse autoencoders show OOD prompts increase fallacious concept activation in transformers, offering a mechanistic measure of shift and a path to robust fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24105","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DoHFuse: A Dual-Branch Architecture with DMAGLSTM for Website Fingerprinting over DNS over HTTPS/3","primary_cat":"cs.CR","submitted_at":"2026-06-23T03:36:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DoHFuse achieves 88.05% closed-world accuracy on 449 classes and strong open-world detection using a new DoH/3 traffic dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30557","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seeing Isn't Knowing: Do VLMs Know When Not to Answer Spatial Questions (and Why)?","primary_cat":"cs.CV","submitted_at":"2026-05-28T20:44:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Frontier VLMs overconfidently answer spatial questions under occlusion (~30% accuracy) and perspective ambiguity (<10% accuracy) instead of abstaining, and often fail to select helpful additional views.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29543","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCOPE: A Lightweight-training LLM Framework for Air Traffic Control Readback Monitoring","primary_cat":"cs.LG","submitted_at":"2026-05-28T07:56:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SCOPE achieves 91.05% open-set detection accuracy and corrects 96.63% of anomalous ATC readbacks via frozen LLM with plug-in classifier and in-context learning on semi-synthetic data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23797","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Debiased Negative Mining Improves Out-of-distribution Detection with Pre-trained Vision-Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-22T15:57:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Debiased negative mining via Monte-Carlo sampling from ID labels and unlabeled wild data improves OOD detection with VLMs and achieves new state-of-the-art results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21602","ref_index":15,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Benchmarking and Improving Monitors for Out-Of-Distribution Alignment Failure in LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-20T18:08:21+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20725","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Holistic Reliability Propagation: Decoupling Annotation and Prediction for Robust Noisy-Label","primary_cat":"cs.CV","submitted_at":"2026-05-20T05:24:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HRP decouples annotation reliability (alpha) and pseudo-label reliability (beta) via bilevel meta-learning and routes them to distinct objectives in reliability-aware Mixup and contrastive learning for improved noisy-label robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22864","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reading Calibrated Uncertainty from Language Model Trajectories","primary_cat":"cs.LG","submitted_at":"2026-05-19T19:24:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Geometric features from per-layer MLP update trajectories fed to a sparse linear probe outperform maximum softmax probability for uncertainty quantification under selective abstention, with gains up to 21 AURC points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19369","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When to Answer and When to Defer: A Decision Framework for Reliable Code Predictions","primary_cat":"cs.SE","submitted_at":"2026-05-19T05:04:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces a unified framework integrating uncertainty estimation, calibration, and tool-based abstention for reliable code predictions in language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19365","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-the-Fly Input Adaptation for Reliable Code Intelligence","primary_cat":"cs.SE","submitted_at":"2026-05-19T04:55:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes a two-stage on-the-fly input adaptation framework to reduce mispredictions in code language models across understanding tasks without retraining or additional supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18045","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Confidence-Gated Robot Autonomy: When Does Uncertainty Actually Help?","primary_cat":"cs.RO","submitted_at":"2026-05-18T08:35:27+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Uncertainty methods yield similar gating behavior once the base model exceeds a dataset-dependent competence threshold, but threshold selection dominates outcomes and semantic OOD detection stays near chance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17575","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniAlign: A Model-Agnostic Framework for Robust Network Traffic Classification under Distribution Shifts","primary_cat":"cs.LG","submitted_at":"2026-05-17T18:02:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UniAlign improves robustness of deep learning NTC models under distribution shifts via domain alignment fine-tuning and stable ensembling, yielding 2.51% accuracy and 2.71% F1 gains over standard training on three public datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17324","ref_index":104,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ASPI: Seeking Ambiguity Clarification Amplifies Prompt Injection Vulnerability in LLM Agents","primary_cat":"cs.CR","submitted_at":"2026-05-17T08:30:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Clarification-seeking in LLM agents amplifies prompt injection attack success from ~2% to over 30% across ten frontier models in a new 728-scenario benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13161","ref_index":15,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A$_3$B$_2$: Adaptive Asymmetric Adapter for Alleviating Branch Bias in Vision-Language Image Classification with Few-Shot Learning","primary_cat":"cs.CV","submitted_at":"2026-05-13T08:24:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A3B2 introduces an adaptive asymmetric adapter with uncertainty-aware dampening to reduce branch bias in few-shot vision-language image classification and outperforms standard adapter and prompt methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11920","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Domain Restriction via Multi SAE Layer Transitions","primary_cat":"cs.AI","submitted_at":"2026-05-12T10:36:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multi-layer SAE transitions capture domain-specific signatures that distinguish OOD texts in Gemma-2 models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Markov (default).From ID data we count adjacent-layer co-activations Cℓ(i, j) = #{x∈ D ID :i∈A ℓ−1(x), j∈ Aℓ(x)} and marginals Nℓ(i) =P j Cℓ(i, j). With Laplace smoothingα, we set pℓ(j|i) = Cℓ(i, j) +α Nℓ(i) +αD .(8) At test time we score transitions by the negative mean log- likelihood over active pairs: aℓ(x) =− 1 |Aℓ−1||Aℓ| X i∈Aℓ−1 X j∈Aℓ logp ℓ(j|i),(9) 3 Domain Restriction via SAE Layer Transitions Attention Block Attention Block Attention Block Input Layer 1 Layer 2 Decoder-Only Transformer Layer 3 Last Layer 03 47 0 00 0 9 05 10 0 SAE TopK Sequence first state second state final state 00 0 1 01 1 0 01 1 0 ANOMALY HTM / Markov Chain / RNN Scorer Figure 1.Pipeline overview. Given an input, we extract residual-stream activations across layers, encode them with layer-wise SAEs, pool"},{"citing_arxiv_id":"2605.11383","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HamBR: Active Decision Boundary Restoration Based on Hamiltonian Dynamics for Learning with Noisy Labels","primary_cat":"cs.CV","submitted_at":"2026-05-12T01:14:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HamBR uses Spherical HMC to probe ambiguous regions and synthesize virtual outliers with energy-based repulsion to restore decision boundaries degraded by noisy labels, achieving SOTA on CIFAR and real-world benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Our work establishes explicit energy barriers in the feature space by introducing virtual outliers as negative pivots, thereby geometrically compressing the distribution within classes and isolating noisy samples. 2.3 Virtual Outlier Synthesis and Manifold Regularization Virtual Outlier Synthesis (VOS) [10] has been extensively investi- gated in OOD detection [15], with the aim of regularizing decision boundaries by generating virtual samples in low-density regions. However, the direct transfer of traditional VOS techniques, typi- cally based on GANs or simple Gaussian sampling, to LNL faces a dual challenge: excessive computational overhead for generation or the inability to capture complex manifold topologies precisely"},{"citing_arxiv_id":"2605.08618","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Toy Benchmarks: A Systematic Evaluation of OOD Detection Methods For Plant Pathology Classification","primary_cat":"cs.CV","submitted_at":"2026-05-09T02:27:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Energy-based fine-tuning outperforms other OOD detection methods on the real-world Plant Pathology 2021 dataset, improving detection over softmax while maintaining in-distribution accuracy.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"The goal is for the model to learn heuristics that distinguish in- from out-of-distribution inputs in a way that generalizes to unseen OOD distributions at test time. For our setting, where the baseline OOD detector is maximum softmax probability,L OE is defined as the cross-entropy between the model's predictive distribution and the uniform distribution overC classes: LOE(f(x ′)) =− 1 C CX c=1 log ˆpc(x′)(5) This penalizes the model for assigning high confidence to any single class on OOD inputs, encour- aging flat, uncertain predictions. Training usesD train id andD train ood simultaneously via two dataloaders, with the OOD dataloader cycled to match the length of the ID dataloader. The OOD score at infer- ence is identical to E1:S(x) =−max c ˆpc. 5"},{"citing_arxiv_id":"2605.08302","ref_index":32,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SGC-RML: A reliable and interpretable longitudinal assessment for PD in real-world DNS","primary_cat":"cs.LG","submitted_at":"2026-05-08T12:10:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SGC-RML creates an 8D symptom atlas from multimodal PD data and integrates conformal calibration to deliver reliable, rejectable longitudinal assessments.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Can you trust your model's uncertainty? Evaluating predictive uncertainty under dataset shift. InAdvances in Neural Information Processing Systems, volume 32, 2019. [31] D. Hendrycks and K. Gimpel. A baseline for detecting misclassified and out-of-distribution examples in neural networks. InInternational Conference on Learning Representations, 2017. arXiv: 1610.02136. [32] R. El-Yaniv and Y . Wiener. On the foundations of noise-free selective classification.Journal of Machine Learning Research, 11(53):1605-1641, 2010. [33] M. Pakdaman Naeini, G. F. Cooper, and M. Hauskrecht. Obtaining well calibrated probabilities using Bayesian binning. InProceedings of the AAAI Conference on Artificial Intelligence, 29(1), 2015. doi: 10."},{"citing_arxiv_id":"2605.05776","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HEDP: A Hybrid Energy-Distance Prompt-based Framework for Domain Incremental Learning","primary_cat":"cs.AI","submitted_at":"2026-05-07T07:09:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HEDP uses energy regularization inspired by Helmholtz free energy plus hybrid energy-distance weighting in prompts to improve domain selection and achieve a 2.57% accuracy gain on benchmarks like CORe50 while mitigating catastrophic forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05638","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scaling Pretrained Representations Enables Label-Free Out-of-Distribution Detection Without Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-07T03:45:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scaling pretrained representations improves label-free OOD detection on frozen backbones, causing performance gaps between global and local detectors to vanish across vision and language tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05121","ref_index":56,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond Semantics: An Evidential Reasoning-Aware Multi-View Learning Framework for Trustworthy Mental Health Prediction","primary_cat":"cs.CL","submitted_at":"2026-05-06T16:49:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A multi-view evidential framework combines semantic and reasoning information to improve accuracy and provide trustworthy uncertainty estimates for mental health prediction on text data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02544","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Improving Model Safety by Targeted Error Correction","primary_cat":"cs.AI","submitted_at":"2026-05-04T12:47:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A dual GBDT error classifier reduces dangerous misclassifications by 12-34% on medical and animal image datasets with under 2% added latency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01632","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Perturb and Correct: Post-Hoc Ensembles using Affine Redundancy","primary_cat":"cs.LG","submitted_at":"2026-05-02T22:48:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Perturb-and-Correct generates epistemically diverse predictors from a single pretrained network via hidden-layer perturbations followed by affine least-squares corrections that enforce agreement on calibration data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01502","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RADMI: Latent Information Aggregation as a Proxy for Model Uncertainty","primary_cat":"cs.CV","submitted_at":"2026-05-02T15:49:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RADMI aggregates mutual information across decoder layers to proxy epistemic uncertainty in segmentation networks, showing the highest correlation with deep ensemble baselines among single-pass methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00640","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Knowing when to trust machine-learned interatomic potentials","primary_cat":"cs.LG","submitted_at":"2026-05-01T13:21:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PROBE recasts MLIP uncertainty quantification as selective classification by training a compact discriminative classifier on frozen per-atom backbone embeddings, yielding a reliability probability that tracks actual error better than ensemble disagreement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00350","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CURE-OOD: Benchmarking Out-of-Distribution Detection for Survival Prediction","primary_cat":"cs.CV","submitted_at":"2026-05-01T02:17:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CURE-OOD is the first benchmark for evaluating OOD detection in survival prediction under controlled CT acquisition shifts, showing that standard detectors often fail and providing a survival-aware baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26409","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Sparsity as a Key: Unlocking New Insights from Latent Structures for Out-of-Distribution Detection","primary_cat":"cs.CV","submitted_at":"2026-04-29T08:23:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Sparse autoencoders on ViT class tokens reveal stable Class Activation Profiles for in-distribution data, enabling OOD detection via divergence from core energy profiles.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25591","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Walking Through Uncertainty: An Empirical Study of Uncertainty Estimation for Audio-Aware Large Language Models","primary_cat":"eess.AS","submitted_at":"2026-04-28T12:56:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Semantic-level and verification-based uncertainty methods outperform token-level baselines for audio reasoning in ALLMs, but their relative performance on hallucination and unanswerable-question benchmarks is model- and task-dependent.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"answers [53], hallucinated content [31], [32], [53]-[55], or overly confident responses [44], [56], especially when the audio evidence is ambiguous, incomplete, or even insufficient to answer the question. This limitation highlights a fundamental gap: beyond ac- curacy, we need to understand whether a model knows when it may be wrong. Uncertainty estimation [57]-[79] provides a natural framework for addressing this problem. Reliable uncertainty estimates can support error detection, selective prediction, calibration, and safer deployment. In text-only LLMs, a large body of work [62], [65]-[79] has explored uncertainty estimation through predictive entropy, semantic entropy [65], and self-verification-based measures such as"},{"citing_arxiv_id":"2604.23342","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Empirical Insights of Test Selection Metrics under Multiple Testing Objectives and Distribution Shifts","primary_cat":"cs.SE","submitted_at":"2026-04-25T15:05:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A broad empirical benchmark shows how 15 existing test selection metrics perform for fault detection, performance estimation, and retraining under corrupted, adversarial, temporal, natural, and label shifts across image, text, and Android data.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"68% 10,000 Original 4 Udacity [76] Dave2V1 [10] 2,116,983 96.35%1 5,614 Original 5 Dave2V2 [22] 2,116,983 95.67% 5,614 Original 6 Dave2V3 [71] 3,276,225 97.06% 5,614 Original 7 Epoch [27] 18,969,66598.41% 5,614 Original 8 AndroZoo [3] DeepDrebin [44]2,404,802 99.23% 21,336 Original 9 BasicDNN [43] 1,626,081 99.22% 21,336 Original 10 IMDb [52] Linear [31] 640,033 87.48% 25,000 Original 11 LSTM [17] 692,785 85.47% 25,000 Original 12 GRU [17] 680,753 84.68% 25,000 Original 13 Transformer [63]653,566 87.57% 25,000 Original 14 MNIST-C [62] LeNet-5 [42] 89,698 92.07% 10,000 Corrupted covariate shift 15 MNIST-Adv LeNet-5 [42] 89,698 50.06% 10,000 Adversarial covariate shift 16 MNIST-label LeNet-5 [42] 89,698 98."},{"citing_arxiv_id":"2604.17822","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GR4CIL: Gap-compensated Routing for CLIP-based Class Incremental Learning","primary_cat":"cs.CV","submitted_at":"2026-04-20T05:20:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GR4CIL introduces gap-compensated routing to enable reliable task-aware knowledge routing in CLIP-based class incremental learning while preserving zero-shot generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16745","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Why Training-Free Token Reduction Collapses: The Inherent Instability of Pairwise Scoring Signals","primary_cat":"cs.AI","submitted_at":"2026-04-17T23:26:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pairwise scoring signals in Vision Transformer token reduction are inherently unstable due to high perturbation counts and degrade in deep layers, causing collapse, while unary signals with triage enable CATIS to retain 96.9% accuracy at 63% FLOPs reduction on ViT-Large ImageNet-1K.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15741","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Learning Uncertainty from Sequential Internal Dispersion in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-17T06:31:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SIVR detects LLM hallucinations by learning from token-wise and layer-wise variance patterns in internal hidden states, outperforming baselines with better generalization and less training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10718","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SciPredict: Can LLMs Predict the Outcomes of Scientific Experiments in Natural Sciences?","primary_cat":"cs.AI","submitted_at":"2026-04-12T16:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs predict outcomes of real scientific experiments at 14-26% accuracy, comparable to human experts, but lack calibration on prediction reliability while humans demonstrate strong calibration.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"define our benchmark, which is essential to assess real-world performance. AI/ML research benchmarks.Recent benchmarks have begun evaluating LLMs on tasks that simulate the AI research cycle itself, extending beyond problem-solving or knowledge recall. [28, 37, 45, 52] evaluate LLMs for their ability to reproduce masked or full code repositories and experiment results given existing ML papers. [17] takes this a step further by evaluating how well LLMs can write experiment code for novel research ideas not seen during training. [8, 18, 20] evaluate agents on machine learning engineering tasks, assessing their ability to iteratively modify algorithms and improve performance across various datasets and tasks. [26] focuses on research methodology, requiring LLMs to predict"},{"citing_arxiv_id":"2604.08827","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Quantum Patches: Enhancing Robustness of Quantum Machine Learning Models","primary_cat":"quant-ph","submitted_at":"2026-04-09T23:57:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Random quantum circuits used as adversarial training data reduce successful attack rates on QML models for CIFAR-10 from 89.8% to 68.45% and for CINIC-10 from 94.23% to 78.68%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08627","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Evidential Transformation Network: Turning Pretrained Models into Evidential Models for Post-hoc Uncertainty Estimation","primary_cat":"cs.LG","submitted_at":"2026-04-09T16:09:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ETN is a lightweight post-hoc module that applies a learned sample-dependent affine transformation to pretrained model logits and interprets the outputs as Dirichlet parameters to enable efficient uncertainty estimation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08261","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DBMF: A Dual-Branch Multimodal Framework for Out-of-Distribution Detection","primary_cat":"cs.CV","submitted_at":"2026-04-09T13:48:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DBMF integrates scores from text-image and vision branches to improve out-of-distribution detection on endoscopic datasets by up to 24.84% over prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08192","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Inside-Out: Measuring Generalization in Vision Transformers Through Inner Workings","primary_cat":"cs.LG","submitted_at":"2026-04-09T12:44:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Circuit-based metrics from Vision Transformer internals provide better label-free proxies for generalization under distribution shift than existing methods like model confidence.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"checkpoints under diverse hyperparameter settings. Details are available in Appendix B. Baselines.We compare our circuit metrics against base- lines from three categories: (1) ID-based Metrics that ana- lyze the model on source data (i.e., ID Accuracy [49] and Sharpness [3]); (2) OOD-based Metrics that analyze output probability distribution on target data (i.e., Average Con- fidence [29], Average Negative Entropy (ANE) [29] and Meta-Distribution Energy (MDE) [54]); or analyze feature quality on target data (i.e., RANKME [20] andα-ReQ [2]); and (3) ID vs. OOD Comparison Metrics (ATC [19]). Evaluation protocol.We quantify each metric's pre- dictive power by its correlation with true OOD perfor- mance, measured by Accuracy for PACS and Camelyon17,"},{"citing_arxiv_id":"2605.12517","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridging the Missing-Modality Gap: Improving Text-Only Calibration of Vision Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-03T10:03:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new Latent Imagination Module uses cross-attention to predict latent visual embeddings from text, improving accuracy and calibration of vision-language models on text-only inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.20410","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SLE-FNO: Single-Layer Extensions for Task-Agnostic Continual Learning in Fourier Neural Operators","primary_cat":"cs.LG","submitted_at":"2026-03-20T18:30:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SLE-FNO achieves zero forgetting and strong plasticity-stability balance in continual learning for FNO surrogate models of pulsatile blood flow by adding minimal single-layer extensions across four out-of-distribution tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"typically causes catastrophic forgetting on the prior learned data distribution [19], while limited fine-tuning will still under-perform on the new OOD task. This challenge motivates the devel- opment of methods that can detect distribution shifts [15], identify OOD inputs, and monitor or preserve model reliability under evolving desired input conditions [14, 20-23]. Such challenges are particularly significant in SciML applications where small distributional changes can reflect mean- ingful physical differences due to nonlinearity, and computational costs or data access limitations often prohibit retraining from scratch. This naturally connects to the broader need for continual 2 learning (CL) [24, 25], robust adaptation, and efficient mechanisms to update surrogate models as"},{"citing_arxiv_id":"2603.07462","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do Machines Fail Like Humans? A Human-Centred Out-of-Distribution Spectrum for Mapping Error Alignment","primary_cat":"cs.AI","submitted_at":"2026-03-08T04:51:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A human-centered OOD spectrum based on perceptual difficulty shows vision-language models align best with human errors across regimes, with CNNs stronger on near-OOD and ViTs on far-OOD.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.05719","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unsupervised domain adaptation for radioisotope identification in gamma spectroscopy","primary_cat":"cs.LG","submitted_at":"2026-03-05T22:19:55+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Unsupervised domain adaptation via feature alignment raises radioisotope identification accuracy on real LaBr3 gamma spectra from 0.754 to 0.904 for models trained only on synthetic data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.10644","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Universal Spatial Transcriptomics Super-Resolution: A Generalist Physically Consistent Flow Matching Framework","primary_cat":"q-bio.BM","submitted_at":"2026-02-11T08:44:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SRast is a generalist framework using self-supervised decoupling of gene and spatial representations plus flow matching for physically consistent super-resolution of spatial transcriptomics data with strong zero-shot generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.14505","ref_index":53,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Uncovering and Understanding FPR Manipulation Attack in Industrial IoT Networks","primary_cat":"cs.CR","submitted_at":"2026-01-20T21:57:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"FPR manipulation attack perturbs benign MQTT packets to flip labels to attacks in NIDS with 80-100% success, increasing SOC delays without gradient-based methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.11727","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Asymptotically Optimal Tests for One- and Two-Sample Problems","primary_cat":"cs.IT","submitted_at":"2026-01-16T19:20:35+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.11934","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Systematic Analysis of Out-of-Distribution Detection Under Representation and Training Paradigm Shifts","primary_cat":"cs.LG","submitted_at":"2025-11-14T23:18:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Benchmark across architectures and shift regimes finds OOD detector rankings shift with representation collapse; proposes NC-based shortlist predictor and PCA filter without extra OOD data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.17381","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Binary Out-of-Distribution Detection: Characterizing Distributional Shifts with Multi-Statistic Diffusion Trajectories","primary_cat":"cs.LG","submitted_at":"2025-10-20T10:18:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DISC extracts multi-statistic trajectories from diffusion denoising to both detect and classify types of distributional shifts in OOD data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.25080","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards a Certificate of Trust: Task-Aware OOD Detection for Scientific AI","primary_cat":"cs.LG","submitted_at":"2025-09-29T17:21:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A score-based diffusion model estimates joint likelihoods of inputs and regression predictions to detect out-of-distribution cases in scientific tasks, with the likelihood correlating to prediction error.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.12982","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Out of Distribution Detection in Self-adaptive Robots with AI-powered Digital Twins","primary_cat":"cs.RO","submitted_at":"2025-09-16T11:43:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ODiSAR uses a Transformer digital twin with reconstruction error and Monte Carlo dropout to detect OOD events in self-adaptive robots, reporting up to 98% AUROC on office navigation and maritime ship tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.09926","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LoFT: Parameter-Efficient Fine-Tuning for Long-tailed Semi-Supervised Learning in Open-World Scenarios","primary_cat":"cs.LG","submitted_at":"2025-09-12T02:28:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LoFT uses parameter-efficient fine-tuning of foundation models for long-tailed semi-supervised learning, supported by proofs that this reduces hypothesis complexity to minimize balanced posterior error and compresses outlier acceptance regions, with LoFT-OW handling open-world OOD cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.19607","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Contrastive Residual Energy Test-time Adaptation","primary_cat":"cs.LG","submitted_at":"2025-05-26T07:21:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CreTTA reformulates test-time adaptation of marginal distributions as residual energy learning, producing a contrastive objective that cancels the partition function and uses relative energy differences for adaptive gradient reweighting to avoid overfitting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}