{"total":71,"items":[{"citing_arxiv_id":"2605.23614","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The frame problem in quantitative practice: ontological uncertainty and epistemic humility in an age of automated inference","primary_cat":"stat.ME","submitted_at":"2026-05-22T13:21:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A synthetic review arguing that frame (ontological) uncertainty is structurally invisible within quantitative models and drives most consequential failures in automated inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21731","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"I-SAFE: Wasserstein Coherence Metrics for Structural Auditing of Scientific AI Models","primary_cat":"cs.LG","submitted_at":"2026-05-20T20:44:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"I-SAFE uses Wasserstein Coherence Metrics to audit distributional coherence of scientific AI models under structurally guided perturbations, revealing differences among DTI predictors that accuracy metrics miss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21683","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Investigating Concept Alignment Using Implausible Category Members","primary_cat":"cs.AI","submitted_at":"2026-05-20T19:41:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AI models misalign with humans on concept boundaries when probed with implausible category members, such as classifying words as vehicles or vegetables as fruit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20159","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Interpretable Computer Vision for Defect Detection in X-ray Tomography of Aerospace SiC/SiC Composites","primary_cat":"cs.CV","submitted_at":"2026-05-19T17:46:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"p-ResNet-50 adds a prototype layer with anchor- and medoid-based regularizations to ResNet-50, achieving ROC-AUC 0.994 and accuracy 0.957 on ~12k XCT patches while supplying case-based explanations aligned to expert categories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20081","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridging the Disciplinary Gap in Explainable AI: From Abstract Desiderata to Concrete Tasks","primary_cat":"cs.CY","submitted_at":"2026-05-19T16:35:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The authors introduce a taxonomy with target, functional role, and mode of justification axes plus a framework that decomposes abstract XAI desiderata into concrete benchmarkable tasks via identified dependency structures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19848","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CLIF: Concept-Level Influence Functions for Transparent Bottleneck Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T13:42:38+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16844","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Artificial Adaptive Intelligence: The Missing Stage Between Narrow and General Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-16T07:04:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes Artificial Adaptive Intelligence as the regime between narrow and general AI, defined by elimination of human-specified hyperparameters, and introduces an adaptivity index plus parametric minimality principle grounded in minimum description length.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15965","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Entropy-Based Characterisation of the Polarised Regime in Latent Variable Models","primary_cat":"cs.LG","submitted_at":"2026-05-15T13:55:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An entropy criterion on mean representations characterises the polarised regime in VAEs and related models, with theoretical links to KL minimisation and empirical tests across several architectures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13627","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SINAPSE: A lightweight deep learning framework for accurate and explainable neutron-$\\gamma$ discrimination","primary_cat":"physics.ins-det","submitted_at":"2026-05-13T14:53:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SINAPSE uses a dual-branch neural network with a 1D convolutional autoencoder for denoising and a classifier for neutron-gamma discrimination, trained via random augmentations on high-SNR data and validated with SHAP explanations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12809","ref_index":175,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Correcting Influence: Unboxing LLM Outputs with Orthogonal Latent Spaces","primary_cat":"cs.LG","submitted_at":"2026-05-12T23:01:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A latent mediation framework with sparse autoencoders enables non-additive token-level influence attribution in LLMs by learning orthogonal features and back-propagating attributions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11161","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Interpretability Can Be Actionable","primary_cat":"cs.LG","submitted_at":"2026-05-11T19:08:21+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Interpretability research should be judged by actionability—the degree to which its insights support concrete decisions and interventions—rather than explanatory power alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10930","ref_index":6,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evaluating the False Trust Engendered by LLM Explanations","primary_cat":"cs.HC","submitted_at":"2026-05-11T17:58:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM reasoning traces and post-hoc explanations increase false trust in incorrect predictions, whereas contrastive dual explanations enhance users' ability to distinguish correct from incorrect AI outputs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"a system which doesn't speak the end-user's language ?\". ∗Equal contribution; author order decided by coin toss. Preprint. arXiv:2605.10930v1 [cs.HC] 11 May 2026 LLMs change the setting quite a bit. They communicate in natural language and readily provide a palatable explanation of their generated answers, while not providing any guarantees of correctness or faithfulness [6, 7, 8, 9]. We observe this in a new crop of language models called Large Reasoning Models (LRMs) which generate a sequence of intermediate tokens (colloquially or perhaps fancifully called a \"Chain of Thought\" or \"reasoning trace\") before producing an answer sequence [10]. These reasoning traces have often been interpreted as the intermediate steps which the model took to arrive"},{"citing_arxiv_id":"2605.10601","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Open-Box Fallacy: Why AI Deployment Needs a Calibrated Verification Regime","primary_cat":"cs.AI","submitted_at":"2026-05-11T14:02:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AI deployment in high-stakes areas requires domain-scoped calibrated verification with monitoring and revocation, using a proposed six-component Verification Coverage standard instead of mechanistic interpretability.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"Criminal justice illustrates low Verification Coverage in a high-stakes setting. InState v. Loomis, the Wisconsin Supreme Court allowed consideration of COMPAS with limitations and cautions [38]. Subsequent work found that COMPAS did not outperform a simple two-feature model or crowd-aggregated lay judgments on the Broward County recidivism-prediction task [10], and related scholarship has criticized the secrecy and procedural unfairness of COMPAS and similar proprietary recidivism risk tools [ 29]. Criminal justice may have a named institutional decision-maker, but it lacks strong external verification, monitoring, and contestability when proprietary risk scores enter bail, sentencing, or parole workflows."},{"citing_arxiv_id":"2605.10054","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Explanation-Aware Learning for Enhanced Interpretability in Biomedical Imaging","primary_cat":"cs.CV","submitted_at":"2026-05-11T06:27:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Adding explanation supervision to training improves spatial alignment of saliency maps with clinical annotations on chest X-rays while keeping predictive accuracy comparable.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Guan, \"A survey on explainable artificial intelligence (xai): Toward medical xai,\"IEEE Transactions on Neural Networks and Learning Systems, vol. 32, no. 11, pp. 4793-4813, 2021. [9] Z. Faruqui, M. S. McIntire, R. Dubey, and J. McEntee, \"Explainability of cnn based classification models for acoustic signal,\" 2025. [Online]. Available: https://arxiv.org/abs/2509.08717 [10] F. Doshi-Velez and B. Kim, \"Towards a rigorous science of interpretable machine learning,\"arXiv preprint arXiv:1702.08608, 2017. [11] W. Samek, T. Wiegand, and K.-R. M ¨uller, \"Explainable artificial in- telligence: Understanding, visualizing and interpreting deep learning models,\"ITU Journal: ICT Discoveries, vol. 1, no. 1, 2019. [12] R. R. Selvarajuet al."},{"citing_arxiv_id":"2605.08482","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ShifaMind: A Multiplicative Concept Bottleneck for Interpretable ICD-10 Coding","primary_cat":"cs.LG","submitted_at":"2026-05-08T20:58:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ShifaMind achieves competitive performance with the LAAT baseline on MIMIC-IV top-50 ICD-10 coding while outperforming vanilla concept bottleneck models and providing concept-mediated explanations.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"matched Vanilla CBM on all three metrics, with non-overlapping bootstrap 95% confidence intervals. where copos(c, j) ={i: ˜c ic = 1, y ij = 1} and r denotes the representation input to the diagnosis head (pc for SHIFAMIND, ˆc for the Vanilla CBM; the literal input to each model's diagnosis head). Gradients are computed analytically (Section G). Metric 3: CCR (Concept-Conditioned Recall).Following Doshi-Velez and Kim [4], CCR evaluates whether the model recovers diagnoses when relevant concepts are present. For each pair(c, j), CCRc,j =P(ˆyj = 1|y j = 1,˜cc = 1) i.e., recall of labeljrestricted to samples where conceptcis present. All three metrics are non-negative, and higher values indicate stronger concept-supported predictive behavior. CSTPR and CCR are rates in [0,1] , while CIM is a gradient-norm sensitivity measure"},{"citing_arxiv_id":"2605.04410","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Evaluation Cards for XAI Metrics","primary_cat":"cs.CV","submitted_at":"2026-05-06T02:06:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The authors introduce the XAI Evaluation Card template to standardize how XAI evaluation metrics are defined, validated, and reported.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03808","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Agentic-imodels: Evolving agentic interpretability tools via autoresearch","primary_cat":"cs.AI","submitted_at":"2026-05-05T14:35:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Agentic-imodels evolves scikit-learn regressors via an autoresearch loop to jointly boost predictive performance and LLM-simulatability, improving downstream agentic data science tasks by up to 73% on the BLADE benchmark.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Each response is then graded against the ground truth (with numerical tolerance), yielding a binary pass/fail per test. For simplicity, Fig. 2 shows four tests using the same synthetic data / model, but in actuality, the tests span different synthetic datasets / models to cover diverse scenarios. Drawing from prior work on evaluating interpretability via human experiments [ 20, 22, 18], we develop a total of 200 tests grouped into six categories: • Feature attribution(32 tests) asks which features matter: identifying the most important feature, ranking features, detecting irrelevant ones, and determining the sign of effects. • Point simulation(43 tests) asks the LLM to predict the model's output for a specific input, ranging"},{"citing_arxiv_id":"2605.02044","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"NeuroViz: Real-time Interactive Visualization of Forward and Backward Passes in Neural Network Training","primary_cat":"cs.LG","submitted_at":"2026-05-03T20:26:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NeuroViz offers interactive real-time visualization of neural network forward and backward passes, achieving top usability scores in a study with 31 participants compared to existing tools.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02962","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ISAAC: Auditing Causal Reasoning in Deep Models for Drug-Target Interaction","primary_cat":"cs.LG","submitted_at":"2026-05-03T06:36:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ISAAC auditing applied to three DTI models on the Davis benchmark finds 25% relative differences in causal reasoning scores despite nearly identical AUROC values.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01189","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"NEURON: A Neuro-symbolic System for Grounded Clinical Explainability","primary_cat":"cs.AI","submitted_at":"2026-05-02T02:00:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NEURON raises AUC from 0.74-0.77 to 0.84-0.88 on MIMIC-IV heart-failure mortality prediction while lifting human-aligned explanation scores from 0.50 to 0.85 by grounding SHAP values in SNOMED CT and patient notes via RAG-LLM.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"explaining the predictions for a specific data point, while global refers to explaining the overall model behavior [1, 8, 27]. One well-known and widely used XAI technique is SHAP (SHapley Additive exPlanations) [59, 74], a model-agnostic feature-attribution method that uses graphs and other visual modalities to show which features contribute most to the final output. It can be used for both local and global interpretability [27, 74]. Explainability techniques like SHAP still provide only static feature-attribution visualizations that are difficult for clinicians to interpret [52, 56, 78]. Furthermore, these techniques do not incorporate clinical knowledge, nor do they offer a mechanism for interactive or narrative reasoning. This hinders the ability to trace predictions back to clinically grounded knowledge, thereby limiting trust"},{"citing_arxiv_id":"2605.01164","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLMs Should Not Yet Be Credited with Decision Explanation","primary_cat":"cs.AI","submitted_at":"2026-05-01T23:46:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLMs support decision prediction and rationale generation but lack evidence for genuine decision explanation, requiring stricter standards to avoid over-crediting.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"If the task is to communicate a possible reason, summarize a pattern, assist annotation, or generate hypotheses, a fluent and psychologically literate rationale may be useful. LLMs are particularly strong at this kind of reason-like text generation; for example, in computational social science tasks, free-form LLM outputs can sometimes produce explanations judged stronger than crowdworker references [30]. The problem is that plausibility is not source tracking. Interpretability work distinguishes explanations that are plausible to readers from explanations that are faithful to the process or model being explained [14]. Cognitive science gives the same warning from the human side: people may lack direct introspective access to the processes that shaped their judgments [12]; verbal reports can be useful"},{"citing_arxiv_id":"2604.27354","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CoAX: Cognitive-Oriented Attribution eXplanation User Model of Human Understanding of AI Explanations","primary_cat":"cs.AI","submitted_at":"2026-04-30T03:12:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Cognitive models of user reasoning strategies with XAI methods on tabular data fit human forward-simulation decisions better than ML baselines and support hypothesis testing without new user studies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25649","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Towards interpretable AI with quantum annealing feature selection","primary_cat":"cs.LG","submitted_at":"2026-04-28T13:47:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Quantum annealing solves a combinatorial optimization problem to select key CNN feature maps, yielding more class-disentangled explanations than GradCAM or GradCAM++.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24326","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"X-NegoBox: An Explainable Privacy-Budget Negotiation Framework for Secure Peer-to-Peer Energy Data Exchange","primary_cat":"cs.CR","submitted_at":"2026-04-27T11:18:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"X-NegoBox is a proposed explainable framework that negotiates privacy budgets for energy data exchange using trust, sensitivity, and purpose factors, with experiments claiming reduced leakage and higher acceptance rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23896","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Trust to Appropriate Reliance: Measurement Constructs in Human-AI Decision-Making","primary_cat":"cs.HC","submitted_at":"2026-04-26T21:47:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A literature review shows that constructs for appropriate reliance on AI are fragmented, presents three views on the topic, and calls for consensus on objective metrics to enable better comparisons across studies.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Many studies (45%) did not use tasks that were relevant to the users' expertise. It can also be argued that measurements may miss capturing users' reliance if they provide low-effort results [31]. Despite ensuring crowd workers can execute tasks [23], it is also useful to employ reliance measurements with domain users for decision-making tasks. Existing work [24, 65] also highlights that running complex tasks with crowd workers may not be entirely representative and therefore may not report actual measurement of appropriate reliance on AI advice. Hence, further work is required to understand the assessment of metrics for tasks that are properly representative of decision-making scenarios explored in studies for comparative evaluations across contexts."},{"citing_arxiv_id":"2604.22662","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking XAI Evaluation: A Human-Centered Audit of Shapley Benchmarks in High-Stakes Settings","primary_cat":"cs.LG","submitted_at":"2026-04-24T15:38:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"In high-stakes settings, Shapley explanations increase analyst confidence but do not improve decision accuracy, and standard metrics fail to predict human utility.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18449","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Awareness to Intent: Mitigating Silent Driving System Failures through Prospective Situation Awareness Enhancing Interfaces","primary_cat":"cs.HC","submitted_at":"2026-04-20T16:04:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Prospective situation awareness enhancing interfaces delivered via AR HUD improve takeover performance after silent automation failures, with perceptual cues most effective at raising situational awareness and system-intent messages best at building trust.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18256","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Domain-Specialized Object Detection via Model-Level Mixtures of Experts","primary_cat":"cs.CV","submitted_at":"2026-04-20T13:30:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Model-level MoE of domain-specialized YOLO detectors with gating network outperforms standard ensembles on BDD100K while revealing expert specialization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14240","ref_index":147,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Interpretable and Explainable Surrogate Modeling for Simulations: A State-of-the-Art Survey and Perspectives on Explainable AI for Decision-Making","primary_cat":"cs.AI","submitted_at":"2026-04-15T03:25:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"This survey synthesizes XAI methods with surrogate modeling workflows for simulations and outlines a research agenda to embed explainability into simulation-driven design and decision-making.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"techniques from other disciplines (such as statis- tics and applied mathematics) also fall under the umbrella of explainability, particularly when they contribute to understanding model behavior or providing insights into input-output relation- ships. Another important distinction lies between model-specific and model-agnostic explainability methods [ 39, 146, 147]. As the names imply, model-specific methods are designed for particu- lar types of models. For instance, the uncertainty estimates provided by GP are inherent to its probabilistic structure [ 19]. In contrast, model- agnostic methods are more flexible and can be applied to a wide range of ML models, regardless of their internal architecture [ 148]."},{"citing_arxiv_id":"2604.13252","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Out of Context: Reliability in Multimodal Anomaly Detection Requires Contextual Inference","primary_cat":"cs.LG","submitted_at":"2026-04-14T19:32:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Multimodal anomaly detection must be reframed as cross-modal contextual inference that separates context from observations to define abnormality conditionally.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11467","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Attribution to Action: A Human-Centered Application of Activation Steering","primary_cat":"cs.AI","submitted_at":"2026-04-13T13:41:57+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10658","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Governed Reasoning for Institutional AI","primary_cat":"cs.AI","submitted_at":"2026-04-12T14:09:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Cognitive Core uses nine typed cognitive primitives, a four-tier governance model with human review as an execution condition, and an endogenous audit ledger to reach 91% accuracy with zero silent errors on prior authorization appeals, outperforming ReAct and Plan-and-Solve baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09799","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Explainable Human Activity Recognition: A Unified Review of Concepts and Mechanisms","primary_cat":"cs.LG","submitted_at":"2026-04-10T18:20:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper delivers a mechanism-centric taxonomy and unified perspective on explainable human activity recognition methods across sensing modalities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07392","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Event-Centric World Modeling with Memory-Augmented Retrieval for Embodied Decision-Making","primary_cat":"cs.LG","submitted_at":"2026-04-08T06:14:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"An event-centric framework encodes environments as semantic events and retrieves weighted prior maneuvers from a knowledge bank to enable interpretable, physics-aware decision-making for UAVs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04671","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Design Guidelines for Game-Based Refresher Training of Community Health Workers in Low-Resource Contexts","primary_cat":"cs.HC","submitted_at":"2026-04-06T13:30:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A four-year mixed-methods study of game-based systems for Indian CHWs yields eight design guidelines for sustained engagement, learning transfer, and contextual appropriateness in low-resource health training.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"signed for students or the general population. Our findings high- light professional identity as a design dimension alongside usability and engagement. 4.4 Explainability and Trust in Decision Support Participants' demand for explanations rather than binary correct- ness. This aligns with emerging work on explainable AI and trans- parent decision support systems in healthcare [9]. Even in non-AI game-based systems, CHWs sought to understand why an answer was correct to justify their decisions to families and supervisors. Explainability supported reflective learning and reinforced their confidence. This finding suggests that game-based training systems should be designed as reflective learning environments rather than automated"},{"citing_arxiv_id":"2604.09622","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Explainability and Certification of AI-Generated Educational Assessments","primary_cat":"cs.CY","submitted_at":"2026-03-18T11:33:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A framework using self-rationalization, attribution analysis, and a certification metadata schema with traffic-light workflow enables transparent, audit-ready AI-generated educational assessments aligned to Bloom's and SOLO taxonomies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.15250","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"In-Context Symbolic Regression for Robustness-Improved Kolmogorov-Arnold Networks","primary_cat":"cs.LG","submitted_at":"2026-03-16T13:21:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In-context symbolic regression methods improve robustness of symbolic formula recovery from KANs, cutting median OFAT test MSE by up to 99.8 percent across hyperparameter sweeps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.12748","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"X-SYS: A Reference Architecture for Interactive Explanation Systems","primary_cat":"cs.AI","submitted_at":"2026-02-13T09:24:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"X-SYS is a reference architecture for interactive explanation systems organized around STAR quality attributes and five service components, demonstrated via SemanticLens for vision-language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.11897","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic AI for Cybersecurity: A Meta-Cognitive Architecture for Governable Autonomy","primary_cat":"cs.CR","submitted_at":"2026-02-12T12:52:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A meta-cognitive agentic framework coordinates specialized cybersecurity agents through a judgment mechanism to improve decision quality under uncertainty and noise on standard benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.18696","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Explainability Methods for Hardware Trojan Detection: A Systematic Comparison","primary_cat":"cs.LG","submitted_at":"2026-01-26T17:13:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Compares domain-aware, case-based, and feature attribution explainability methods for gate-level hardware Trojan detection on the Trust-Hub benchmark dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16304","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Results-Actionability Gap: Understanding How Practitioners Evaluate LLM Products in the Wild","primary_cat":"cs.SE","submitted_at":"2026-01-25T10:36:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Qualitative study of 19 practitioners reveals ten LLM product evaluation practices and introduces the results-actionability gap as a key barrier to turning findings into improvements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.12109","ref_index":100,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Neuro-Symbolic Framework for Accountability in Public-Sector AI","primary_cat":"cs.CY","submitted_at":"2025-12-13T00:53:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A framework combining legal ontology, rule extraction, and solver reasoning verifies whether AI explanations for CalFresh eligibility align with statutory constraints.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.00164","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Faster Verified Explanations for Neural Networks","primary_cat":"cs.LG","submitted_at":"2025-11-28T19:05:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FaVeX accelerates verified explanations for neural networks via dynamic batch-sequential processing and query reuse while introducing verifier-optimal robust explanations that incorporate verifier incompleteness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.10161","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DenoGrad: A Gradient-Based Framework for Data Refinement in Tabular and Time-Series Learning","primary_cat":"cs.AI","submitted_at":"2025-11-13T10:16:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DenoGrad refines noisy tabular and time-series data by optimizing inputs via gradients from a fixed model, yielding better downstream predictions on ten real-world datasets while preserving data statistics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.04903","ref_index":158,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficacy Analysis in Clinical Trials: A Comprehensive Review of Statistical and Machine Learning Approaches","primary_cat":"stat.OT","submitted_at":"2025-11-07T01:05:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":0.0,"formal_verification":"none","one_line_summary":"A review summarizing parametric, nonparametric, Bayesian, and machine learning methods for efficacy analysis in clinical trials and identifying gaps such as high-dimensional data and missingness.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"One key direction involves developing robust and computationally efficient models for high-dimensional longitudinal data that integrate multimodal inputs such as imaging, genomics, and digital health records while preserving statistical validity [157, 4]. Another important challenge is balancing predictive accuracy with causal interpretability, as machine learning and deep learning frameworks become central to trial analytics [158, 159]. Integrative approaches that couple traditional inferential rigor with scalable representation learning, such as hybrid Bayesian-machine learning models, offer promising pathways toward interpretable yet flexible inference. The rapid rise of virtual and decentralized clinical trials (VCTs), enabled by telehealth, wearables, and remote monitoring, introduces new"},{"citing_arxiv_id":"2511.01680","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Making Interpretable Discoveries from Unstructured Data: A High-Dimensional Multiple Hypothesis Testing Approach","primary_cat":"econ.EM","submitted_at":"2025-11-03T15:42:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A new framework combines AI-derived concept embeddings with high-dimensional selective inference to enable statistically principled, interpretable discovery from unstructured data in empirical economics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.01411","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Extremal Contours: Gradient-driven contours for compact visual attribution","primary_cat":"cs.CV","submitted_at":"2025-11-03T10:02:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A training-free method using Fourier-parameterized star-convex contours optimized via gradients to generate compact, faithful visual attributions for image classifiers on benchmarks like ImageNet.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.14528","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Johnny Can't Use Agents: Industry Aspirations vs. User Realities with AI Agents","primary_cat":"cs.HC","submitted_at":"2025-09-18T01:51:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Industry markets AI agents for orchestration, creation, and insight, but a usability study with 31 participants reveals users face challenges from capability misalignment and lack of meta-cognition in tools like Operator and Manus.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.07674","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Temporal Counterfactual Explanations of Behaviour Tree Decisions","primary_cat":"cs.RO","submitted_at":"2025-09-09T12:40:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A method automatically constructs a causal model from behavior tree structure and domain knowledge to generate real-time causal counterfactual explanations for robot decisions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.03738","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mechanistic Interpretability with Sparse Autoencoder Neural Operators","primary_cat":"cs.LG","submitted_at":"2025-09-03T21:57:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SAE-NOs extend sparse autoencoders to function spaces via Fourier neural operators with concept and domain sparsity, learning localized patterns more efficiently and generalizing across discretizations on vision data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}