{"total":16,"items":[{"citing_arxiv_id":"2606.23840","ref_index":63,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Embodied Explainability and Ontological Obstacles: Why We Struggle to Explain the Answers of Large Language Models (LLMs)","primary_cat":"cs.HC","submitted_at":"2026-06-22T18:22:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An argument paper reframes LLM explainability as an embodied, situated practice based on Dourish and enactivist cognition, identifying ontological obstacles in internal explanations and advocating affordance-based designs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20369","ref_index":38,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CATCH-ME if you RAG: a dataset of Contextually Annotated multi-Turn Counterspeech against Hate and Misinformation Exchanges","primary_cat":"cs.CL","submitted_at":"2026-06-18T15:32:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Presents a new expert-curated dataset of multi-turn counterspeech dialogues in five languages targeting hate against seven groups, with span annotations linking to verified external knowledge for RAG applications.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19815","ref_index":10,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Clusters are All You Need: Pre-Training the Tsetlin Machine with Semantic Clusters from Language Models for Interpretability","primary_cat":"cs.CL","submitted_at":"2026-06-18T05:43:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A clustering-based pre-training step transfers semantic knowledge from language models into Tsetlin Machines, yielding competitive accuracy with BERT while preserving clause-level interpretability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18989","ref_index":23,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"G-IdiomAlign: A Gloss-Pivoted Benchmark for Cross-Lingual Idiom Alignment","primary_cat":"cs.CL","submitted_at":"2026-06-17T12:09:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"G-IdiomAlign is a gloss-pivoted benchmark with multiple-choice and generation protocols for evaluating cross-lingual idiom alignment in LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12689","ref_index":50,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Observable Patterns Are Not Explanations: A Causal-Geometric Analysis of Latent Reasoning Models","primary_cat":"cs.CL","submitted_at":"2026-06-10T21:23:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Evaluation of two latent reasoning models against controls shows observable latent patterns appear without the proposed mechanisms, have graded causal effects on behavior, and concentrate in structured low-rank directions, arguing that patterns are insufficient evidence for reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11445","ref_index":162,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Forecasting Future Behavior as a Learning Task","primary_cat":"cs.AI","submitted_at":"2026-06-09T20:56:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Behavior Forecasters trained on LRM trajectories outperform larger models in predicting repeatability and input sensitivity at low cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10627","ref_index":25,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Profy: Interpretable Visualization of Expertise-Dependent Motor Skills Toward Supporting Piano Practice","primary_cat":"cs.HC","submitted_at":"2026-06-09T09:28:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Profy uses take-level expert-amateur labels on 1083 piano recordings to produce time-aligned highlight scores that correlate with expert review points (r=0.61) on held-out amateur clips.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05875","ref_index":25,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"QCFuse: Query-Aware Cache Fusion via Compressed View for Efficient RAG Serving","primary_cat":"cs.AI","submitted_at":"2026-06-04T08:47:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QCFuse achieves full-prefill quality in RAG with 1.7x average prefill speedup over full prefill and 1.5x over ProphetKV via compressed query-aware cache fusion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22435","ref_index":26,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Assisted Counterspeech Writing at the Crossroads of Hate Speech and Misinformation","primary_cat":"cs.CL","submitted_at":"2026-05-21T13:02:08+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs generate adequate counterspeech for co-occurring hate and misinformation in 40% of cases, with a mixed knowledge strategy from fact-checkers and NGOs proving most effective after expert revision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08302","ref_index":22,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SGC-RML: A reliable and interpretable longitudinal assessment for PD in real-world DNS","primary_cat":"cs.LG","submitted_at":"2026-05-08T12:10:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SGC-RML creates an 8D symptom atlas from multimodal PD data and integrates conformal calibration to deliver reliable, rejectable longitudinal assessments.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"A unified approach to interpreting model predictions. InAdvances in Neural Information Processing Systems, volume 30, 2017. [21] S. Jain and B. C. Wallace. Attention is not Explanation. InProceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 3543-3556, 2019. doi: 10.18653/v1/N19-1357. [22] S. Wiegreffe and Y . Pinter. Attention is not not Explanation. InProceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pages 11-20, 2019. doi: 10.18653/v1/D19- 1002. [23] P. E. Shrout and J. L. Fleiss. Intraclass correlations: Uses in assessing rater reliability."},{"citing_arxiv_id":"2605.07509","ref_index":21,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MASPrism: Lightweight Failure Attribution for Multi-Agent Systems Using Prefill-Stage Signals","primary_cat":"cs.SE","submitted_at":"2026-05-08T09:40:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MASPrism attributes failures in multi-agent systems by ranking candidates from prefill-stage NLL and attention signals of a 0.6B SLM, beating baselines by up to 33.41% Top-1 accuracy and proprietary LLMs by up to 89.5% relative improvement while processing traces in 2.66 seconds.","context_count":1,"top_context_role":"method","top_context_polarity":"background","context_text":"toms to earlier candidate sources, without relying on costly agent workflows, execution replay, or trained tracers during attribution. 6.3 Software Fault Localization and Trace Diagnosis A common paradigm in software debugging and fault localization is to rank suspicious entities based on failure evidence. A prominent example is traditional fault localization, where spectrum-based tech- niques such as Tarantula [21] and Ochiai [1] rank code statements or branches by analyzing execution patterns from test cases. This ranking perspective extends to trace-level diagnosis in complex software systems, where root cause analysis is performed on struc- tured execution traces. For instance, RepTrace [32] analyzes system call traces through causality analysis, while TraceContrast [46] uses"},{"citing_arxiv_id":"2605.02142","ref_index":67,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ORBIT: Learning Gene Program Co-Activation Structure for Cell-Type-Stratified Pathway Rewiring Analysis in Single-Cell Transcriptomics","primary_cat":"q-bio.GN","submitted_at":"2026-05-04T01:50:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ORBIT uses an intervention-consistent self-supervised objective in a transformer to infer asymmetric gene program influences from observational scRNA-seq data, recovering Alzheimer's vulnerability patterns and achieving 0.984 macro F1 cell-type classification from 220 pathway scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05834","ref_index":36,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hidden in the Multiplicative Interaction: Uncovering Fragility in Multimodal Contrastive Learning","primary_cat":"cs.LG","submitted_at":"2026-04-07T13:03:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Multimodal contrastive learning using multilinear products is fragile to single bad modalities, and a gated version improves top-1 retrieval accuracy on synthetic and real trimodal data.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Association for Computational Linguistics. doi: 10.18653/v1/N19-1357. URL https:// aclanthology.org/N19-1357/. [35] Ziyu Jiang, Guoqing Zheng, Yu Cheng, Ahmed Hassan Awadallah, and Zhangyang Wang. Cr-moe: Consistent routed mixture-of-experts for scaling contrastive learning.Transactions on Machine Learning Research, 2024. ISSN 2835-8856. URL https://openreview.net/ forum?id=qKIvn9xL1R. [36] Michael I. Jordan and Robert A. Jacobs. Hierarchical mixtures of experts and the em algorithm. Neural Computation, 6(2):181-214, 1994. doi: 10.1162/neco.1994.6.2.181. [37] Arun Jose. Reasoning models sometimes output illegible chains of thought. InThe Thirty- ninth Annual Conference on Neural Information Processing Systems, 2025. URL https: //openreview."},{"citing_arxiv_id":"2604.16410","ref_index":6,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Matched-Learning-Rate Analysis of Attention Drift and Transfer Retention in Fine-Tuned CLIP","primary_cat":"cs.LG","submitted_at":"2026-04-01T06:35:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Matched learning-rate experiments show LoRA retains substantially higher zero-shot transfer (45% vs 11% on EuroSAT, 58% vs 9% on Pets) than Full FT in CLIP adaptation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.00593","ref_index":70,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Interpretability in the Wild: a Circuit for Indirect Object Identification in GPT-2 small","primary_cat":"cs.LG","submitted_at":"2022-11-01T17:08:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"GPT-2 small solves indirect object identification via a circuit of 26 attention heads organized into seven functional classes discovered through causal interventions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2112.04426","ref_index":118,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Improving language models by retrieving from trillions of tokens","primary_cat":"cs.CL","submitted_at":"2021-12-08T17:32:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RETRO matches GPT-3 and Jurassic-1 performance on the Pile benchmark using 25 times fewer parameters by conditioning on retrieved chunks from a 2-trillion-token database.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}