{"total":495,"items":[{"citing_arxiv_id":"2606.28057","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MultiHashFormer: Hash-based Generative Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-26T13:03:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiHashFormer enables hash-based autoregression in LMs by encoding tokens as multi-hash signatures, outperforming standard Transformers at 100M-3B scales while keeping parameter count constant for multilingual expansion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21255","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCOPE: Sequential Conformal Probing for Reliable OOD Rejection in LLM Services","primary_cat":"cs.CL","submitted_at":"2026-06-19T09:31:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCOPE selects readable hidden layers, constructs conformal gates with IND calibration, and uses supermartingale e-processes to certify persistent service-boundary evidence, improving rejection over final-layer detectors across multiple LLMs and boundary conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20369","ref_index":149,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CATCH-ME if you RAG: a dataset of Contextually Annotated multi-Turn Counterspeech against Hate and Misinformation Exchanges","primary_cat":"cs.CL","submitted_at":"2026-06-18T15:32:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Presents a new expert-curated dataset of multi-turn counterspeech dialogues in five languages targeting hate against seven groups, with span annotations linking to verified external knowledge for RAG applications.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18056","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ConSA: Controllable Sparsity in Hybrid Attention via Learnable Allocation","primary_cat":"cs.CL","submitted_at":"2026-06-16T15:33:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ConSA learns FA/SWA allocation via L0 masks and augmented Lagrangian constraints, outperforming rule-based baselines on 0.6B and 1.7B models with consistent layer patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09508","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Rigid to Dynamic: Entropy-Guided Adaptive Inference for Long-Context LLMs","primary_cat":"cs.AI","submitted_at":"2026-06-08T14:02:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EntropyInfer adaptively allocates inference compute using per-head attention entropy for rigid/dynamic classification during prefilling and compresses KV cache with generated tokens, achieving up to 2.39x speedup on long contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08867","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Building Customer Support AI Agents at 100M-User Scale: An Evaluation-Driven Framework","primary_cat":"cs.CL","submitted_at":"2026-06-07T22:44:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"An evaluation-driven framework for customer support AI agents at Nubank integrates context engineering, LLM judges, and A/B testing to deliver up to 37pp NPS gains and strong offline-online correlation across five production domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07995","ref_index":84,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Customer-Agent: Overcoming Context Limitations in Ultra-Long Shopping Trajectories via Tool-Augmented Agents and RLVR","primary_cat":"cs.CL","submitted_at":"2026-06-06T06:22:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces ShopTrajQA long-context benchmark and an RLVR-trained tool-augmented agent that bypasses LLM context limits by external file storage and code-based retrieval for shopping trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06160","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Where does Absolute Position come from in decoder-only Transformers?","primary_cat":"cs.AI","submitted_at":"2026-06-04T13:32:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Absolute position information leaks into RoPE decoder-only transformers through the causal mask's position-dependent softmax denominator and the residual stream's closed dynamics at position 0 read by sink heads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24895","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hybrid Metadata Extraction from League of Nations Index Cards: From Feasibility Study to Archival System Integration","primary_cat":"cs.DL","submitted_at":"2026-06-04T10:03:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A hybrid AI workflow combining fine-tuned vision-language models with specialized OCR extracts metadata from League of Nations index cards for improved archival access in the LONTAD project.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01400","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consistent and Distinctive: LLM Benchmark Efficiency via Maximum Independent Set Prompt Selection on Similarity Graphs","primary_cat":"cs.CL","submitted_at":"2026-05-31T18:45:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A graph-based MIS prompt selection method on embedding similarity graphs yields reduced benchmark subsets with highly consistent LLM rankings (Kendall's W ≥ 0.90 in 99.2% of cases) and 25-48% size reduction at higher thresholds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01202","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Shape of Wisdom: Decision Trajectories in Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-31T12:33:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A 9,000-trajectory study across three LLMs finds correctness and stability differ, with the largest group unstable-correct and attention scalars aligning better than MLPs in stable cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00756","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoMIC: Collaborative Memory and Insights Circulation for Long-Horizon LLM Agents in Cloud-Edge Systems","primary_cat":"cs.AI","submitted_at":"2026-05-30T14:45:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoMIC is a parameter-free cloud-edge framework that circulates memory and insights between edge agents and a central critic to improve long-horizon LLM agent performance on symbolic and text tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00686","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dialectics of Alignment: Harnessing Unsafe Knowledge for Dynamic Safety Routing","primary_cat":"cs.LG","submitted_at":"2026-05-30T11:49:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SafeMoE isolates unsafe knowledge in domain-specific LoRA experts and routes them via a lightweight gate trained on safe responses to produce safer and more informative LLM outputs with zero-shot generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00620","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FlowNar: Scalable Streaming Narration for Long-Form Videos","primary_cat":"cs.CV","submitted_at":"2026-05-30T08:51:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlowNar achieves bounded memory and 3x higher throughput for streaming narration on Ego4D, EgoExo4D, and EpicKitchens100 by combining dynamic historical context removal with a Cross Linear Attentive Memory module.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00570","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting Parameter-Based Knowledge Editing in Large Language Models: Theoretical Limits and Empirical Evidence","primary_cat":"cs.CL","submitted_at":"2026-05-30T06:44:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Parameter-based knowledge editing in LLMs induces reasoning collapse via dimensional collapse and is consistently outperformed by a retrieval baseline across varied edit counts, knowledge complexity, and evaluation metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00531","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"State Machine Guided Multi-Relational Synthetic Data from Logs for Anomaly Detection","primary_cat":"cs.MA","submitted_at":"2026-05-30T04:49:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A framework extracts a latent state machine from logs, induces a multi-table relational schema, and uses it as a generative prior to create synthetic data that augments real logs for better anomaly detection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.02628","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hallucination Is Linearly Decodable from Mid-Layer Hidden States in Quantized LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-30T02:04:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Linear probes on mid-layer hidden states in quantized LLMs detect hallucinations at 0.904-1.000 AUROC, exceeding sampling baselines and showing consistent layer bands across model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00467","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Limits of LLM Adaptability: Impact of Model-Internalized Priors on Annotation Task Performance","primary_cat":"cs.CL","submitted_at":"2026-05-30T01:21:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs correct only 34.8% of zero-shot annotation errors via prompting, and Definition-Specific Familiarity correlates positively with performance (partial r = +0.41) while memorization metrics do not.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00400","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Proxy-Mixing: Transferring Replay Controllers from Small to Large Models for Continual Instruction Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-29T22:32:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PROXYMIX learns a dynamic replay controller on a small proxy model and transfers it to a large target model, improving accuracy by 3.4 points and reducing forgetting by 3.5 points on LLaMA-3-8B continual tuning sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00359","ref_index":101,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Next-Billion AI Index: The compass for AI utility and adoption in the global majority","primary_cat":"cs.CY","submitted_at":"2026-05-29T21:01:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces nexbax, a diagnostic framework with three themes and 10 dimensions for evaluating AI economic viability, operational practicality, and societal integrity in next-billion-user contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31268","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mellum2 Technical Report","primary_cat":"cs.CL","submitted_at":"2026-05-29T13:01:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Mellum 2 is a 12B MoE model with 2.5B active parameters, trained on 10.6T tokens with MoE, GQA, SWA, and MTP, then post-trained into Instruct and Thinking variants, claimed competitive with 4B-14B models at 2.5B compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31100","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Vector Linking via Cross-Model Local Isometric Consistency","primary_cat":"cs.AI","submitted_at":"2026-05-29T10:12:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A reference-based geometric hashing method recovers cross-model vector correspondences by exploiting local isometric consistency in contrastive embeddings and iteratively bootstrapping from a seed of paired anchors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07604","ref_index":164,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Contribution Weights: A Geometrical Analysis of Self-Attention Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-29T09:40:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Contribution Weights combine attention, value magnitude, and directional alignment to measure token influence more faithfully than attention alone, and show attention sinks actively suppress information via a convex sink-rate to output-norm relationship.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30911","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Makes LVLMs Hallucinate Less? Unveiling the Architectural Factors Behind Hallucination Robustness","primary_cat":"cs.CV","submitted_at":"2026-05-29T06:47:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The study links three LVLM architectural dimensions to three hallucination types via a new benchmark, finding that language foundation quality reduces co-occurrence errors, visual encoder strength reduces similarity errors, alignment reduces uncertainty errors, and joint visual-alignment improvement","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11232","ref_index":104,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Every Act Has Its Price: Compressed Moral Composition in Frontier LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:36:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moral Trolley Arena shows frontier LLMs produce composite moral preferences that are compressed rather than additive functions of calibrated component act strengths across Moral Foundations Theory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30641","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"COFT: Counterfactual-Conformal Decoding for Fair Chain-of-Thought Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-28T22:52:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"COFT is a decoding technique that creates masked counterfactual prompts, fuses logits to attenuate bias, and applies dual-branch split-conformal calibration to certify fair token sets with marginal validity guarantees under exchangeability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30637","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EHRBench: An Automated and Reliable EHR-based Benchmark for Clinical Decision Making with LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-28T22:38:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EHRBench uses an EHR-LLM-KB pipeline to automatically create 960,067 reliable QA items spanning diagnosis, treatment, and prognosis for large-scale LLM evaluation in clinical decision making.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30571","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Memory-Bound but Not Bandwidth-Limited: The Physical AI Inference Gap in Batch-1 LLM Decode","primary_cat":"cs.AR","submitted_at":"2026-05-28T21:03:14+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Batch-1 autoregressive decode is memory-dominated yet launch overhead caps gains from higher-bandwidth GPUs, shown by measurements and CUDA Graphs ablation across four NVIDIA GPUs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30524","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Representation Collapse in Sequential Post-Training of Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-28T19:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sequential post-training of LLMs induces representation collapse that correlates with reduced plasticity, weaker generalization, and poorer calibration, with lightweight interventions tested to mitigate it.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29744","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Specialist Models Still Matter: A Heterogeneous Multi-Agent Paradigm for Medical Artificial Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-28T10:42:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"HetMedAgent is a heterogeneous multi-agent framework that fuses generalist LLMs and specialist models via conflict-aware fusion and uncertainty triggers, outperforming either alone on three clinical tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29675","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Prompts to Context: An Ontology-Driven Framework for Human-Generative AI Collaboration","primary_cat":"cs.HC","submitted_at":"2026-05-28T09:35:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Presents the CCAI ontology and SPARQL retrieval method to convert ephemeral Human-Generative AI prompt interactions into explicit, machine-readable collaboration traces, illustrated in a competency-profile software case study.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29629","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Attack Success Rate: Temporal Logit Observability for LLM Safety Failures","primary_cat":"cs.AI","submitted_at":"2026-05-28T09:02:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TLO is a logit-based diagnostic that visualizes temporal patterns of LLM jailbreak failures on a calibrated 2D plane, distinguishing attacks with identical ASR and enabling early stopping that reduces successful jailbreaks by more than half.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29625","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Improving Collaborative Storytelling with a Multi-Agent Framework Based on Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-28T08:59:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An iterative writer-editor multi-agent LLM process improves perceived story quality in simulations of child collaborative storytelling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28740","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reverse Probing: Supervised Token-level Uncertainty Quantification for Large Language Models in Clinical Text","primary_cat":"cs.CL","submitted_at":"2026-05-27T17:01:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Reverse Probing extracts token-level uncertainty from LLM internal activations on labeled clinical summaries, outperforming eight baselines with up to 4x higher AUPRC on two expert-annotated datasets while lowering compute costs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28639","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Attentional White Bear Effect in Transformer Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-27T15:45:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prohibited concepts remain recoverable from hidden states, influence attention routing, and shape generations in transformers under instruction-based suppression.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28348","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Semantic-Agnostic and Shape-Aware Vision-Language Segmentation Models","primary_cat":"cs.CV","submitted_at":"2026-05-27T11:51:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces SANSA paradigm for semantic-agnostic vision-language segmentation via dictionary or example-based prompts, with finetuning delivering up to 20% mIoU gains on the new task while retaining standard performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28277","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do LLMs Build World Models From Text? A Multilingual Diagnostic of Spatial Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-27T10:20:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MentalMap benchmark identifies a universal L3 reasoning cliff in LLMs' text-based spatial reasoning that persists across languages, scales, and prompting, and is replicated in human evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28264","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Entropy Distribution as a Fingerprint for Hallucinations in Generative Models","primary_cat":"cs.AI","submitted_at":"2026-05-27T10:12:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Token entropy distributions fingerprint hallucinations in generative models, enabling the Calibrated Entropy Score (CES) for single-pass black-box detection with calibration guarantees via a novel DKW inequality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28890","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Echoes within the Reasoning: Stealthy and Effective Watermarking via Chain of Thought","primary_cat":"cs.CR","submitted_at":"2026-05-27T07:44:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BiCoT embeds watermarks into the internal geometry of Chain-of-Thought reasoning traces in LLMs via private signature subspace alignment and introduces Robust Subspace Registration for black-box verification under attacks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23465","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Iy\\`aw\\'oBench: A Benchmark for Evaluating Large Language Model Clinical Triage Accuracy on Undifferentiated Febrile Illness in Nigerian Primary Health Settings","primary_cat":"cs.CY","submitted_at":"2026-05-22T10:25:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IyàwóBench is the first benchmark for LLM clinical triage accuracy on undifferentiated febrile illness using 200 synthetic vignettes from Nigerian PHCs, with results showing 100% safety but accuracy from 39% to 70.5%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23315","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Convergence Without Understanding: When Language Models Agree on Representations but Disagree on Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-22T07:32:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Representational convergence across 16 LLMs on 800 reasoning problems is stronger for failed tasks and pre-decision stages but shows minimal causal influence on predictions, pointing to shared processing constraints over shared reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23244","ref_index":86,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Convex Optimization for Alignment and Preference Learning on a Single GPU","primary_cat":"cs.LG","submitted_at":"2026-05-22T05:25:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"COALA applies convex optimization reformulations of neural networks to direct preference optimization, claiming single-GPU training with ~18% of DPO's TFLOPs and competitive performance on multiple datasets and models up to 8B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22435","ref_index":138,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Assisted Counterspeech Writing at the Crossroads of Hate Speech and Misinformation","primary_cat":"cs.CL","submitted_at":"2026-05-21T13:02:08+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs generate adequate counterspeech for co-occurring hate and misinformation in 40% of cases, with a mixed knowledge strategy from fact-checkers and NGOs proving most effective after expert revision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22403","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Translating Signals to Languages for sEMG-Based Activity Recognition","primary_cat":"cs.CV","submitted_at":"2026-05-21T12:31:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM-sEMG maps sEMG signals to language via a dedicated mechanism to enable LLMs to perform accurate activity recognition.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22170","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do Factual Recall Mechanisms Carry over from Text to Speech in Multimodal Language Models?","primary_cat":"cs.CL","submitted_at":"2026-05-21T08:41:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Causal mediation analysis on SpiritLM reveals discrepancies in factual recall between text-to-text and speech-to-text paths, indicating only partial carry-over of mechanisms from text to speech modality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21883","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Token-weighted Direct Preference Optimization with Attention","primary_cat":"cs.CL","submitted_at":"2026-05-21T01:43:09+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21427","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PALS: Power-Aware LLM Serving for Mixture-of-Experts Models","primary_cat":"cs.AI","submitted_at":"2026-05-20T17:19:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PALS adds dynamic GPU power capping to LLM serving frameworks like vLLM, jointly tuning it with batch size via offline models and feedback control to improve energy efficiency up to 26.3% and cut QoS violations 4-7x on dense and MoE models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21299","ref_index":79,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tracing the ongoing emergence of human-like reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T15:28:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLMs function as accurate semantic processors for conditionals but do not replicate the pragmatic inferences that define human reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20683","ref_index":20,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Layer-wise Token Compression for Efficient Document Reranking","primary_cat":"cs.IR","submitted_at":"2026-05-20T03:52:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Layer-wise Token Compression applies adaptive token pooling at middle transformer layers for cross-encoder rerankers, preserving MS MARCO ranking quality while raising QPS up to 25% on passages and 116% on documents, with added gains on listwise LLM rerankers and a regularizer effect for long inputs","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20537","ref_index":87,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Do Biomedical NER and Entity Linking Benchmarks Measure? A Corpus-Centric Diagnostic Framework","primary_cat":"cs.CL","submitted_at":"2026-05-19T22:19:22+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A corpus-centric framework diagnoses scale, structure, overlap, metadata, and terminology properties across nine biomedical NER/EL corpora, showing substantial differences that common statistics fail to capture.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}