{"total":145,"items":[{"citing_arxiv_id":"2606.01207","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Feature Alignment Determines Fusion Strategy: A Comparative Study of Cross-Attention and Concatenation in Multimodal Learning","primary_cat":"cs.CV","submitted_at":"2026-05-31T12:55:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Feature alignment quality determines whether concatenation or cross-attention excels for multimodal fusion, with concatenation winning on pre-aligned features due to lower sample complexity O(dv+dt) versus O(dv*dt).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00746","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scaling Parallel Sequence Models to Foundation-Scale Vision Encoders","primary_cat":"cs.CV","submitted_at":"2026-05-30T14:29:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"C-GSPN scales 2D spatial propagation to foundation vision encoders via a fast CUDA kernel, compressed blocks, and two-stage distillation, matching ViT performance with 15% fewer parameters and 4x block speedup at 2K resolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28384","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Meta-Attention: Bayesian Per-Token Routing for Efficient Transformer Inference","primary_cat":"cs.LG","submitted_at":"2026-05-27T12:21:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Meta-Attention introduces per-token Bayesian routing among attention mechanisms via amortised variational inference with a Dirichlet prior, yielding lower projected FLOP cost than prior-free routing on a Tiny LM benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05208","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Transformer-Enhanced Reinforcement Learning: Fundamentals and Applications in Communication Networks","primary_cat":"eess.SP","submitted_at":"2026-05-26T06:44:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":1.0,"formal_verification":"none","one_line_summary":"A survey of Transformer-enhanced reinforcement learning fundamentals and applications in communication networks covering resource allocation, computation offloading, routing, trajectory control, and security.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23751","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Approaching I/O-optimality for Approximate Attention","primary_cat":"cs.LG","submitted_at":"2026-05-22T15:23:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents I/O-efficient algorithms for approximate attention with almost-linear cost in n, approaching lower bounds in most parameter regimes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22476","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Structured-Sparse Attention for Entity Tracking with Subquadratic Sequence Complexity","primary_cat":"cs.LG","submitted_at":"2026-05-21T13:35:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Derives a blockwise resolvent-style attention operator that exploits structured sparsity for subquadratic O(n^{4/3}d) entity tracking while matching dense accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21081","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Musical Attention Transformer: Music Generation Using a Music-Specific Attention Model","primary_cat":"cs.SD","submitted_at":"2026-05-20T12:16:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper introduces Musical Attention, an attention variant that incorporates eight musical features including metadata to generate more coherent and varied music than standard or strided attention baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21042","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Video Generation: Shaping Video Generation Across Time and Space","primary_cat":"cs.CV","submitted_at":"2026-05-20T11:24:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DVG dynamically selects content-aware spatio-temporal acceleration strategies for diffusion-based video generation, delivering up to 7x speedup with near-lossless quality on models like HunyuanVideo.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20839","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Activation-Free Backbones for Image Recognition: Polynomial Alternatives within MetaFormer-Style Vision Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T07:29:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Polynomial replacements for activations in MLPs, convolutions, and attention within MetaFormer yield PolyNeXt models that match or exceed standard performance on ImageNet, ADE20K, and robustness benchmarks while beating prior polynomial networks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20813","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PulseCol: Periodically Refreshed Column-Sparse Attention for Accelerating Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-20T07:06:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PulseCol introduces periodically refreshed column-sparse attention to achieve up to 1.95x speedup over FlashAttention in diffusion LLMs with maintained model quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20724","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CALMem : Application-Layer Dual Memory for Conversational AI","primary_cat":"cs.IR","submitted_at":"2026-05-20T05:23:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CALMem delivers virtually unbounded effective context for LLM conversations via an application-layer dual memory architecture with intra-session retrieval and token-adaptive injection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17270","ref_index":91,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond Detection: A Structure-Aware Framework for Scene Text Tracking","primary_cat":"cs.CV","submitted_at":"2026-05-17T05:40:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SymTrack is the first systematic detection-free framework for scene text tracking that constructs benchmarks from video text spotting datasets and reports up to 11.97% AUC gains over prior trackers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15413","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Transformer Scalability Crisis: The First Comprehensive Empirical Analysis of Performance Walls in Modern Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-14T20:57:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Empirical tests on 118 transformers show success falling from 88.1% at 512 tokens to 0% at 2048 tokens, with compressed models achieving 649.2 tokens/sec/M parameters versus 12.5 for large generative ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15305","ref_index":90,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WorldParticle: Unified World Simulation of Lagrangian Particle Dynamics via Transformer","primary_cat":"cs.GR","submitted_at":"2026-05-14T18:18:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A transformer with prediction-correction and hierarchical super-token merging unifies simulation of six physical dynamics categories on Lagrangian particles and generalizes to unseen conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14589","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EndPrompt: Efficient Long-Context Extension via Terminal Anchoring","primary_cat":"cs.CL","submitted_at":"2026-05-14T09:00:03+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13833","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QLAM: A Quantum Long-Attention Memory Approach to Long-Sequence Token Modeling","primary_cat":"cs.LG","submitted_at":"2026-05-13T17:56:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QLAM extends state-space models with quantum superposition in the hidden state for linear-time long-sequence modeling and reports consistent gains over RNN and transformer baselines on sequential image tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13784","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attention Once Is All You Need: Efficient Streaming Inference with Stateful Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-13T17:06:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Stateful sessions with incremental KV cache and flash queries allow O(|q|) latency in streaming transformer inference, delivering up to 5.9x speedup over conventional engines while preserving full attention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13370","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Phasor Memory Networks: Stable Backpropagation Through Time for Scalable Explicit Memory","primary_cat":"cs.LG","submitted_at":"2026-05-13T11:28:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PMNet uses unitary phasor dynamics and hierarchical anchors to make explicit memory stable for long sequences, matching a 3x larger Mamba model on long-context robustness with a 119M parameter network.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12471","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KV-Fold: One-Step KV-Cache Recurrence for Long-Context Inference","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:53:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KV-Fold turns frozen transformers into stable long-context models by folding the KV cache across sequence chunks in repeated forward passes.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Production systems such as vLLM [ 7] chunk long inputs to manage memory during prefill, optimizing for throughput rather than characterizing cross-chunk recurrence behavior. ReAttention [27] similarly operates at inference time but caps attention to a finite scope. Sparse attention and kernel optimizations.Sparse attention patterns (Longformer [ 29], Big- Bird [30], Sparse Transformers [31]) and exact kernel optimizations (FlashAttention [4, 5], RingAt- tention [6]) reduce per-step cost without changing how information propagates across long sequences. They are orthogonal to KV-Fold and could compose with it. Linear-attention alternatives.Linear Attention [ 32] and Performers [33] compress past context into a fixed-size state, sacrificing content-based addressability for asymptotic efficiency - the opposite"},{"citing_arxiv_id":"2605.12464","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Search Your Block Floating Point Scales!","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:50:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ScaleSearch optimizes block floating point scales via fine-grained search to cut quantization error by 27% for NVFP4, improving PTQ by up to 15 points on MATH500 for Qwen3-8B and attention PPL by 0.77 on Llama 3.1 70B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12193","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BFLA: Block-Filtered Long-Context Attention Mechanism","primary_cat":"eess.SP","submitted_at":"2026-05-12T14:36:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"BFLA is a two-stage block-filtered sparse prefill attention mechanism that constructs an input-dependent block mask and applies tile-level rescues to skip unimportant KV tiles while preserving exact attention inside retained tiles, delivering speedups on models like Llama 3.1 with minimal accuracy 0","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11274","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"End-to-End Population Inference from Gravitational-Wave Strain using Transformers","primary_cat":"gr-qc","submitted_at":"2026-05-11T21:54:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Dingo-Pop uses a transformer to perform amortized, end-to-end population inference from GW strain data in seconds, bypassing per-event Monte Carlo sampling.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Katharopoulos, A. Vyas, N. Pappas, and F. Fleuret, Transformers are rnns: Fast autoregressive transform- ers with linear attention, inInternational conference on machine learning(PMLR, 2020) pp. 5156-5165. [47] N. Kitaev, L. Kaiser, and A. Levskaya, Reformer: The ef- ficient transformer, inInternational Conference on Learn- ing Representations(2020). [48] R. Child, S. Gray, A. Radford, and I. Sutskever, Gen- erating long sequences with sparse transformers (2019), arXiv:1904.10509 [cs.LG]. [49] I. Beltagy, M. E. Peters, and A. Cohan, Long- former: The long-document transformer, arXiv preprint arXiv:2004.05150 (2020). [50] A. Wehenkel, M. Kagan, L. Heinrich, and C. Pollard, It just takes two: Scaling amortized inference to large sets"},{"citing_arxiv_id":"2605.10875","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Compute Where it Counts: Self Optimizing Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:27:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOL trains a policy to dynamically control multiple efficiency mechanisms per token via group-relative policy optimization on teacher-forced episodes, yielding better quality at matched average budget than static or random allocation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Compute Where it Counts: Self Optimizing Language Models B. Model Configurations We primarily train and evaluate six main models, whose configurations are described below. B.1. Search Space Size Ablation Models Shared configuration (used unless overridden) Base LLM:meta-llama/Llama-3.2-1B Token Sparsity Implementation:Quest (page size= 4) Budget Ranges:Token budget[0.1,1.0]; pruning budget[0.4,1.0]; quantization ratio budget[0.3125,1.0] Budget Penalty Weights:α κ = 100;α ρ = 100;α η = 200 GRPO group size / horizon:T=16decode steps/episode;K=16schedules/input; entropy coef= 0.05 Optimization:Batch size= 8; grad accumulation= 8; lr= 10 −4; max grad norm= 2.0; epochs= 1 Context Length:1024 Policy Network (Controller):Transformer: dmodel = 512; nheads = 4; nlayers = 1; MLP ratio = 4."},{"citing_arxiv_id":"2605.09932","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FocuSFT: Bilevel Optimization for Dilution-Aware Long-Context Fine-Tuning","primary_cat":"cs.CL","submitted_at":"2026-05-11T03:30:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FocuSFT uses an inner optimization loop to adapt fast-weight parameters into a parametric memory that sharpens attention on relevant content, then conditions outer-loop supervised fine-tuning on this representation, yielding gains on long-context benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"beginning and end of the context [21, 16], and attention sinks consume a large share of the budget on a handful of initial tokens [36]. We refer to the resulting starvation of content tokens asattention dilution(formalized in Section 2.1). Existing remedies overwhelmingly target inference (positional calibration [16], dynamic scaling [39], test-time training [4, 32]) or require pretraining from scratch [8, 38]. What remains largely unexplored is whether the fine-tuning procedure itself contributes to this gap. We present evidence that it does: during standard SFT on a long sequence, the same biases and sink patterns govern the forward pass that produces the training loss. The gradient signal is computed from representations where most attention goes to positionally privileged tokens rather than content."},{"citing_arxiv_id":"2605.08966","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VORT: Adaptive Power-Law Memory for NLP Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-09T14:20:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VORT assigns learnable fractional orders to tokens and approximates their power-law retention kernels via sum-of-exponentials for efficient long-range dependency modeling in transformers.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"LG] 9 May 2026 1 Introduction The Transformer [43] underlies modern large language models [2, 6, 9, 42], vision systems [35], and scientific applications [21]. Its core operation is Attn(Q,K,V) = softmax ( QK⊤ √dk ) V, Q,K∈R n×dk, V∈Rn×dv,(1) whosequadraticcostinsequencelength nhasdrivenalargebodyofefficientapproximations including sparse attention [7], local-window methods [3, 49], low-rank factorisation [44], and kernel-feature approximations [8, 22]. A statistical mismatch underlies these computational challenges. Mutual information between tokens at lagℓdecays as a power lawI(ℓ)∼cℓ−γwith γ∈(0, 1)[ 1, 10]-a hallmark of long-range dependence [4, 14, 19]. For ARFIMA(0,d, 0)processes, whose spectral density satisfiesS(λ)∼c0|λ|−2dnear λ= 0, the optimal linear predictor assigns"},{"citing_arxiv_id":"2605.08505","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scaling Limits of Long-Context Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-08T21:39:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"For uniform keys on the d-dimensional sphere, softmax attention becomes selective at inverse temperature scaling β_n* ≍ n^{2/(d-1)}, with explicit limiting laws for attention weights and outputs in each regime.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07959","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Convergent Stochastic Training of Attention and Understanding LoRA","primary_cat":"cs.LG","submitted_at":"2026-05-08T16:22:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Attention and LoRA regression losses induce Poincaré inequalities under mild regularization, so SGD-mimicking SDEs converge to minimizers with no assumptions on data or model size.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"parameters are set to their Phase 1 values. ThenWWW Q andWWW K arere-initialisedidentically across all three runs to their initial values used in Phase 1. Three objectives are then compared over 100 additional epochs, (1)Unregularised:(i.e.,nonein plots) L0= ˆR∗(TTT),(6.3) (2)Log-amplified norm penalty:(i.e.,login plots) Llog= ˆR∗(TTT)+ λ 2 Slog(1+S),λ>0,S=∥W WW Q∥2 F+∥WWW K∥2 F .(6.4) (3)Super-quadratic norm penalty:(i.e.,powerin plots) L2+ε = ˆR∗(TTT)+ λ 2(∥WWW Q∥2+ε F +∥WWW K∥2+ε F ) ,λ,ε>0,(6.5) whereTTT=(WWW Q,WWW K)and ˆR∗(TTT)denotes the MSE loss with embedding layers,WWWV and output MLP frozen at their optimal values. In all Phase 2 runs we use Adam withη=10 −3, batch size 32 (as in Phase-1), and hyper-parametersε=10 −6, λ=10 −5forlogandλ=10 −4forpower."},{"citing_arxiv_id":"2605.07363","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MISA: Mixture of Indexer Sparse Attention for Long-Context LLM Inference","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:19:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MISA routes to a small subset of indexer heads via block statistics, matching full DSA performance on LongBench with 4-8x fewer heads and 3.82x speedup while recovering over 92% of selected tokens.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Together, these results show that head-level routing is a practical efficiency axis for fine-grained sparse attention, on top of any existing token-level scheme. 2 Related work Sparse attention.A long line of work attacks the quadratic cost of attention on long contexts by selecting a subset of past tokens for each query.Static-patternmethods such as Sparse Transformer [5], Longformer [3], and BigBird [27] use predefined window, stride, and global tokens that are decoupled from the actual content.Cache-evictionmethods drop tokens at decode time using attention-statistics heuristics: StreamingLLM [23] keeps a few attention sinks plus a recent window, H2O [30] retains heavy hitters in past attention, and SnapKV [ 16] clusters and compresses the KV cache."},{"citing_arxiv_id":"2605.05549","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Novel Graph-Regulated Disentangling Mamba Model with Sparse Tokens for Enhanced Tree Species Classification from MODIS Time Series","primary_cat":"cs.CV","submitted_at":"2026-05-07T00:53:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A graph-regulated disentangling Mamba model with sparse tokens achieves 93.94% accuracy classifying tree species from MODIS time series in Alberta and outperforms twelve prior models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05066","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Impossibility Triangle of Long-Context Modeling","primary_cat":"cs.CL","submitted_at":"2026-05-06T16:01:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"No model can achieve efficiency, compactness, and recall capacity scaling with sequence length at once, as any two imply a strict bound of O(poly(d)/log V) on recallable facts.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"satisfies all three of the following: (i) EfficiencyE: Cost(δ(s t−1, xt))≤p(d)for a polynomialpindependent ofT; (ii) CompactnessC:|s t|bits ≤q(d)for a polynomialqindependent ofT; (iii) Strong RecallR:there existγ >0andε∈(0,1−1/V)such thatR(1−ε, γT)holds for all sufficiently largeT. Moreover, anyPsatisfying (i) and (ii) can recall at most n∗ ≤ q(d) (1−ε) log 2 V−1 (6) key-value pairs at accuracy1−ε. Sinceq(d)is independent ofT, this impliesn ∗ = O(poly(d)/logV) =o(T)asT→ ∞, contradicting (iii). 4.2 Proof The proof has three steps. Step 1 bounds the information that the state can carry about the input. Step 2 bounds the information that successful recall demands. Step 3 combines the two bounds. We first recall two classical results."},{"citing_arxiv_id":"2605.04901","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the (In-)Security of the Shuffling Defense in the Transformer Secure Inference","primary_cat":"cs.CR","submitted_at":"2026-05-06T13:31:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An attack aligns differently shuffled intermediate activations from secure Transformer inference queries to recover model weights with low error using roughly one dollar of queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03644","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AdapShot: Adaptive Many-Shot In-Context Learning with Semantic-Aware KV Cache Reuse","primary_cat":"cs.AI","submitted_at":"2026-05-05T11:16:52+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02568","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"StreamIndex: Memory-Bounded Compressed Sparse Attention via Streaming Top-k","primary_cat":"cs.LG","submitted_at":"2026-05-04T13:19:29+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Chunked streaming top-k enables CSA indexer execution at 1M sequence length with 6.21 GB peak memory and >=0.998 recall on synthetic V4-shaped inputs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This empirical guarantee matches the practical use of the indexer, which feeds the index set to the attention kernel where order is irrelevant; promoting it to a derived corollary would require inlining a deterministic comparator in both paths, which neither the materialize reference nor our implementation does. Corollary (chunked partition-merge).For any partition[T legal] =P 1 ⊔P 2 ⊔ · · · ⊔P n, argtopk ≻g= topk ≻ \u0010 n[ i=1 argtopmin(k,|Pi|) ≻ g|Pi \u0011 . This is what enables chunked execution: process partitions independently, take per-partition top-k, merge. 5 The Chunked Indexer STREAMINDEXinstantiates the corollary of §4 as a Python driver over Triton primitives. Figure 1 shows the data flow. Fused score kernel ( indexer_score.py, 198 lines).A single autotuned Triton kernel computes scoreb,t,s =P h wb,t,h ·ReLU(q b,t,h,d ·K C b,s,d) on a [cS, cT ] tile. The tensor of head-wise scores is reduced to [cS, cT ] FP32 inside the kernel before any global memory write; we never materialize theH I-axis intermediate. This is the unit of work the chunked driver issues per tile. Chunked driver ( chunked_indexer.py, 151 lines).The driver is the body of Algorithm 2. It maintains a running top-k buffer per query, indexed by the outerS-tile."},{"citing_arxiv_id":"2605.02152","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpecEdit: Training-Free Acceleration for Diffusion based Image Editing via Semantic Locking","primary_cat":"cs.CV","submitted_at":"2026-05-04T02:30:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SpecEdit accelerates diffusion-based image editing up to 10x by using a low-resolution draft to identify edit-relevant tokens via semantic discrepancies for selective high-resolution denoising.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01910","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Stochastic Sparse Attention for Memory-Bound Inference","primary_cat":"cs.LG","submitted_at":"2026-05-03T14:44:14+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01711","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Linear-Time Global Visual Modeling without Explicit Attention","primary_cat":"cs.CV","submitted_at":"2026-05-03T04:51:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dynamic parameterization of standard layers can replace explicit attention for linear-time global visual modeling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00768","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Characterizing the Expressivity of Local Attention in Transformers","primary_cat":"cs.CL","submitted_at":"2026-05-01T16:30:52+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00061","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniBCI: Towards a Unified Pretrained Model for Invasive Brain-Computer Interfaces","primary_cat":"cs.NE","submitted_at":"2026-04-30T06:30:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniBCI is a unified pretrained model for invasive neural spike data that uses CST tokenization, IAA attention, and self-supervised masked reconstruction to achieve SOTA downstream performance with better generalization and efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27124","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Better Models, Faster Training: Sigmoid Attention for single-cell Foundation Models","primary_cat":"cs.LG","submitted_at":"2026-04-29T19:23:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Sigmoid attention replaces softmax in single-cell foundation models to deliver better representations, faster training, and stability, backed by bounded derivatives, diagonal Jacobian, and a new efficient GPU kernel.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26837","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unifying Sparse Attention with Hierarchical Memory for Scalable Long-Context LLM Serving","primary_cat":"cs.LG","submitted_at":"2026-04-29T16:02:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPIN co-designs sparse attention with hierarchical memory to achieve 1.66-5.66x higher throughput, 7-9x lower TTFT, and up to 58% lower TPOT than vLLM and original sparse implementations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"token. Consequently, serving is jointly constrained by the memory bandwidth to sustain linear KV accesses and GPU memory capacity to hold expanding KV caches. Dynamic sparse attention offers a promising algorithmic response. Prior work shows that, for a given query, only a small subset of historical tokens typically dominates the next-token prediction [15, 16, 18]. Recent sparse methods therefore preserve the full KV cache but select only the criti- cal subset at each decoding step[14, 34, 56, 71], reducing the amount of KV data that attention must process per step. The selective access pattern of sparse attention breaks the full-KV dependency assumed by prior dense-serving systems [25, 77] and makes hierarchical KV storage attractive."},{"citing_arxiv_id":"2604.24432","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Kwai Summary Attention Technical Report","primary_cat":"cs.CL","submitted_at":"2026-04-27T12:59:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kwai Summary Attention compresses historical contexts into learnable summary tokens to reduce sequence modeling cost to O(n/k) while preserving linear KV cache and long-range dependencies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24037","ref_index":98,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Limit Theory of Foundation Models: A Mathematical Approach to Understanding Emergent Intelligence and Scaling Laws","primary_cat":"cs.LG","submitted_at":"2026-04-27T04:43:42+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Transformer architecture (see Example 6) with 12 layers; (b) GPT-2 model [ 97] is stacked by revised Transformer architecture (see Example 7) with 48 layers; (c) GPT-3 model [5] use the same model and architecture as GPT-2, with the exception that we use alternating dense and locally banded sparse attention patterns in the layers of the transformer, similar to the Sparse Transformer [98]. fW ,D,Adenote the network architecture, training dataset and learning algorithm of the foundation models, respectively. Table 1 summarizes the main notations used in this paper. Figure 1 shows the model architecture of the GPT-family [ 5, 96, 97], which implies that a large-scale foundation model is always equipped with periodic structural func-"},{"citing_arxiv_id":"2604.22583","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adaptive Head Budgeting for Efficient Multi-Head Attention","primary_cat":"cs.LG","submitted_at":"2026-04-24T14:15:22+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22442","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HubRouter: A Pluggable Sub-Quadratic Routing Primitive for Hybrid Sequence Models","primary_cat":"cs.LG","submitted_at":"2026-04-24T10:59:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HubRouter is a sub-quadratic routing primitive using learned hubs that replaces attention layers in hybrid models while delivering competitive perplexity and large throughput gains.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"protocol differences; the comparison above is internally consistent but not directly comparable to literature. Distillation-based transplantation (e.g., blockwise local distillation, attention-derived initialization, gated layer-by-layer replacement) is a plausible route to closing this gap and is left to follow-up work.) 8 Related Work Efficient attention - sparsity and approximation.Sparse Transformers [10], Longformer [11], and BigBird [12] reduce attention's quadratic cost through fixed sparsity patterns. Performer [13] and linear attention [14] approximate the softmax kernel; Linformer [21] projects keys/values to a lower-rank subspace; Nystr¨ omformer [22] uses landmark points to approximate the full attention matrix. Reformer [23] uses LSH to select content-relevant tokens; Routing Transformer [24] uses online clustering for content-based attention."},{"citing_arxiv_id":"2604.21816","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tool Attention Is All You Need: Dynamic Tool Gating and Lazy Schema Loading for Eliminating the MCP/Tools Tax in Scalable Agentic Workflows","primary_cat":"cs.AI","submitted_at":"2026-04-23T16:10:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Tool Attention cuts tool-related tokens by 95% and raises context utilization from 24% to 91% in a 120-tool simulation via dynamic gating and lazy loading.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20789","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Working Memory Constraints Scaffold Learning in Transformers under Data Scarcity","primary_cat":"cs.CL","submitted_at":"2026-04-22T17:14:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Fixed-width and decay-based attention mechanisms inspired by working memory improve Transformer grammatical accuracy and human alignment under limited training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20595","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"An explicit operator explains end-to-end computation in the modern neural networks used for sequence and language modeling","primary_cat":"cs.NE","submitted_at":"2026-04-22T14:11:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"S4D state space models correspond exactly to wave propagation and nonlinear wave interactions in a one-dimensional ring oscillator network, with a closed-form operator describing the complete input-output map.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20920","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Simplified Sparse Attention via Gist Tokens","primary_cat":"cs.LG","submitted_at":"2026-04-22T04:22:32+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20915","ref_index":60,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Absorber LLM: Harnessing Causal Synchronization for Test-Time Training","primary_cat":"cs.LG","submitted_at":"2026-04-22T02:58:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Absorber LLM introduces causal synchronization to absorb context into parameters for memory-efficient long-context LLM inference while preserving causal effects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19351","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DASH-KV: Accelerating Long-Context LLM Inference via Asymmetric KV Cache Hashing","primary_cat":"cs.CL","submitted_at":"2026-04-21T11:33:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASH-KV accelerates long-context LLM inference to linear complexity via asymmetric KV cache hashing and mixed-precision retention, matching full attention performance on LongBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}