{"total":55,"items":[{"citing_arxiv_id":"2606.09508","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Rigid to Dynamic: Entropy-Guided Adaptive Inference for Long-Context LLMs","primary_cat":"cs.AI","submitted_at":"2026-06-08T14:02:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EntropyInfer adaptively allocates inference compute using per-head attention entropy for rigid/dynamic classification during prefilling and compresses KV cache with generated tokens, achieving up to 2.39x speedup on long contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28148","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeltaMCP: Incremental Regeneration via Spec-Aware Transformation for MCP servers","primary_cat":"cs.SE","submitted_at":"2026-05-27T08:31:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DeltaMCP introduces specification-aware incremental regeneration to keep MCP servers synchronized with evolving OpenAPI specifications.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23764","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HyperParallel-MoE: Multi-Core Interleaved Scheduling for Fast MoE Training on Ascend NPUs","primary_cat":"cs.DC","submitted_at":"2026-05-22T15:35:23+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23200","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adaptive Mass-Segmented KV Compression for Long-Context Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-22T03:32:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AMS KV compression adaptively partitions the cache by attention mass regions and assigns quotas to protect contiguous reasoning blocks during long-context LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22416","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Asymmetric Virtual Memory Paging for Hybrid Mamba-Transformer Inference","primary_cat":"cs.LG","submitted_at":"2026-05-21T12:37:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AVMP separates KV and SSM cache pools behind unified virtual addressing with failure-triggered migration, cutting OOM events 7.6% and raising throughput 1.83-13.3x on synthetic loads and 2.36x on ShareGPT traces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21070","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Towards Understanding Self-Pretraining for Sequence Classification","primary_cat":"cs.LG","submitted_at":"2026-05-20T11:56:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Self-pretraining improves Transformer sequence classification by enabling learning of proximity-biased attention from positional encodings that label supervision alone cannot easily acquire from random starts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20706","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Llamas on the Web: Memory-Efficient, Performance-Portable, and Multi-Precision LLM Inference with WebGPU","primary_cat":"cs.DC","submitted_at":"2026-05-20T05:05:10+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LlamaWeb is a WebGPU backend for llama.cpp that uses static memory planning, tunable kernels, and templated multi-precision support to cut memory use by 29-33% and raise decode throughput by 45-69% versus prior browser frameworks on tested hardware.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22856","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PilotWiMAE: Pilot-Native Representation Learning for Wireless Channels","primary_cat":"eess.SP","submitted_at":"2026-05-19T06:21:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PilotWiMAE pretrains an encoder on noisy pilots with factorized attention, 99% masking, patch-normalized reconstruction, scale loss, and AWGN curriculum to outperform supervised baselines in cross-frequency beam selection and channel tasks from 3.5 GHz pretraining to 28 GHz evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18856","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPHERICAL KV: Angle-Domain Attention and Rate-Distortion Retention for Efficient Long-Context Inference","primary_cat":"cs.LG","submitted_at":"2026-05-13T18:48:48+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11733","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position: LLM Inference Should Be Evaluated as Energy-to-Token Production","primary_cat":"cs.CE","submitted_at":"2026-05-12T08:15:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM inference should be reframed and evaluated as energy-to-token production with a Token Production Function that accounts for power, cooling, and efficiency ceilings.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Symposium on Operating Systems Principles (SOSP), pages 611-626, 2023. URL https: //arxiv.org/abs/2309.06180. [37] E. Frantar, S. Ashkboos, T. Hoefler, and D. Alistarh. Gptq: Accurate post-training quantization for generative pre-trained transformers. InProceedings of the 11th International Conference on Learning Representations, 2023. URLhttps://arxiv.org/abs/2210.17323. [38] J. Lin, J. Tang, H. Tang, S. Yang, X. Dang, and S. Han. Awq: Activation-aware weight quantization for on-device llm compression and acceleration. InProceedings of Machine Learning and Systems, volume 6, 2024. URLhttps://arxiv.org/abs/2306.00978. [39] Longteng Zhang, Xiang Liu, Zeyu Li, Xinglin Pan, Peijie Dong, Ruibo Fan, Rui Guo, Xin Wang, Qiong Luo, Shaohuai Shi, and Xiaowen Chu."},{"citing_arxiv_id":"2605.10414","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Remember to Forget: Gated Adaptive Positional Encoding","primary_cat":"cs.LG","submitted_at":"2026-05-11T11:52:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GAPE augments RoPE with query- and key-dependent gates to stabilize attention and improve long-context performance in language models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"RoPE decomposes queries and keys into two-dimensional chunks, rotating each at a different frequencyg k ∈G, ranging fromg 1 = 1radian per token (highest frequency) tog d/2 ≈1/θradians per token (lowest), whereθis the base wavelength, defaulting to10,000[24]. Long-context extrapolation and the base-scaling deadlock.A natural response to RoPE's ex- trapolation failures is to scale θ. Position Interpolation [2], YaRN [19], and LongRoPE [8] remap rotary frequencies to reduce OOD phase angles at extended lengths. However, recent theoretical analyzes reveal that this exposes aninterpolation-extrapolation deadlock[13, 15, 29]: shrinking θ smooths extrapolation but harms long-range semantic discrimination, while inflating θ preserves local interpolation but devolves low-frequency channels into near-identity maps, ultimately colliding"},{"citing_arxiv_id":"2605.09735","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"KV-RM: Regularizing KV-Cache Movement for Static-Graph LLM Serving","primary_cat":"cs.AR","submitted_at":"2026-05-10T20:10:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"KV-RM regularizes KV-cache movement in static-graph LLM serving via block paging and merge-staged transport to improve throughput, tail latency, and memory use for variable-length decoding.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"(NeurIPS), Vol. 35. Curran Associates, Inc., Red Hook, NY, USA, 16344- 16359.https://arxiv.org/abs/2205.14135 [9] Pradeep Dasigi, Kyle Lo, Iz Beltagy, Arman Cohan, Noah A. Smith, and Matt Gardner. 2021. A Dataset of Information-Seeking Ques- tions and Answers Anchored in Research Papers. arXiv preprint arXiv:2105.03011.https://arxiv.org/abs/2105.03011 [10] Hao Geng, Phitchaya Mangpo Phothilimthana, et al. 2024. vAttention: Dynamic Memory Management for Serving LLMs without PagedAt- tention. arXiv preprint arXiv:2405.04437.https://arxiv.org/abs/2405.0 4437 12 [11] Graphcore. 2023. Poplar SDK Documentation.https://docs.graphcore .ai/. [12] Graphcore Research. 2024. SparQ Attention: Speed up LLM inference"},{"citing_arxiv_id":"2605.08587","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Kaczmarz Linear Attention","primary_cat":"cs.LG","submitted_at":"2026-05-09T01:07:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Kaczmarz Linear Attention replaces the empirical coefficient in Gated DeltaNet with a key-norm-normalized step size derived from the online regression objective, yielding lower perplexity and better needle-in-haystack performance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[7] Tri Dao and Albert Gu. Transformers are ssms: Generalized models and efficient algorithms through structured state space duality, 2024. URLhttps://arxiv.org/abs/2405.21060. [8] Tri Dao, Daniel Y . Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. Flashattention: Fast and memory-efficient exact attention with io-awareness, 2022. URL https://arxiv.org/ abs/2205.14135. [9] Soham De, Samuel L. Smith, Anushan Fernando, Aleksandar Botev, George Cristian-Muraru, Albert Gu, Ruba Haroun, Leonard Berrada, Yutian Chen, Srivatsan Srinivasan, Guillaume Desjardins, Arnaud Doucet, David Budden, Yee Whye Teh, Razvan Pascanu, Nando De Freitas, and Caglar Gulcehre. Griffin: Mixing gated linear recurrences with local attention for efficient"},{"citing_arxiv_id":"2605.08467","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CUDAHercules: Benchmarking Hardware-Aware Expert-level CUDA Optimization for LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-08T20:35:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CUDAHercules benchmark demonstrates that leading LLMs generate functional CUDA code but fail to recover expert-level optimization strategies needed for peak performance on Ampere, Hopper, and Blackwell GPUs.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"at performance diagnosis, but have limited ability in architecture-specific reasoning, hardware- software co-design, and robust debugging or deployment security. across GPU generations. Even within a single workload family, the top panel of Figure 1 shows that the throughput of FlashAttention variants changes substantially across FlashAttention V1 [6] to FlashAttention V2 [5] and FlashAttention V3 [20] on different GPU architectures. The bottom panel illustrates why: the optimization strategies of FlashAttention move from tiled shared-memory reuse on Ampere to asynchronous pipelines on Hopper and warp-specialized execution on Blackwell, with each variant designed for a specific hardware architecture."},{"citing_arxiv_id":"2605.07985","ref_index":15,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dooly: Configuration-Agnostic, Redundancy-Aware Profiling for LLM Inference Simulation","primary_cat":"cs.DC","submitted_at":"2026-05-08T16:44:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dooly reduces LLM inference profiling GPU-hours by 56.4% across 12 models while keeping simulation MAPE under 5% for TTFT and 8% for TPOT by making profiling configuration-agnostic and redundancy-aware.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"signature or a new fused operation) invalidates existing profiles. Expanding coverage requires manual effort:e.g.,patching Vidur to support vLLM required understanding its parallelism initialization logic, integrating its custom fused kernels, and rewriting module callables to match Vidur's expected structure. The unsustainability of this approach is reflected in GitHub issues for Vidur [15, 33, 32], LLMServingSim [18], and Apex [20]. High profiling overhead.Despite their restricted configuration scope, today's simulators incur substantial profiling overhead. To construct an accurate latency model, each model requires an exhaustive sweep over a grid of batch sizes, sequence lengths, and context lengths up to the model's maximum context window, repeated for every new model or backend."},{"citing_arxiv_id":"2605.05049","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Piper: Efficient Large-Scale MoE Training via Resource Modeling and Pipelined Hybrid Parallelism","primary_cat":"cs.DC","submitted_at":"2026-05-06T15:47:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Piper introduces resource modeling and pipelined hybrid parallelism for MoE training, delivering 2-3.5X higher MFU than prior frameworks and 1.2-9X better all-to-all bandwidth.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04489","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Hybrid Method for Low-Resource Named Entity Recognition","primary_cat":"cs.CE","submitted_at":"2026-05-06T04:36:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The hybrid method with LLM-augmented data achieves F1 improvements of 7-24 points over baselines on five Vietnamese domain datasets.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Another breakthrough comes in the form of FlashAttention, which enhances the efficiency of the attention mechanism. By optimizing memory access patterns and minimizing memory reads and writes between GPU high -bandwidth memory and on-chip SRAM, FlashAttention enables faster and more memory-efficient exact attention computations. Tri Dao et al have shown that this technique yields significant speed improvements across both training and inference tasks[25]. Additionally, ONNX Runtime offers a cross-platform framework with built-in optimizations such as operator fusion and quantization, enabling faster Transformer inference. Combined with KV caching and FlashAttention, it contributes to a powerful toolkit for deploying large language models in latency-sensitive applications. 3. Methodology As mentioned above, LLMs are not effective for domain specific NER problems (expensive cost and inference time)."},{"citing_arxiv_id":"2605.03208","ref_index":4,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Kerncap: Automated Kernel Extraction and Isolation for AMD GPUs","primary_cat":"cs.SE","submitted_at":"2026-05-04T22:53:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Kerncap automates extraction of faithful, self-contained GPU kernel reproducers from AMD HIP and Triton workloads via HSA interception and address-space closure, delivering 13.6x faster isolated tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02568","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"StreamIndex: Memory-Bounded Compressed Sparse Attention via Streaming Top-k","primary_cat":"cs.LG","submitted_at":"2026-05-04T13:19:29+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Chunked streaming top-k enables CSA indexer execution at 1M sequence length with 6.21 GB peak memory and >=0.998 recall on synthetic V4-shaped inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02144","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Projection-Free Transformers via Gaussian Kernel Attention","primary_cat":"cs.LG","submitted_at":"2026-05-04T01:57:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Gaussian Kernel Attention replaces learned QKV projections with a Gaussian RBF kernel on per-head token features, using 0.42x parameters and 0.49x FLOPs while showing competitive language modeling performance at depth 20.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02098","ref_index":68,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Spherical to Gaussian: A Comparative Analysis of Point Cloud Cropping Strategies in Large-Scale 3D Environments","primary_cat":"cs.CV","submitted_at":"2026-05-03T23:36:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Gaussian and related cropping strategies for point cloud subclouds improve 3D neural network performance over spherical cropping on large outdoor scenes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27085","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient Training on Multiple Consumer GPUs with RoundPipe","primary_cat":"cs.DC","submitted_at":"2026-04-29T18:26:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"RoundPipe achieves near-zero-bubble pipeline parallelism for LLM training on consumer GPUs by dynamically dispatching computation stages round-robin, yielding 1.48-2.16x speedups and enabling 235B model fine-tuning on 8x RTX 4090.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"constraints that restrict model scalability and training effi- ciency.(1)Limitedmemorycapacity:TheVRAMofatypical consumer-grade GPU falls short of training demands. For instance,trainingan8Bmodelneeds128GBformodelstates alone [37], far larger than 24GB for NVIDIA RTX 4090 or 32GB for NVIDIA RTX 5090. Furthermore, expanding context windows for complex tasks [8, 47] (e.g., reasoning, video generation) also inflates activation memory. (2) Slow inter-GPU communication: Consumer-grade GPUs use PCIe interconnects, offering less than 20% of NVLink bandwidth. This physical limitation is further compounded by root com- plex contention in PCIe topologies [12, 21]. To alleviate VRAM constraints during consumer-grade"},{"citing_arxiv_id":"2604.25080","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CacheFlow: Efficient LLM Serving with 3D-Parallel KV Cache Restoration","primary_cat":"cs.DC","submitted_at":"2026-04-28T00:24:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CacheFlow cuts TTFT by 10-62% in batched LLM serving via 3D-parallel KV cache restoration and a two-pointer scheduler that overlaps recompute and I/O.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24678","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Leveraging LLMs for Multi-File DSL Code Generation: An Industrial Case Study","primary_cat":"cs.SE","submitted_at":"2026-04-27T16:38:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Fine-tuning 7B code LLMs on a custom multi-file DSL dataset achieves structural fidelity of 1.00, high exact-match accuracy, and practical utility validated by expert survey and execution checks.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"org/abs/2107.03374 [6] Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. arXiv:1604.06174 [cs.LG] https://arxiv.org/ abs/1604.06174 [7] Tri Dao. 2023. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. arXiv:2307.08691 [cs.LG] https://arxiv.org/abs/2307.08691 [8] Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. 2022. FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. arXiv:2205.14135 [cs.LG] https://arxiv.org/abs/2205.14135 [9] Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. 2022. LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale."},{"citing_arxiv_id":"2604.22442","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HubRouter: A Pluggable Sub-Quadratic Routing Primitive for Hybrid Sequence Models","primary_cat":"cs.LG","submitted_at":"2026-04-24T10:59:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HubRouter is a sub-quadratic routing primitive using learned hubs that replaces attention layers in hybrid models while delivering competitive perplexity and large throughput gains.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"strictly-causal Hub-GPT comparison in Section 4 is itself not parity but a≈3-PPL gap favouring Jamba. (6) Throughput measurements are on a single RTX 3090 (24 GB), and the Jamba baseline (320 tok/s at seq=1024) is PyTorch-native without custom kernels; production Mamba/SSM implementations (e.g., the officialmamba-ssmCUDA kernels) and FlashAttention [15] would narrow the absolute throughput gap, possibly substantially. As a Fermi estimate: published Mamba CUDA kernels report 5-10×speedups over Python selective scan at this scale, and FlashAttention-2 [16] typically delivers 2-4×over naive attention; a kernel-optimized Jamba would therefore plausibly reach∼2000-3000 tok/s at seq=1024, reducing the"},{"citing_arxiv_id":"2604.21027","ref_index":73,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HypEHR: Hyperbolic Modeling of Electronic Health Records for Efficient Question Answering","primary_cat":"cs.AI","submitted_at":"2026-04-22T19:18:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HypEHR is a hyperbolic embedding model for EHR data that uses Lorentzian geometry and hierarchy-aware pretraining to answer clinical questions nearly as well as large language models but with much smaller size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19241","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UniEP: Unified Expert-Parallel MoE MegaKernel for LLM Training","primary_cat":"cs.DC","submitted_at":"2026-04-21T08:49:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UniEP fuses MoE communication and computation into unified MegaKernels with deterministic token ordering, delivering 1.03x-1.38x speedups over prior work while preserving training accuracy.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"mitigating communication bottlenecks while maintaining the rigorous accuracy standards required for production LLM training. Date:April 22, 2026 1 Introduction The landscape of deep learning has been fundamentally altered by the emergence of mixture-of-experts (MoE) architectures, which increases model size at scalable training and inference cost. State-of-the-art large language models (LLMs), including GPT-5 [35], Gemini3 Pro [9], DeepSeek-V3 [14], and Qwen3 [42] have universally adopted MoE designs to scale parameter counts into hundreds of billions while maintaining manageable activation budgets. This architectural paradigm extends beyond text, underpinning the latest multimodal models such as Qwen-VL [1] and DeepSeek-VL [22]. As the number of experts per layer grows (typically ranging from 64 to 384), expert parallelism (EP) has become the de facto standard for distributing"},{"citing_arxiv_id":"2604.19060","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reinforcement Learning Improves LLM Accuracy and Reasoning in Disease Classification from Radiology Reports","primary_cat":"cs.AI","submitted_at":"2026-04-21T04:09:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SFT followed by GRPO improves LLM accuracy and reasoning recall in disease classification from radiology reports on three radiologist-annotated datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18655","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unlocking the Edge deployment and ondevice acceleration of multi-LoRA enabled one-for-all foundational LLM","primary_cat":"cs.DC","submitted_at":"2026-04-20T07:36:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A framework combines multi-LoRA runtime switching, multi-stream stylistic decoding, and Dynamic Self-Speculative Decoding with INT4 quantization to achieve 4-6x memory and latency gains for on-device inference of a one-for-all foundational LLM on Qualcomm chipsets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16864","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HieraSparse: Hierarchical Semi-Structured Sparse KV Attention","primary_cat":"cs.DC","submitted_at":"2026-04-18T06:28:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HieraSparse delivers a hierarchical semi-structured sparse KV attention system that achieves 1.2x KV compression and 4.57x decode attention speedup versus prior unstructured sparsity methods at equivalent sparsity, plus up to 1.85x prefill speedup and 1.37x/1.77x speedups with magnitude pruning and ","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"ing), which has been widely adopted in popular inference engines likevLLMandSGLang[39], [40]. Frontier works like CacheBlend[41],PromptCache[42], andCacheLink[43] also try to reuse KV Cache at a finer-grained scope, especially when the sharing-prefix condition can not be met. Another batch of works, including different methods of KV Cache quantization and pruning [44], [45], tried to optimize the KV Cache by utilizing its internal numeric characteristics. As shown in Figure 2, some channels in keys consistently exhibit large magnitude across all tokens, while value cache tends to have more uniform small magnitude without a distinguishable pattern.ThinK[17] leveraged this observation to remove trivial channels in both query and value matrices, reducing the overall"},{"citing_arxiv_id":"2604.11288","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Transactional Attention: Semantic Sponsorship for KV-Cache Retention","primary_cat":"cs.CL","submitted_at":"2026-04-13T10:51:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Transactional Attention uses semantic sponsorship from anchor patterns to retain dormant critical tokens in KV caches, achieving 100% credential retrieval at 16 tokens where all prior methods fail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10180","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Tessera: Unlocking Heterogeneous GPUs through Kernel-Granularity Disaggregation","primary_cat":"cs.DC","submitted_at":"2026-04-11T12:19:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Tessera performs kernel-granularity disaggregation on heterogeneous GPUs, achieving up to 2.3x throughput and 1.6x cost efficiency gains for large model inference while generalizing beyond prior methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"tifying memory access patterns is straightforward because of their deterministic semantics. For example, it is easy to iden- tify thatcublasSgemm(.,A,B,.,C,.)readsbufferA andbufferBand writesbufferC, with the buffer sizes given in the API parameters. In contrast, opaque kernels lack exposed semantics, making their memory access patterns difficult to infer. Prior work, as represented by PhoenixOS [27], identifies accessed buffers through GPU API argument speculation and runtime validation. While this extracts buffer base addresses, it suffers from a fundamental limitation. Modern AI frameworks such as PyTorch [17] employ internal memory managers (e.g., the caching allocator [28]) that virtualize GPU memory independently of the CUDA runtime. Because CUDA allo-"},{"citing_arxiv_id":"2603.01960","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TiledAttention: a CUDA Tile SDPA Kernel for PyTorch","primary_cat":"cs.LG","submitted_at":"2026-03-02T15:11:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TiledAttention is a cuTile-based SDPA kernel that balances performance with Python-level customizability for attention research in PyTorch.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.15166","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Fast and Fusiest: An Optimal Fusion-Aware Mapper for Accelerator Design","primary_cat":"cs.AR","submitted_at":"2026-02-16T20:08:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"FFM finds optimal fused mappings for tensor accelerators over 10,000 times faster than prior mappers while cutting energy-delay product by up to 1.8x versus hand-tuned designs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.05695","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SweetSpot: An Analytical Model for Predicting Energy Efficiency of LLM Inference","primary_cat":"cs.AI","submitted_at":"2026-02-05T14:21:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SweetSpot is an analytical model from Transformer computational and memory complexity that identifies energy minima at short-to-moderate inputs and medium outputs, achieving 1.79% MAPE on H100 GPU measurements across multiple LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.03092","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SnapStream: Efficient Long Sequence Decoding on Dataflow Accelerators","primary_cat":"cs.AI","submitted_at":"2025-11-05T00:38:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SnapStream deploys sparse KV attention in a production inference system on dataflow accelerators, delivering 4x on-chip memory savings for DeepSeek-671B at 128k context with up to 1832 tokens/sec and minimal accuracy loss on LongBench-v2, AIME24, and LiveCodeBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.02043","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Flashlight: PyTorch Compiler Extensions to Accelerate Attention Variants","primary_cat":"cs.LG","submitted_at":"2025-11-03T20:25:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Flashlight is a compiler-native PyTorch framework that generates efficient fused kernels for arbitrary and data-dependent attention variants, supporting more cases than FlexAttention with competitive performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.21623","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OjaKV: Context-Aware Online Low-Rank KV Cache Compression","primary_cat":"cs.CL","submitted_at":"2025-09-25T21:42:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OjaKV introduces hybrid full-rank storage for key tokens combined with online low-rank KV cache compression via Oja's algorithm to support memory-efficient long-context LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.19729","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Amoeba: Runtime Tensor Parallel Transformation for LLM Inference Services","primary_cat":"cs.DC","submitted_at":"2025-09-24T03:15:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Amoeba adaptively adjusts tensor parallelism at runtime for LLM inference services to handle mixed short and long context requests, delivering 1.75x-6.57x throughput gains over prior solutions in real-world trace evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.13047","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Model Synthetic Training for Mission-Critical Small Language Models","primary_cat":"cs.CL","submitted_at":"2025-09-16T13:04:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Fine-tunes Qwen2.5-7B on 21,543 synthetic maritime Q&A pairs generated from 3.2B AIS records by GPT-4o and o3-mini, reaching 75% accuracy at 261x lower inference cost than larger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.12635","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Positional Encoding via Token-Aware Phase Attention","primary_cat":"cs.CL","submitted_at":"2025-09-16T03:53:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TAPA adds a learnable phase function to attention to preserve long-range token interactions, enabling direct continual pretraining, length extrapolation, lower perplexity, and stronger retrieval than RoPE-style methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.19190","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hits to Higgs: Hit-Level Higgs Classification from Raw LHC Detector Data Using Higgsformer","primary_cat":"hep-ph","submitted_at":"2025-08-26T16:54:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Higgsformer achieves AUC 0.855 on t tbar H vs t tbar classification from raw hits, matching a Delphes-based Particle Transformer at ~40% b-tagging efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.10013","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training LLMs on HPC Systems: Best Practices from the OpenGPT-X Project","primary_cat":"cs.DC","submitted_at":"2025-04-14T09:17:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Engineering report detailing HPC infrastructure, software choices, and performance measurements for training a 7B LLM using 3D parallelism on JUWELS Booster.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.05564","ref_index":101,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TabICL: A Tabular Foundation Model for In-Context Learning on Large Data","primary_cat":"cs.LG","submitted_at":"2025-02-08T13:25:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TabICL scales in-context learning to large tabular data via column-then-row attention for row embeddings followed by a transformer, matching TabPFNv2 speed and performance while outperforming it and CatBoost on datasets over 10K samples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.05171","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scaling up Test-Time Compute with Latent Reasoning: A Recurrent Depth Approach","primary_cat":"cs.LG","submitted_at":"2025-02-07T18:55:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A recurrent-depth architecture enables language models to improve reasoning performance by iterating computation in latent space, achieving gains equivalent to much larger models on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.10819","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DuoAttention: Efficient Long-Context LLM Inference with Retrieval and Streaming Heads","primary_cat":"cs.CL","submitted_at":"2024-10-14T17:59:58+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DuoAttention identifies retrieval heads requiring full KV cache and streaming heads using constant-length cache to reduce memory and latency in long-context LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.01990","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Learning Alternatives of the Kolmogorov Superposition Theorem","primary_cat":"cs.LG","submitted_at":"2024-10-02T19:53:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ActNet is a new KST-based neural network that outperforms KANs and competes with MLPs in PINN benchmarks for PDE simulation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2407.09577","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FlashNorm: Fast Normalization for Transformers","primary_cat":"cs.LG","submitted_at":"2024-07-12T00:37:55+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FlashNorm is an exact algebraic reformulation of RMSNorm plus linear projection that folds weights and defers normalization to allow parallel execution, plus scale-invariance simplifications that remove redundant norms in certain architectures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.18059","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval","primary_cat":"cs.CL","submitted_at":"2024-01-31T18:30:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RAPTOR introduces a tree-organized retrieval method using recursive abstractive summaries, achieving a 20% absolute accuracy improvement on the QuALITY benchmark when paired with GPT-4.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.17453","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient Streaming Language Models with Attention Sinks","primary_cat":"cs.CL","submitted_at":"2023-09-29T17:59:56+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"StreamingLLM lets finite-window LLMs generalize to infinite-length sequences by retaining initial-token KV states as attention sinks, enabling stable streaming inference up to 4M tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}