{"total":138,"items":[{"citing_arxiv_id":"2606.24102","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PORTER: Language-Grounded Event Representations for Portable Structured EHR Foundation Models","primary_cat":"cs.CL","submitted_at":"2026-06-23T03:33:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PORTER is a language-grounded EHR foundation model that uses text descriptions for events and a numeric pathway, matching fixed-vocabulary performance on 74 tasks while recovering 97.1% AUROC on unseen vocabularies and outperforming on MIMIC.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08674","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BioVid: Autoregressive Video Generation with Biological Behavior Semantic Comprehension","primary_cat":"cs.CV","submitted_at":"2026-06-07T15:23:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BioVid is a data-driven autoregressive model using 2D-encode/3D-decode tokenization and causal Transformer with EOS termination that reproduces real action duration distributions (W1 distance 1.24 frames) on NTU RGB+D drinking clips, outperforming fixed-length baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31589","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Recognizing Co-Speech Gestures in-the-Wild","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:55:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces the first large-scale GRW dataset for semantic co-speech gesture classification, word recognition, and temporal localization in unconstrained videos, along with benchmarks for the three tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31535","ref_index":63,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RayDer: Scalable Self-Supervised Novel View Synthesis from Real-World Video","primary_cat":"cs.CV","submitted_at":"2026-05-29T16:50:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RayDer is a unified transformer backbone for self-supervised static-scene novel view synthesis that absorbs dynamic content as a nuisance factor and shows power-law scaling with data and compute while matching supervised methods in zero-shot settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31530","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UNISON: A Unified Sound Generation and Editing Framework via Deep LLM Fusion","primary_cat":"eess.AS","submitted_at":"2026-05-29T16:43:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UNISON introduces a unified latent diffusion framework with layer-wise LLM fusion and channel-mask task encoding for multiple speech and sound generation and editing tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30981","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cognitive Fatigue in Autoregressive Transformers: Formalization and Measurement","primary_cat":"cs.CL","submitted_at":"2026-05-29T08:18:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Autoregressive transformers exhibit measurable cognitive fatigue during extended generation, quantified by the Fatigue Index that predicts degradation (AUROC 0.95) and repetition (rho 0.94).","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30409","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SANA-Streaming: Real-time Streaming Video Editing with Hybrid Diffusion Transformer","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SANA-Streaming delivers 1280x704 streaming video editing at 24 FPS end-to-end on an RTX 5090 using hybrid DiT blocks, cycle-reverse training, and mixed-precision quantization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30022","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Give it Space! Explicit Disentangling of Positional and Semantic Representations in Encoders","primary_cat":"cs.CL","submitted_at":"2026-05-28T14:42:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Explicitly disentangling semantic and positional streams in a Transformer encoder reveals that absolute positional representations collapse to a 2D document-structure manifold, attention heads specialize by role, and the approach improves linguistic probing performance on 49 of 65 phenomena.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23061","ref_index":77,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Anytime Training with Schedule-Free Spectral Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-21T21:50:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SF-NorMuon is a new schedule-free spectral optimizer that closes the gap with tuned AdamW on 125M-772M parameter models across 1-8x Chinchilla horizons while providing stationarity guarantees.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22344","ref_index":64,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bernini: Latent Semantic Planning for Video Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-21T11:30:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Bernini is a framework that uses an MLLM planner to output semantic representations for a DiT renderer to generate or edit videos, reporting SOTA benchmark performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22884","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tensor Cache: Eviction-conditioned Associative Memory for Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-21T00:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Tensor Cache augments sliding-window attention with an eviction-fed outer-product associative memory and a training correction to improve long-context performance under bounded memory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21842","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Energy-Gated Attention: Spectral Salience as an Inductive Bias for Transformer Attention","primary_cat":"cs.LG","submitted_at":"2026-05-21T00:21:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Energy-Gated Attention improves language model validation loss by gating attention according to spectral energy of key embeddings discovered by a learned projection, with consistent gains on TinyShakespeare and Penn Treebank using under 0.26% extra parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20659","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RoPeSLR: 3D RoPE-driven Sparse-LowRank Attention for Efficient Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-20T03:24:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RoPeSLR combines 3D RoPE-guided sparse attention with head-wise low-rank parameterization to achieve sub-quadratic complexity in DiTs while preserving distance awareness for efficient ultra-long video synthesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20182","ref_index":68,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Atoms of Thought: Universal EEG Representation Learning with Microstates","primary_cat":"cs.LG","submitted_at":"2026-05-19T17:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Microstate tokenizer from clustered EEG signals provides universal representations that outperform traditional time- and frequency-domain features across sleep staging, emotion recognition, and motor imagery tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19944","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Measure-Theoretic Analysis of Reasoning: Structural Generalization and Approximation Limits","primary_cat":"cs.LG","submitted_at":"2026-05-19T15:00:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Applies optimal transport to bound OOD generalization error in Transformers via Lipschitz continuity and TC^0 circuit depth lower bounds for Dyck-k backtracking, supported by evaluations on 54 configurations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18541","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LESSViT: Robust Hyperspectral Representation Learning under Spectral Configuration Shift","primary_cat":"cs.CV","submitted_at":"2026-05-18T15:22:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LESSViT introduces a low-rank efficient spatial-spectral attention mechanism and a hyperspectral masked autoencoder to improve generalization across spectral configuration shifts in hyperspectral imagery.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18128","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"POST: Prior-Observation Adversarial Learning of Spatio-Temporal Associations for Multivariate Time Series Anomaly Detection","primary_cat":"cs.AI","submitted_at":"2026-05-18T09:34:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"POST uses prior-observation adversarial learning on adjacency matrices to reduce spatial over-generalization in graph-based multivariate time series anomaly detection and achieves new SOTA results on detection and channel-wise localization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16184","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Runtime-Orchestrated Second-Order Optimization for Scalable LLM Training","primary_cat":"cs.DC","submitted_at":"2026-05-15T17:03:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Asteria is a runtime system that enables second-order optimization for LLMs by dynamically distributing optimizer state across GPU, CPU, and NVMe while using asynchronous inverse-root computations and bounded-staleness synchronization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15156","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MeMo: Memory as a Model","primary_cat":"cs.CL","submitted_at":"2026-05-14T17:51:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MeMo encodes new knowledge into a separate memory model that integrates with frozen LLMs, showing strong performance on QA benchmarks while avoiding catastrophic forgetting and working without access to model weights.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14854","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FactorizedHMR: A Hybrid Framework for Video Human Mesh Recovery","primary_cat":"cs.CV","submitted_at":"2026-05-14T13:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FactorizedHMR recovers 3D human meshes from video by deterministically anchoring the torso-root then probabilistically completing distal articulations via flow-matching with geometry-aware supervision and a synthetic data pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14457","ref_index":19,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Stateful Reasoning via Insight Replay","primary_cat":"cs.AI","submitted_at":"2026-05-14T06:52:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InsightReplay improves long CoT reasoning by extracting critical insights from the trace and replaying them near the active frontier, delivering +1.65 average accuracy gain across 24 model-benchmark settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13405","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When is Warmstarting Effective for Scaling Language Models?","primary_cat":"cs.LG","submitted_at":"2026-05-13T12:00:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A 2x growth factor in model warmstarting yields reliable training speedups for language models under 20 tokens/parameter budgets, with an empirical upper bound on effective growth factors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13370","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Phasor Memory Networks: Stable Backpropagation Through Time for Scalable Explicit Memory","primary_cat":"cs.LG","submitted_at":"2026-05-13T11:28:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PMNet uses unitary phasor dynamics and hierarchical anchors to make explicit memory stable for long sequences, matching a 3x larger Mamba model on long-context robustness with a 119M parameter network.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12938","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CRePE: Curved Ray Expectation Positional Encoding for Unified-Camera-Controlled Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:18:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CRePE supplies depth-aware positional distributions along curved rays for stable unified-camera control in frozen video DiT models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12697","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Unified Framework for Critical Scaling of Inverse Temperature in Self-Attention","primary_cat":"stat.ML","submitted_at":"2026-05-12T19:48:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"The upper-tail accumulation scale derived from the gap-counting function N_n sets the critical inverse temperature for softmax attention concentration, unifying prior conflicting laws as special cases of different N_n.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12651","ref_index":80,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Runtime Monitoring of Perception-Based Autonomous Systems via Embedding Temporal Logic","primary_cat":"cs.LG","submitted_at":"2026-05-12T18:57:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Embedding Temporal Logic (ETL) performs runtime monitoring directly in learned embedding spaces using distance-based predicates composed with temporal operators, supported by conformal calibration for reliable predicate evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10537","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mela: Test-Time Memory Consolidation based on Transformation Hypothesis","primary_cat":"cs.CL","submitted_at":"2026-05-11T13:20:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mela is a Transformer variant with a dual-frequency Hierarchical Memory Module and MemStack that performs test-time memory consolidation, outperforming baselines on long contexts.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"386-389, 1970. Aakash Lahoti, Kevin Li, Berlin Chen, Caitlin Wang, Aviv Bick, J Zico Kolter, Tri Dao, and Albert Gu. Mamba-3: Improved sequence modeling using state space principles. InThe Fourteenth International Conference on Learning Representations, 2026. Joseph E LeDoux and Hakwan Lau. Seeing consciousness through the lens of memory.Current biology, 30(18): R1018-R1022, 2020. Jingyuan Liu, Jianlin Su, Xingcheng Yao, Zhejun Jiang, Guokun Lai, Yulun Du, Yidao Qin, Weixin Xu, Enzhe Lu, Junjie Yan, et al. Muon is scalable for llm training.arXiv preprint arXiv:2502.16982, 2025. Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization.arXiv preprint arXiv:1711.05101, 2017. Lingchen Meng, Jianwei Yang, Rui Tian, Xiyang Dai, Zuxuan Wu, Jianfeng Gao, and Yu-Gang Jiang."},{"citing_arxiv_id":"2605.09949","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Syntax to Semantics: Unveiling the Emergence of Chirality in SMILES Translation Models","primary_cat":"cs.LG","submitted_at":"2026-05-11T03:53:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chirality emerges in SMILES translation models through an abrupt encoder-centered reorganization of representations after a long plateau, identified via checkpoint analysis and ablation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Zhang, Peng Wang, Qin Zhu, Rui Men, Ruize Gao, Shixuan Liu, Shuang Luo, Tianhao Li, Tianyi Tang, Wenbiao Yin, Xingzhang Ren, Xinyu Wang, Xinyu Zhang, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yinger Zhang, Yu Wan, Yuqiong Liu, Zekun Wang, Zeyu Cui, Zhenru Zhang, Zhipeng Zhou, and Zihan Qiu. Qwen3 technical report.arXiv [cs.CL], 2025. doi:10.48550/arXiv.2505.09388. [41] Gemma Team, Thomas Mesnard, Cassidy Hardin, Robert Dadashi, Surya Bhupatiraju, Shreya Pathak, Laurent Sifre, Morgane Rivière, Mihir Sanjay Kale, Juliette Love, Pouya Tafti, Léonard Hussenot, Pier Giuseppe Sessa, Aakanksha Chowdhery, Adam Roberts, Aditya Barua, Alex Botev, Alex Castro-Ros, Ambrose Slone, Amélie Héliou, Andrea Tacchetti, Anna Bulanova, Antonia Paterson, Beth Tsai, Bobak Shahriari, Charline Le Lan,"},{"citing_arxiv_id":"2605.09449","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SpaceMind++: Toward Allocentric Cognitive Maps for Spatially Grounded Video MLLMs","primary_cat":"cs.CV","submitted_at":"2026-05-10T10:01:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpaceMind++ adds an explicit voxelized allocentric cognitive map and coordinate-guided fusion to video MLLMs, claiming SOTA on VSI-Bench and improved out-of-distribution generalization on three other 3D benchmarks.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"features and 3D coordinates into a voxelized cognitive map. CDIF then alternates between intra-map reasoning and map reading to inject spatial knowledge back into semantic tokens. Both stages are guided by coordinate embeddings and 3D RoPE to preserve metric 3D relationships. to the voxel feature v(l) j , then apply 3D Continuous Rotary Positional Embedding (3D RoPE) [50] to make the queries and keys coordinate-aware. This enables attention to model relative spatial relationships in metric 3D space: ˜v(l) j =v (l) j + MLP(pj), ˆq(l) j , ˆk(l) j = RoPE(Linear( ˜v(l) j ),p j). (8) The voxel map is then updated via self-attention: V (l+1) = SelfAttn( ˆQ, ˆK,V(V (l))).(9) This step enables the cognitive map to perform global geometric inference, such as modeling object-"},{"citing_arxiv_id":"2605.11007","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RT-Transformer: The Transformer Block as a Spherical State Estimator","primary_cat":"cs.LG","submitted_at":"2026-05-10T08:14:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Transformer components arise as the natural solution to precision-weighted directional state estimation on the hypersphere.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09165","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sparse Layers are Critical to Scaling Looped Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-09T20:58:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Looped MoE models scale better than standard transformers because different experts activate on each loop pass, recovering expressivity without extra parameters, and support superior early exits.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"resulting Looped-MoE architecture scales better than a standard dense LM, stores fewer weights for the same accuracy, and enables more compute savings via early exits at loop boundaries. 2 Model Descriptions The architectures compared in this work share a common backbone: a decoder-only transformer with multi-head self-attention (MHSA) using rotary positional embeddings (RoPE) [10], SwiGLU feed-forward networks [11], pre-RMSNorm [12], and a residual stream (Figure 8). To explore the impact of sparse layers on looped scaling laws, we vary model architecture by whether the FFN is dense or sparse and whether the layers are looped (Table 1). 2.1 Layer Looping Table 1: Scaling Study Configurations. Architecture FFN Layers Looped Dense8×2 Base Dense 16 Looped-MoE Sparse8×2"},{"citing_arxiv_id":"2605.09126","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cosine-Gated Adam-Decay: Drop-In Staleness-Aware Outer Optimization for Decoupled DiLoCo","primary_cat":"cs.LG","submitted_at":"2026-05-09T19:16:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CGAD is a staleness-aware Adam variant for DiLoCo that gates gradients with cosine and exponential decay, proves a convergence bound independent of maximum delay, and demonstrates stable pretraining of 25M to 7B parameter Llama-style models across controlled delays.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06225","ref_index":2,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Memory Inception: Latent-Space KV Cache Manipulation for Steering LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-07T13:19:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Memory Inception is a training-free method that injects latent KV banks at chosen layers to steer LLMs, achieving superior control-drift balance and up to 118x storage reduction on personality and structured-reasoning tasks.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"KV Cache Memory Bank.A memory bank is a small collection of latent KV slots. BankbcontainsM b slots, B(b) ={(k(b) m ,v (b) m )}Mb m=1, K (b) = [k(b) 1 ,...,k (b) Mb]⊤, V (b) = [v(b) 1 ,...,v (b) Mb]⊤. 3 At a selected site, MI conceptually augments the prompt cache with these slots: K⋆ t = concat(Kx ≤t,K (1),...,K (B)), V ⋆ t = concat(Vx ≤t,V (1),...,V (B)).(2) The resulting selected-site output iso⋆ t = Attn(qt,K⋆ t,V ⋆ t ). This augmented-cache view is the reference formulation used throughout the paper. In implementation, the ordinary prompt cache remains intact and the reminder bank is consumed as a side bank only at selected layers and units; Appendix B.1 gives the backend-specific Qwen3 details. This"},{"citing_arxiv_id":"2605.05683","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Spectral Lens: Activation and Gradient Spectra as Diagnostics of LLM Optimization","primary_cat":"stat.ML","submitted_at":"2026-05-07T05:19:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Spectral analysis of activations and gradients provides new diagnostics that link batch size to representation geometry, early covariance tails to token efficiency, and spectral shifts to learning dynamics in decoder-only LLMs, backed by a mechanistic model.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"github.io/posts/muon/. [48] KellerJordan/modded-nanogpt contributors. modded-nanogpt record 3: Introduced the muon optimizer, 2024. URL https://github.com/KellerJordan/modded-nanogpt#world-recor d-history. The repository's world-record table lists record 3, but says no log is available; the originally uploaded 2024-10-04_Muon path did not resolve. [49] KellerJordan/modded-nanogpt contributors. modded-nanogpt record 8: Untied embedding and head, 2024. URL https://github.com/KellerJordan/modded-nanogpt/blob/master/recor ds/track_1_short/2024-11-03_UntieEmbed/d6b50d71-f419-4d26-bb39-a60d55ae7a04.tx t. [50] KellerJordan/modded-nanogpt contributors. modded-nanogpt record 9: Value and embedding skip connections, momentum warmup, logit softcap, 2024."},{"citing_arxiv_id":"2605.05394","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BARFI-Q: Quantum-Enhanced Block Attention Residual Fusion Framework for Multivariate Time-Series Forecasting in Atom Interferometry","primary_cat":"quant-ph","submitted_at":"2026-05-06T19:26:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"BARFI-Q integrates patch-based embedding, dual-branch temporal modeling, hierarchical fusion, adaptive block-attention residuals, and quantum feature mapping to forecast atom interferometry time-series, outperforming baselines while representing targets in circular sine-cosine space.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05341","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Feature Starvation as Geometric Instability in Sparse Autoencoders","primary_cat":"cs.LG","submitted_at":"2026-05-06T18:11:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adaptive elastic net SAEs (AEN-SAEs) mitigate feature starvation in SAEs by combining ℓ2 structural stability with adaptive ℓ1 reweighting, producing a Lipschitz-continuous sparse coding map that recovers global feature support under mild assumptions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Jumping ahead: Improving reconstruction fidelity with jumprelu sparse autoencoders, 2024. URLhttps://doi.org/10.48550/arXiv.2407.14435. [34] S. M. Robinson. Some continuity properties of polyhedral multifunctions.Mathemati- cal Programming at Oberwolfach, 14:206-214, 1981. URL https://doi.org/10.1007/ BFb0120929. [35] T. Rockafellar and R. Wets.Variational analysis. Springer, 1998. URL https://doi.org/ 10.1007/978-3-642-02431-3. [36] J. Su, Y . Lu, S. Pan, A. Murtadha, B. Wen, and Y . Liu. RoFormer: Enhanced transformer with rotary position embedding, 2023. URLhttps://doi.org/10.48550/arXiv.2104.09864. [37] R. Tibshirani. Regression shrinkage and selection via the Lasso.Journal of the Royal Statistical Society: Series B (Methodological), 58(1):267-288, 1996. URL https://doi.org/10."},{"citing_arxiv_id":"2605.04217","ref_index":4,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Jordan-RoPE: Non-Semisimple Relative Positional Encoding via Complex Jordan Blocks","primary_cat":"cs.LG","submitted_at":"2026-05-05T18:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Jordan-RoPE realizes a distance-modulated phase basis via non-semisimple Jordan blocks, generating features such as d e^{iωd} for relative positional encoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02568","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"StreamIndex: Memory-Bounded Compressed Sparse Attention via Streaming Top-k","primary_cat":"cs.LG","submitted_at":"2026-05-04T13:19:29+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Chunked streaming top-k enables CSA indexer execution at 1M sequence length with 6.21 GB peak memory and >=0.998 recall on synthetic V4-shaped inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02075","ref_index":35,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Graph Transformers and Stabilized Reinforcement Learning for Large-Scale Dynamic Routing Modulation and Spectrum Allocation in Elastic Optical Networks","primary_cat":"cs.NI","submitted_at":"2026-05-03T22:26:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A graph transformer with RL stabilizations is the first to exceed benchmarks for dynamic RMSA, supporting up to 13% more traffic load on networks up to 143 nodes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00662","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Spiking Sequence Machines and Transformers","primary_cat":"cs.NE","submitted_at":"2026-05-01T13:45:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Spiking SDM and transformers implement identical functional operations for sequences via cosine similarity retrieval, unified by a phase-latency isomorphism between spike timing and sinusoidal positional encoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.25819","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mutual Forcing: Dual-Mode Self-Evolution for Fast Autoregressive Audio-Video Character Generation","primary_cat":"cs.CV","submitted_at":"2026-04-28T16:28:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mutual Forcing trains a single native autoregressive audio-video model with mutually reinforcing few-step and multi-step modes via self-distillation to match 50-step baselines at 4-8 steps.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"attention computation across the two branches, allowing audio and video tokens to attend to each other directly. We first pre-train the two branches separately and then jointly fine-tune them end-to-end, producing synchronized audio and video predictions for streaming generation. 4 3D RoPE Embedding for Streaming Representations.To distinguish multimodal positional information, we introduce a 3D RoPE [39] encoding that factorizes position into tempo- ral, height, and width coordinates. We apply it to video, au- dio, and text tokens; for audio and text, the height and width coordinates are set to 0. All positions are computed from the actual timestamps of the corresponding audio, video, and text, ensuring temporal alignment across modalities. Two-stage Training Strategy."},{"citing_arxiv_id":"2604.25786","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Homogeneous Stellar Parameters from Heterogeneous Spectra with Deep Learning","primary_cat":"astro-ph.GA","submitted_at":"2026-04-28T15:54:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A single end-to-end Transformer model unifies stellar labels from heterogeneous spectroscopic surveys into a self-consistent scale without post-hoc recalibration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24809","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nautile-370M: Spectral Memory Meets Attention in a Small Reasoning Model","primary_cat":"cs.LG","submitted_at":"2026-04-27T08:07:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Nautile-370M is a hybrid small language model using SeqCond Attention layers alternating with transformers, with a claimed proof that the spectral operator matches full self-attention expressiveness in the continuous limit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23811","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Attention Is Not All You Need for Diffraction","primary_cat":"cond-mat.mtrl-sci","submitted_at":"2026-04-26T17:22:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Physics-informed transformer with sin^2(theta) encoding, physics-aware positional encoding, multi-task decoder, and three-stage curriculum classifies powder diffraction into 99 extinction groups, with structured errors on symmetry subgroup hierarchy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23434","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Does Removing LayerNorm Help? Activation Bounding as a Regime-Dependent Implicit Regularizer","primary_cat":"cs.LG","submitted_at":"2026-04-25T20:12:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DyT improves validation loss 27% at 64M params/1M tokens but worsens it 19% at 118M tokens, with saturation levels predicting the sign of the effect.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22554","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Video Analysis and Generation via a Semantic Progress Function","primary_cat":"cs.CV","submitted_at":"2026-04-24T13:48:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A Semantic Progress Function is defined as a 1D curve of cumulative semantic shifts from frame embeddings, supporting a linearization procedure that retimes video sequences for constant-rate semantic evolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21182","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"WildSplatter: Feed-forward 3D Gaussian Splatting with Appearance Control from Unconstrained Images","primary_cat":"cs.CV","submitted_at":"2026-04-23T00:58:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"WildSplatter jointly learns 3D Gaussians and appearance embeddings from unconstrained photo collections to enable fast feed-forward reconstruction and flexible lighting control in 3D Gaussian Splatting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21079","ref_index":50,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Foveated Reasoning: Stateful, Action-based Visual Focusing for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-04-22T20:44:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Foveated Reasoner integrates foveation as stateful actions inside the autoregressive decoding loop of vision-language models, trained via cold-start supervision then reinforcement learning to achieve higher accuracy at low token budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21035","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Masked-Token Prediction for Anomaly Detection at the Large Hadron Collider","primary_cat":"hep-ph","submitted_at":"2026-04-22T19:29:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The work demonstrates masked-token prediction with transformers for model-independent anomaly detection in LHC data, achieving strong results on top-rich BSM signatures like four-top production using VQ-VAE tokenization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21027","ref_index":143,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HypEHR: Hyperbolic Modeling of Electronic Health Records for Efficient Question Answering","primary_cat":"cs.AI","submitted_at":"2026-04-22T19:18:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HypEHR is a hyperbolic embedding model for EHR data that uses Lorentzian geometry and hierarchy-aware pretraining to answer clinical questions nearly as well as large language models but with much smaller size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}