{"total":58,"items":[{"citing_arxiv_id":"2605.23258","ref_index":30,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"A Simple Plug-in for Improving Eviction-Based KV Cache Compression","primary_cat":"cs.LG","submitted_at":"2026-05-22T06:00:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VECTOR augments eviction-based KV cache compression with three-way token routing that combines importance scoring and offline regression-based reconstructability estimation to improve quality at high compression ratios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22668","ref_index":21,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SEGA: Spectral-Energy Guided Attention for Resolution Extrapolation in Diffusion Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-21T16:09:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SEGA adaptively scales RoPE attention components using spectral-energy guidance from the latent to improve structural coherence and fine details in high-resolution DiT synthesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21981","ref_index":32,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"RiT: Vanilla Diffusion Transformers Suffice in Representation Space","primary_cat":"cs.CV","submitted_at":"2026-05-21T04:21:43+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A vanilla Diffusion Transformer trained via x-prediction on frozen DINOv2 features reaches FID 1.14 on ImageNet 256x256 with fewer parameters and faster sampling than prior DiT 
variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21975","ref_index":52,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Reasoning through Verifiable Forecast Actions: Consistency-Grounded RL for Financial LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-21T04:09:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StockR1 unifies LLM-based financial reasoning and time-series forecasting by emitting verifiable forecast actions that condition a decoder, optimized via consistency-grounded RL to improve accuracy on QA and prediction tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21573","ref_index":21,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Lens: Rethinking Training Efficiency for Foundational Text-to-Image Models","primary_cat":"cs.CV","submitted_at":"2026-05-20T17:59:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Lens is a 3.8B-parameter text-to-image model that reaches competitive or superior performance to >6B-parameter systems using 19.3% of the training compute of Z-Image through a densely captioned 800M dataset, multi-resolution batching, semantic VAE, strong language encoder, RL fine-tuning, and 4-step","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20134","ref_index":21,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"TrajTok: Adaptive Spatial Tokenization for Trajectory Representation Learning","primary_cat":"cs.LG","submitted_at":"2026-05-19T17:18:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TrajTok learns 
multi-resolution hexagonal spatial tokens from GPS data and pretrains a factorized transformer with ST-RoPE and masked modeling to yield frozen encoders that outperform task-specific methods on similarity, classification, and travel-time tasks in the Porto dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19376","ref_index":46,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Generative Recursive Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-19T05:20:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRAM is a latent-variable generative model that performs recursive reasoning via stochastic trajectories, trained with amortized variational inference to support multi-hypothesis reasoning and unconditional generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18735","ref_index":39,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"PIXLRelight: Controllable Relighting via Intrinsic Conditioning","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:55:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A transformer-based neural renderer that transfers arbitrary PBR lighting to single images via shared intrinsic conditioning extracted from both multi-illumination photos and path-traced coarse 3D renders.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18553","ref_index":41,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"StableHand: Quality-Aware Flow Matching for World-Space Dual-Hand Motion Estimation from Egocentric 
Video","primary_cat":"cs.CV","submitted_at":"2026-05-18T15:33:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StableHand introduces a quality-aware flow matching framework conditioned on predicted four-channel per-frame hand observation quality to estimate dual-hand world-space motion from egocentric video, achieving SOTA results with 20-25% W-MPJPE reduction on HOT3D and ARCTIC benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18331","ref_index":35,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Prune, Update and Trim: Robust Structured Pruning for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-18T12:48:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Putri is a structured pruning technique for LLMs that compensates for pruning errors via weight updates and sequential processing while pruning at the attention-head level to reach state-of-the-art results at extreme sparsity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17109","ref_index":34,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DynMuon: A Dynamic Spectral Shaping View of Muon","primary_cat":"cs.LG","submitted_at":"2026-05-16T18:30:11+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16003","ref_index":38,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Echo-Forcing: A Scene Memory Framework for Interactive Long Video 
Generation","primary_cat":"cs.CV","submitted_at":"2026-05-15T14:33:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Echo-Forcing decouples stable anchors, compressed history, and recent dynamics in video diffusion KV caches using hierarchical memory, scene recall frames, and difference-aware decay to support interactive long video generation under bounded cache.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15741","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"HyperDiT: Hyper-Connected Transformers for High-Fidelity Pixel-Space Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-15T08:51:55+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14705","ref_index":75,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Towards Continuous Sign Language Conversation from Isolated Signs","primary_cat":"cs.CV","submitted_at":"2026-05-14T11:22:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Constructs continuous sign conversation data from isolated signs using retrieval and diffusion models to train a direct sign-to-sign conversational AI.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"collected from diverse web sources and public datasets, represented in a shared 3D motion space. Although we retrieve a core-signing clip for each sign unit after trimming non-lexical preparation and retraction motions, adjacent clips remain misaligned in duration, boundary pose, and motion trajectory. 
Unlike prior works on prepared sign transitions [75, 86], our setting requires composing continuous signing from independently collected clips, making both duration alignment and boundary refinement necessary. We therefore proposeBRAID( Boundary Refinement via co- Articulatory Inpainting Diffusion Transformer), which aligns adjacent sign pairs with a predicted duration plan and refines boundary regions through co-articulatory inpainting to produce pseudo-continuous signing."},{"citing_arxiv_id":"2605.13831","ref_index":74,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Training Long-Context Vision-Language Models Effectively with Generalization Beyond 128K Context","primary_cat":"cs.CV","submitted_at":"2026-05-13T17:52:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Continued pre-training with balanced long-document VQA data extends a 7B LVLM to 128K context, improving long-document VQA by 7.1% and generalizing to 512K without further training.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/. [72] bloc97. Add NTK-Aware interpolation \"by parts\" correction, 2023. URLhttps://github.com/jquesnelle/ scaled-rope/pull/1. [73] Jianlin Su, Murtadha Ahmed, Yu Lu, Shengfeng Pan, Wen Bo, and Yunfeng Liu. Roformer: Enhanced transformer with rotary position embedding.Neurocomputing, 568:127063, 2024. [74] Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, et al. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution.arXiv preprint arXiv:2409.12191, 2024. 
15 Appendix A Final Recipe and Implementation A.1 Final LongPT Recipe We summarize the final LongPT recipe used for the main results."},{"citing_arxiv_id":"2605.12335","ref_index":63,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"EHR-RAGp: Retrieval-Augmented Prototype-Guided Foundation Model for Electronic Health Records","primary_cat":"cs.IR","submitted_at":"2026-05-12T16:17:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EHR-RAGp is a retrieval-augmented EHR foundation model that employs prototype-guided retrieval to dynamically integrate relevant historical patient context, outperforming prior models on clinical prediction tasks.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"325, 0.369) LongFormer [61] 0.736(0.713, 0.760)0.123(0.105, 0.153)0.922(0.914, 0.929)0.645(0.615, 0.673)0.855(0.846, 0.864)0.546(0.521, 0.570)0.789(0.778, 0.800)0.356(0.336, 0.382) BigBird [62] 0.724(0.700, 0.747)0.133(0.109, 0.164)0.931(0.925, 0.938)0.690(0.667, 0.714)0.864(0.855, 0.874)0.559(0.534, 0.585)0.787(0.777, 0.798)0.355(0.332, 0.377) RoFormer [63] 0.739(0.716, 0.760)0.142(0.116, 0.173)0.935(0.928, 0.942)0.701(0.679, 0.724)0.872(0.863, 0.880)0.568(0.542, 0.593)0.798(0.788, 0.808)0.364(0.342, 0.387) Retrieval-based models REMed [54] 0.535(0.510, 0.561)0.044(0.039, 0.052)0.867(0.857, 0.876)0.468(0.439, 0.499)0.800(0.790, 0.811)0.363(0.342, 0.386)0.622(0.608, 0.636)0.185(0.175, 0.199) VanillaEHR-RAGp0."},{"citing_arxiv_id":"2605.12241","ref_index":31,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Pretraining Strategies and Scaling for ECG Foundation Models: A Systematic Study","primary_cat":"eess.SP","submitted_at":"2026-05-12T15:10:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Contrastive predictive coding 
pretraining combined with structured state space models yields the strongest ECG foundation models, with continued gains from scaling data to 11 million samples.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Backbone architecturesAll models share a common encoder comprising a lightweight CNN stem followed by a sequential backbone, with the encoder fixed across all pretraining objectives to enable controlled comparison. The CNN stem consists of four convolutional layers with batch normaliza- tion. For the backbone, we evaluate three variants: a S4-based backbone [26], a Transformer [25] backbone with RoPE positional encoding [31] and GELU activations [32], and a CNN-based model (Net1D [8]). We further investigate the effect of S4 backbone depth by comparing 4-layer and 6-layer configurations, and conduct a supervised model dimension ablation across dimensions 512, 768, and 1024 with corresponding state dimensions 8, 12, and 16 to determine the optimal capacity for ECG representation learning."},{"citing_arxiv_id":"2605.10268","ref_index":31,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MemReread: Enhancing Agentic Long-Context Reasoning via Memory-Guided Rereading","primary_cat":"cs.CL","submitted_at":"2026-05-11T09:30:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MemReread improves agent long-context reasoning by triggering rereading on insufficient final memory to recover discarded indirect facts, outperforming baselines at linear complexity.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[29] Yiran Ding, Li Lyna Zhang, Chengruidong Zhang, Yuanyuan Xu, Ning Shang, Jiahang Xu, Fan Yang, and Mao Yang. Longrope: Extending llm context window beyond 2 million tokens.arXiv preprint arXiv:2402.13753, 2024. 
[30] Jianlin Su, Murtadha Ahmed, Yu Lu, Shengfeng Pan, Wen Bo, and Yunfeng Liu. Roformer: Enhanced transformer with rotary position embedding.Neurocomputing, 568:127063, 2024. [31] Yuyang Hu, Shichun Liu, Yanwei Yue, Guibin Zhang, Boyang Liu, Fangyi Zhu, Jiahang Lin, Honglin Guo, Shihan Dou, Zhiheng Xi, et al. Memory in the age of ai agents.arXiv preprint arXiv:2512.13564, 2025. [32] Prateek Chhikara, Dev Khant, Saket Aryan, Taranjeet Singh, and Deshraj Yadav. Mem0: Building production-ready ai agents with scalable long-term memory."},{"citing_arxiv_id":"2605.10045","ref_index":30,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"ExtraVAR: Stage-Aware RoPE Remapping for Resolution Extrapolation in Visual Autoregressive Models","primary_cat":"cs.CV","submitted_at":"2026-05-11T06:14:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ExtraVAR enables resolution extrapolation in visual autoregressive models by stage-aware RoPE remapping and entropy-driven attention scaling, suppressing repetition and detail loss.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Proceedings of the IEEE/CVF International Conference on Computer Vision, pages 16893-16903, 2025. [29] Jingjing Ren, Wenbo Li, Haoyu Chen, Renjing Pei, Bin Shao, Yong Guo, Long Peng, Fenglong Song, and Lei Zhu. Ultrapixel: Advancing ultra high-resolution image synthesis to new peaks.Advances in Neural Information Processing Systems, 37:111131-111171, 2024. [30] Jianlin Su, Murtadha Ahmed, Yu Lu, Shengfeng Pan, Wen Bo, and Yunfeng Liu. Roformer: Enhanced transformer with rotary position embedding.Neurocomputing, 568:127063, 2024. [31] Peize Sun, Yi Jiang, Shoufa Chen, Shilong Zhang, Bingyue Peng, Ping Luo, and Zehuan Yuan. 
Autore- gressive model beats diffusion: Llama for scalable image generation.arXiv preprint arXiv:2406."},{"citing_arxiv_id":"2605.09294","ref_index":55,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Towards Effective Theory of LLMs: A Representation Learning Approach","primary_cat":"cs.LG","submitted_at":"2026-05-10T03:42:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RET learns temporally consistent macrovariables from LLM activations via self-supervised learning to support interpretability, early behavioral prediction, and causal intervention.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Nicholas L Turner, Callum McDougall, Monte MacDiarmid, C. Daniel Freeman, Theodore R. Sumers, Edward Rees, Joshua Batson, Adam Jermyn, Shan Carter, Chris Olah, and Tom Henighan. Scaling monosemanticity: Extracting interpretable features from claude 3 sonnet. Transformer Circuits Thread, 2024. URL https://transformer-circuits.pub/2024/ scaling-monosemanticity/index.html. [55] Alexander Matt Turner, Lisa Thiergart, Gavin Leech, David Udell, Juan J Vazquez, Ulisse Mini, and Monte MacDiarmid. Steering language models with activation engineering.arXiv preprint arXiv:2308.10248, 2023. 13 [56] Muhammed Ustaomeroglu, Baris Askin, Gauri Joshi, Carlee Joe-Wong, and Guannan Qu. 
Internal planning in language models: Characterizing horizon and branch awareness."},{"citing_arxiv_id":"2605.09196","ref_index":34,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"RigidFormer: Learning Rigid Dynamics using Transformers","primary_cat":"cs.CV","submitted_at":"2026-05-09T22:31:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RigidFormer learns mesh-free rigid dynamics from point clouds using object-centric anchors, Anchor-Vertex Pooling, Anchor-based RoPE, and differentiable Kabsch alignment to enforce rigidity.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"For an attention head with query/key channels split into a rotary part and a pass-through part,q= [q r;q p]andk= [k r;k p], ARoPE applies ˜q= [q r ⊙cosa q + rot(qr)⊙sina q;q p], ˜k= [k r ⊙cosa k + rot(kr)⊙sina k;k p], where aq and ak are the ARoPE descriptors for the query and key tokens, and rot(·) swaps each even-odd channel pair with a sign flip as in standard RoPE [ 34]. Mean-pooling these per-anchor rotary features-rather than concatenating raw anchor coordinates as in a naive multi-point variant- matches the symmetry that anchor identities are arbitrary, while the encoding still depends on world-frame positions and therefore captures object centroid and shape extent. 
ARoPE is invariant to anchor reindexing: for any anchor permutation π, ARoPE({x(i)"},{"citing_arxiv_id":"2605.09173","ref_index":43,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"WavesFM: Hierarchical Representation Learning for Longitudinal Wearable Sensor Waveforms","primary_cat":"cs.LG","submitted_at":"2026-05-09T21:22:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"WavesFM uses hierarchical SSL to pretrain a segment encoder on short waveforms followed by a temporal encoder on multi-day sequences, outperforming prior methods on 58 tasks after training on over 12 million hours of data from hundreds of thousands of people.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Biological Psychiatry, 108:110162, 2021. [42] Carissa Gardiner, Jonathon Weakley, Louise M Burke, Gregory D Roach, Charli Sargent, Nirav Maniar, Minh Huynh, Dean J Miller, Andrew Townshend, and Shona L Halson. The effect of alcohol on subsequent sleep in healthy adults: A systematic review and meta-analysis.Sleep Medicine Reviews, 80:102030, 2025. [43] Jianlin Su, Murtadha Ahmed, Yu Lu, Shengfeng Pan, Wen Bo, and Yunfeng Liu. Roformer: Enhanced transformer with rotary position embedding.Neurocomputing, 568:127063, 2024. [44] SeppHochreiterandJürgenSchmidhuber. Longshort-termmemory.Neuralcomputation,9(8):1735-1780, 1997. [45] Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, and Ross Girshick."},{"citing_arxiv_id":"2605.07915","ref_index":77,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"What Matters for Diffusion-Friendly Latent Manifold? 
Prior-Aligned Autoencoders for Latent Diffusion","primary_cat":"cs.CV","submitted_at":"2026-05-08T15:52:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prior-Aligned AutoEncoders shape latent manifolds with spatial coherence, local continuity, and global semantics to improve latent diffusion, achieving SOTA gFID 1.03 on ImageNet 256x256 with up to 13x faster convergence.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[75] Jianlin Su, Murtadha Ahmed, Yu Lu, Shengfeng Pan, Wen Bo, and Yunfeng Liu. Roformer: Enhanced transformer with rotary position embedding.Neurocomputing, 568:127063, 2024. [76] Hao Tang, Chenwei Xie, Xiaoyi Bao, Tingyu Weng, Pandeng Li, Yun Zheng, and Liwei Wang. Unilip: Adapting clip for unified multimodal understanding, generation and editing, 2026. URLhttps://arxiv.org/abs/2507.23278. [77] Hansi Teng, Hongyu Jia, Lei Sun, Lingzhi Li, Maolin Li, Mingqiu Tang, Shuai Han, Tianning Zhang, WQ Zhang, Weifeng Luo, et al. Magi-1: Autoregressive video generation at scale. arXiv preprint arXiv:2505.13211, 2025. 
[78] Shengbang Tong, Boyang Zheng, Ziteng Wang, Bingda Tang, Nanye Ma, Ellis Brown, Jihan Yang, Rob Fergus, Yann LeCun, and Saining Xie."},{"citing_arxiv_id":"2605.07097","ref_index":78,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Every Feedforward Neural Network Definable in an o-Minimal Structure Has Finite Sample Complexity","primary_cat":"stat.ML","submitted_at":"2026-05-08T01:26:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Every fixed finite feedforward neural network definable in an o-minimal structure has finite sample complexity in the agnostic PAC setting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06395","ref_index":96,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Consistent Geometric Deep Learning via Hilbert Bundles and Cellular Sheaves","primary_cat":"cs.LG","submitted_at":"2026-05-07T15:08:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"HilbNets define convolutions via Hilbert bundle connection Laplacians, prove that sampled Hilbert cellular sheaf Laplacians converge to the continuous operator, and show that discretized networks are consistent and transferable across samplings.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[94] Federica Spoto, Alessia Caponera, and Pierpaolo Brutti. Change point detection for functional autoregressive processes on the sphere.arXiv preprint arXiv:2512.03255, 2025. [95] Jianlin Su, Murtadha Ahmed, Yu Lu, Shengfeng Pan, Wen Bo, and Yunfeng Liu. Roformer: Enhanced transformer with rotary position embedding.Neurocomputing, 568:127063, 2024. 
[96] Shantanu Thakoor, Corentin Tallec, Mohammad Gheshlaghi Azar, Mehdi Azabou, Eva L Dyer, Remi Munos, Petar Veliˇckovi'c, and Michal Valko. Large-scale representation learning on graphs via bootstrapping. InInternational Conference on Learning Representations, 2022. URLhttps://openreview.net/forum?id=0UXT6PpRpW. [97] B. Vallet and B. Lévy. Spectral geometry processing with manifold harmonics."},{"citing_arxiv_id":"2605.06169","ref_index":16,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Mean Mode Screaming: Mean--Variance Split Residuals for 1000-Layer Diffusion Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-07T12:53:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mean-Variance Split residuals separate centered variation from mean updates to prevent collapse and enable stable training of 1000-layer Diffusion Transformers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03780","ref_index":39,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Task Vector Geometry Underlies Dual Modes of Task Inference in Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-05T14:07:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"In a controlled synthetic setting, transformers implement in-distribution task inference via convex combinations of task vectors and out-of-distribution inference via nearly orthogonal extrapolative representations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03652","ref_index":30,"ref_count":3,"confidence":0.55,"is_internal_anchor":false,"paper_title":"AniMatrix: An Anime Video Generation Model that Thinks in Art, Not 
Physics","primary_cat":"cs.CV","submitted_at":"2026-05-05T11:36:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AniMatrix generates anime videos by structuring artistic production rules into a controllable taxonomy and training the model to prioritize those rules over physical realism, achieving top scores from professional animators on prompt understanding and artistic motion.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"coherent, temporally extended videos. In the open-source domain, models such as HunyuanVideo [ 3], Wan 2.2 [4], CogVideoX [6], Open-Sora [27], and SkyReels [8, 9] have rapidly narrowed the gap with proprietary systems such as Kling [5], Seedance [1, 7], and Vidu [28] through scaling data curation [29] and architectural improvements including 3D RoPE [30], Mixture-of-Experts [31], and efficient attention [32], with standard benchmarks such as FVD [ 33], FID [34], and VBench [35] tracking this progress. The effectiveness of this paradigm rests on a single, often unstated premise: natural video implicitly encodes a universal physical prior that diffusion models absorb automatically during training. 
Despite this progress, all of these models are trained on natural video corpora and optimize for physical"},{"citing_arxiv_id":"2604.26694","ref_index":67,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Unified 4D World Action Modeling from Video Priors with Asynchronous Denoising","primary_cat":"cs.RO","submitted_at":"2026-04-29T14:01:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"X-WAM unifies robotic action execution and 4D world synthesis by adapting video diffusion priors with a lightweight depth branch and asynchronous noise sampling, achieving 79-91% success on robot benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14268","ref_index":59,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"HY-World 2.0: A Multi-Modal World Model for Reconstructing, Generating, and Simulating 3D Worlds","primary_cat":"cs.CV","submitted_at":"2026-04-15T17:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"HY-World 2.0 generates and reconstructs high-fidelity navigable 3D Gaussian Splatting worlds from text, images, or videos via upgraded panorama, planning, expansion, and composition modules, with released code claiming open-source SOTA performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11804","ref_index":52,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"OmniShow: Unifying Multimodal Conditions for Human-Object Interaction Video Generation","primary_cat":"cs.CV","submitted_at":"2026-04-13T17:59:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniShow unifies text, image, audio, and pose conditions into an 
end-to-end model for high-quality human-object interaction video generation and introduces the HOIVG-Bench benchmark, claiming state-of-the-art results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10098","ref_index":179,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Attention Sink in Transformers: A Survey on Utilization, Interpretation, and Mitigation","primary_cat":"cs.LG","submitted_at":"2026-04-11T08:41:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The first survey on Attention Sink in Transformers structures the literature around fundamental utilization, mechanistic interpretation, and strategic mitigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07823","ref_index":60,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"LPM 1.0: Video-based Character Performance Model","primary_cat":"cs.CV","submitted_at":"2026-04-09T05:26:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LPM 1.0 generates infinite-length, identity-stable, real-time audio-visual conversational performances for single characters using a distilled causal diffusion transformer and a new benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03044","ref_index":11,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"JoyAI-LLM Flash: Advancing Mid-Scale LLMs with Token Efficiency","primary_cat":"cs.CL","submitted_at":"2026-04-03T13:52:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"JoyAI-LLM Flash delivers a 48B MoE LLM with 2.7B active parameters per token via FiberPO RL 
and dense multi-token prediction, released with checkpoints on Hugging Face.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"As summarized in Table 1, JoyAI-LLM Flash is a Mixture-of-Experts (MoE) model with 48.9B total parameters, of which 3.28B are activated per token. Its micro-architecture draws inspiration from DeepSeek-V3 [8] and Kimi-K2 [9], utilizing Multi-head Latent Attention (MLA) [3] with hidden dimensions of 2048 and 768, respectively. The model incorporates standard components such as RMSNorm [10] for layer normalization, RoPE [11] for positional encoding, and SwiGLU [12] activation within the feed-forward blocks. In terms of macro-architecture, JoyAI-LLM Flash consists of 40 Transformer layers. The first layer utilizes a standard dense feed-forward network, while the remaining 39 layers are sparse MoE layers. The MoE module employs a fine-grained architecture with 256 total experts."},{"citing_arxiv_id":"2604.01178","ref_index":18,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Screening Is Enough","primary_cat":"cs.LG","submitted_at":"2026-04-01T17:29:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Multiscreen replaces softmax attention with screening to provide absolute query-key relevance, resulting in models with 30% fewer parameters that maintain stable performance at long contexts.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"the influence of vector norms on these similarities, so that relevance depends only on directional alignment between queries and keys. Normalizing values prevents unusually large value norms from dominating the aggregation, thereby eliminating value-norm effects highlighted in prior analyses [32, 33]. 
Minimal positional encoding. To incorporate positional information, we introduce minimal positional encoding (MiPE), a RoPE-like rotation [18] applied only to the first two coordinates of queries and keys, and activated only when the learned screening window is sufficiently small, where the rotation angle is adaptively controlled by the learned window parameter w. For a vector $z_i \\in \\mathbb{R}^{1 \\times d_K}$ at position $i$, MiPE is defined as $\\tilde{z}_i = z_i M_i(w)$ (9), where $M_i(w) = \\begin{pmatrix} R(\\phi(i, w)) & 0 \\\\ 0 & I_{d_K-2} \\end{pmatrix}$, $R(\\phi) = ($"},{"citing_arxiv_id":"2603.17771","ref_index":35,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Attention Sinks Induce Gradient Sinks: Massive Activations as Gradient Regulators in Transformers","primary_cat":"cs.LG","submitted_at":"2026-03-18T14:31:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Attention sinks induce gradient sinks under causal masking, with massive activations serving as adaptive RMSNorm regulators that attenuate localized gradient pressure in Transformer training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.23516","ref_index":36,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MSA: Memory Sparse Attention for Efficient End-to-End Memory Model Scaling to 100M Tokens","primary_cat":"cs.CL","submitted_at":"2026-03-06T02:29:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MSA is an end-to-end trainable memory model using sparse attention and document-wise RoPE that scales to 100M tokens with linear complexity and less than 9% degradation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.04791","ref_index":50,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Timer-S1: A 
Billion-Scale Time Series Foundation Model with Serial Scaling","primary_cat":"cs.AI","submitted_at":"2026-03-05T04:13:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Timer-S1 is a released 8.3B-parameter MoE time series model that achieves state-of-the-art MASE and CRPS scores on GIFT-Eval using serial scaling and Serial-Token Prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.13933","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"HyMem: Hybrid Memory Architecture with Dynamic Retrieval Scheduling","primary_cat":"cs.AI","submitted_at":"2026-02-15T00:06:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HyMem introduces dual-granular memory storage with a lightweight summary module for fast responses and selective activation of a deep LLM module for complex queries, outperforming full-context baselines by 92.6% lower computational cost on LOCOMO and LongMemEval benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.00520","ref_index":33,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"NEST: Nested Event Stream Transformer for Sequences of Multisets","primary_cat":"cs.LG","submitted_at":"2026-01-31T05:21:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"NEST is a nested transformer for sequences of multisets that uses masked set modeling to learn improved set-level representations from hierarchical event streams like 
EHRs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.14004","ref_index":284,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Locate, Steer, and Improve: A Practical Survey of Actionable Mechanistic Interpretability in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-01-20T14:23:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The survey organizes mechanistic interpretability techniques into a Locate-Steer-Improve framework to enable actionable improvements in LLM alignment, capability, and efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.05527","ref_index":45,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DeMa: Dual-Path Delay-Aware Mamba for Efficient Multivariate Time Series Analysis","primary_cat":"cs.LG","submitted_at":"2026-01-09T04:54:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeMa is a dual-path delay-aware Mamba architecture that decomposes MTS into intra-series temporal and inter-series variate paths to achieve SOTA performance with linear complexity on forecasting, imputation, anomaly detection, and classification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.00956","ref_index":32,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"RefTon: Reference person shot assist virtual Try-on","primary_cat":"cs.CV","submitted_at":"2025-11-02T14:32:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RefTon is a flux-based virtual try-on method that uses unpaired reference images of the target garment on different 
people to guide texture and detail preservation in a streamlined person-to-person pipeline without body parsing or masks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.26583","ref_index":85,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Emu3.5: Native Multimodal Models are World Learners","primary_cat":"cs.CV","submitted_at":"2025-10-30T15:11:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Emu3.5 is a native multimodal world model pre-trained on over 10 trillion vision-language tokens with next-token prediction, post-trained via reinforcement learning, and accelerated by Discrete Diffusion Adaptation for efficient interleaved generation and world exploration.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"The attention mechanism employs 64 heads with 8 dedicated key-value heads, adopting Grouped Query Attention (GQA) [2] to improve efficiency. RMSNorm [123] with pre-normalization is used to stabilize training. We introduce QK-Norm [23] to the query and key projections to enhance attention stability. SwiGLU [78] is used as the activation function, and rotary positional embeddings (RoPE) [85] are employed. Overall, the model contains 34.1 billion(B) parameters, including 31.2 B in the transformer layers and 2.9 B in the embedding layers. The total vocabulary size is 282,926, consisting of 151,854 text tokens and 131,072 vision tokens. The text vocabulary directly reuses QwenTokenizer1, ensuring robust multilingual text coverage. 
The visual vocabulary is learned from"},{"citing_arxiv_id":"2510.18830","ref_index":29,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MTraining: Distributed Dynamic Sparse Attention for Efficient Ultra-Long Context Training","primary_cat":"cs.CL","submitted_at":"2025-10-21T17:25:32+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MTraining scales LLM training to 512K-token contexts on 32 A100 GPUs by integrating dynamic sparse training patterns with balanced and hierarchical sparse ring attention, achieving up to 6x throughput gains without accuracy loss on long-context benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.23951","ref_index":39,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"HunyuanImage 3.0 Technical Report","primary_cat":"cs.CV","submitted_at":"2025-09-28T16:14:10+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.22186","ref_index":39,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MinerU2.5: A Decoupled Vision-Language Model for Efficient High-Resolution Document Parsing","primary_cat":"cs.CV","submitted_at":"2025-09-26T10:45:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MinerU2.5 uses a two-stage decoupled vision-language architecture to achieve state-of-the-art document parsing accuracy with lower computational overhead than existing general and domain-specific 
models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.02259","ref_index":11,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MemAgent: Reshaping Long-Context LLM with Multi-Conv RL-based Memory Agent","primary_cat":"cs.CL","submitted_at":"2025-07-03T03:11:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MemAgent uses multi-conversation RL to train a memory agent that reads text in segments and overwrites memory, extrapolating from 8K training to 3.5M token QA with under 5% loss and 95%+ on 512K RULER.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.21996","ref_index":58,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"VRAG: Learning World Models for Interactive Video Generation","primary_cat":"cs.CV","submitted_at":"2025-05-28T05:55:44+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.16416","ref_index":19,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Circle-RoPE: Cone-like Decoupled Rotary Positional Embedding for Large Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2025-05-22T09:05:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Circle-RoPE achieves cross-modal positional disentanglement in VLMs by mapping 2D image tokens to a cone-like annulus orthogonal to the text axis, with PTD=0 eliminating RoPE geometric bias while preserving intra-image structure via alternating geometry 
encoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}