{"total":20,"items":[{"citing_arxiv_id":"2605.11007","ref_index":148,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RT-Transformer: The Transformer Block as a Spherical State Estimator","primary_cat":"cs.LG","submitted_at":"2026-05-10T08:14:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Transformer components arise as the natural solution to precision-weighted directional state estimation on the hypersphere.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04901","ref_index":47,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"On the (In-)Security of the Shuffling Defense in the Transformer Secure Inference","primary_cat":"cs.CR","submitted_at":"2026-05-06T13:31:15+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An attack aligns differently shuffled intermediate activations from secure Transformer inference queries to recover model weights with low error using roughly one dollar of queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03413","ref_index":215,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning to Theorize the World from Observation","primary_cat":"cs.LG","submitted_at":"2026-05-05T06:39:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NEO induces compositional latent programs as world theories from observations and executes them to enable explanation-driven generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01517","ref_index":245,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"VAnim: Rendering-Aware Sparse State Modeling for Structure-Preserving Vector Animation","primary_cat":"cs.CV","submitted_at":"2026-05-02T16:10:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VAnim creates open-domain text-to-SVG animations via sparse state updates on a persistent DOM tree, identification-first planning, and rendering-aware RL with a new 134k-example benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00604","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Affinity Is Not Enough: Recovering the Free Energy Principle in Mixture-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-05-01T12:18:55+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Adding temporal memory via LIF, precision-weighted gating, and anticipatory prediction to MoE routers recovers effective expert selection at distribution transitions, with ablation confirming a super-additive beta-ant interaction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21215","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Recurrent Transformer: Greater Effective Depth and Efficient Decoding","primary_cat":"cs.LG","submitted_at":"2026-04-23T02:12:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Recurrent Transformers add per-layer recurrent memory via self-attention on own activations plus a tiling algorithm that reduces training memory traffic, yielding better C4 pretraining cross-entropy than parameter-matched standard transformers with fewer layers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17384","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Towards a Data-Parameter Correspondence for LLMs: A Preliminary Discussion","primary_cat":"cs.LG","submitted_at":"2026-04-19T11:18:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A data-parameter correspondence unifies data-centric and parameter-centric LLM optimizations as dual geometric operations on the statistical manifold via Fisher-Rao metric and Legendre duality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16509","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Learning-Based Sparsification of Dynamic Graphs in Robotic Exploration Algorithms","primary_cat":"cs.RO","submitted_at":"2026-04-15T03:39:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A PPO-trained transformer policy sparsifies dynamic graphs during RRT frontier exploration, cutting size by up to 96% and yielding the most consistent exploration rates across environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.03263","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LPC-SM: Local Predictive Coding and Sparse Memory for Long-Context Language Modeling","primary_cat":"cs.CL","submitted_at":"2026-03-12T21:21:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LPC-SM is a hybrid architecture separating local attention, persistent memory, predictive correction, and control with ONT for memory writes, showing loss reductions on 158M-parameter models up to 4096-token contexts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2405.04434","ref_index":160,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","primary_cat":"cs.CL","submitted_at":"2024-05-07T15:56:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepSeek-V2 delivers top-tier open-source LLM performance using only 21B active parameters by compressing the KV cache 93.3% and cutting training costs 42.5% via MLA and DeepSeekMoE.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.18416","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Capabilities of Gemini Models in Medicine","primary_cat":"cs.AI","submitted_at":"2024-04-29T04:11:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Med-Gemini sets new records on 10 of 14 medical benchmarks including 91.1% on MedQA-USMLE, beats GPT-4V by 44.5% on multimodal tasks, and surpasses humans on medical text summarization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.02954","ref_index":162,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","primary_cat":"cs.CL","submitted_at":"2024-01-05T18:59:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DeepSeek LLM 67B exceeds LLaMA-2 70B on code, mathematics and reasoning benchmarks after pre-training on 2 trillion tokens and alignment via SFT and DPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.08560","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MemGPT: Towards LLMs as Operating Systems","primary_cat":"cs.AI","submitted_at":"2023-10-12T17:51:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MemGPT uses OS-inspired virtual context management to extend LLM context windows for large document analysis and long-term multi-session chat.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.16797","ref_index":85,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Promptbreeder: Self-Referential Self-Improvement Via Prompt Evolution","primary_cat":"cs.CL","submitted_at":"2023-09-28T19:01:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Promptbreeder evolves both task prompts and the mutation prompts that improve them using LLMs, outperforming Chain-of-Thought and Plan-and-Solve on arithmetic and commonsense reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.16199","ref_index":103,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention","primary_cat":"cs.CV","submitted_at":"2023-03-28T17:59:12+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLaMA-Adapter turns frozen LLaMA 7B into a capable instruction follower using only 1.2M new parameters and zero-init attention, matching Alpaca while extending to image-conditioned reasoning on ScienceQA and COCO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2205.01068","ref_index":119,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OPT: Open Pre-trained Transformer Language Models","primary_cat":"cs.CL","submitted_at":"2022-05-02T17:49:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPT releases open decoder-only transformers up to 175B parameters that match GPT-3 performance at one-seventh the carbon cost, along with code and training logs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2101.00027","ref_index":141,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Pile: An 800GB Dataset of Diverse Text for Language Modeling","primary_cat":"cs.CL","submitted_at":"2020-12-31T19:00:10+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"The Pile is a newly constructed 825 GiB dataset from 22 diverse sources that enables language models to achieve better performance on academic, professional, and cross-domain tasks than models trained on Common Crawl variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1910.03771","ref_index":151,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HuggingFace's Transformers: State-of-the-art Natural Language Processing","primary_cat":"cs.CL","submitted_at":"2019-10-09T03:23:22+00:00","verdict":"ACCEPT","verdict_confidence":"HIGH","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hugging Face releases an open-source Python library that supplies a unified API and pretrained weights for major Transformer architectures used in natural language processing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1909.11942","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations","primary_cat":"cs.CL","submitted_at":"2019-09-26T07:06:13+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ALBERT reduces BERT parameters via embedding factorization and layer sharing, adds inter-sentence coherence pretraining, and reaches SOTA on GLUE, RACE, and SQuAD with fewer parameters than BERT-large.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"1909.08053","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","primary_cat":"cs.CL","submitted_at":"2019-09-17T19:42:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Intra-layer model parallelism in PyTorch enables training of 8.3B-parameter transformers, achieving SOTA perplexity of 10.8 on WikiText103 and 66.5% accuracy on LAMBADA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}