{"total":19,"items":[{"citing_arxiv_id":"2605.22679","ref_index":26,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Conceptualizing Embeddings: Sparse Disentanglement for Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-21T16:23:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CEDAR learns an invertible rotation of vision-language embeddings to concentrate semantics into sparse, axis-aligned coordinates for improved interpretability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13026","ref_index":55,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Understanding and Accelerating the Training of Masked Diffusion Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-13T05:29:38+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Bell-shaped time sampling accelerates masked diffusion language model training by roughly 4x on LM1B by countering locality bias in language data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16509","ref_index":19,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Learning-Based Sparsification of Dynamic Graphs in Robotic Exploration Algorithms","primary_cat":"cs.RO","submitted_at":"2026-04-15T03:39:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A PPO-trained transformer policy sparsifies dynamic graphs during RRT frontier exploration, cutting size by up to 96% and yielding the most consistent exploration rates across environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12946","ref_index":63,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Parcae: Scaling Laws For Stable Looped Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-14T16:43:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Parcae stabilizes looped LLMs via spectral norm constraints on injection parameters, enabling power-law scaling for training FLOPs and saturating exponential scaling at test time that improves quality over fixed-depth baselines under fixed parameter budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06427","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"The Depth Ceiling: On the Limits of Large Language Models in Discovering Latent Planning","primary_cat":"cs.LG","submitted_at":"2026-04-07T20:04:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs discover latent planning strategies up to five steps during training and execute them up to eight steps at test time, with larger models reaching seven under few-shot prompting, revealing a dissociation between discovery and execution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.17771","ref_index":32,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Attention Sinks Induce Gradient Sinks: Massive Activations as Gradient Regulators in Transformers","primary_cat":"cs.LG","submitted_at":"2026-03-18T14:31:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Attention sinks induce gradient sinks under causal masking, with massive activations serving as adaptive RMSNorm regulators that attenuate localized gradient pressure in Transformer training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.07529","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Obliviator Reveals the Cost of Nonlinear Guardedness in Concept Erasure","primary_cat":"cs.LG","submitted_at":"2026-03-08T08:35:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Obliviator introduces an iterative kernel-based optimization for nonlinear concept erasure that quantifies the utility cost of guarding against nonlinear adversaries and outperforms prior methods on trade-off curves.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.05387","ref_index":22,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"The Generalization Ridge: Information Flow in Natural Language Generation","primary_cat":"cs.CL","submitted_at":"2025-07-07T18:18:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InfoRidge reveals a non-monotonic pattern in which predictive mutual information between hidden states and outputs peaks in intermediate layers before declining in final layers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.09747","ref_index":55,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","primary_cat":"cs.RO","submitted_at":"2025-01-16T18:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FAST applies discrete cosine transform to robot action sequences for efficient tokenization, enabling autoregressive VLAs to succeed on high-frequency dexterous tasks and scale to 10k hours of data while matching diffusion VLA performance with up to 5x faster training.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Internet-scale image-text corpora [10, 39, 63, 7, 65]. How- ever, these models require choosing a tokenization of the continuous action signal, which determines how the discrete symbols predicted by the model map to continuous robot actions [64, 34, 41, 12]. It is widely known that a good choice of tokenization can be critical to the performance of sequence models [55, 57]. Prior robotic policies of this sort typically use na ¨ıve tokenization strategies based on a per-dimension, per-timestep binning scheme [9, 10, 39]. We find that such methods perform poorly when learning dexterous skills with high-frequency control (see Figure 2, right). We observe that correlations between time steps are a major challenge for"},{"citing_arxiv_id":"2412.12636","ref_index":30,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"TrainMover: An Interruption-Resilient Runtime for ML Training","primary_cat":"cs.DC","submitted_at":"2024-12-17T07:59:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TrainMover achieves ~20s downtime for interruptions in 1024-GPU LLM training via two-phase delta-based communication setup, communication-free sandboxed warmup, and general standby design, projecting 55% reduction in wasted GPU hours.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.17891","ref_index":166,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Scaling Diffusion Language Models via Adaptation from Autoregressive Models","primary_cat":"cs.CL","submitted_at":"2024-10-23T14:04:22+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Adapting autoregressive models via continual pre-training yields diffusion language models from 127M to 7B parameters that outperform prior diffusion models and compete with their autoregressive counterparts on language, reasoning, and commonsense benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2407.21787","ref_index":50,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Large Language Monkeys: Scaling Inference Compute with Repeated Sampling","primary_cat":"cs.LG","submitted_at":"2024-07-31T17:57:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Repeated sampling scales problem coverage log-linearly with sample count, improving SWE-bench Lite performance from 15.9% to 56% using 250 samples.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, and Gabriel Synnaeve. Code llama: Open foundation models for code, 2023. URL https://arxiv.org/abs/2308. 12950. [49] Rulin Shao, Jacqueline He, Akari Asai, Weijia Shi, Tim Dettmers, Sewon Min, Luke Zettlemoyer, and Pang Wei Koh. Scaling retrieval-based language models with a trillion-token datastore, 2024. URL https://arxiv.org/abs/2407.12854. [50] David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore Graepel, Timothy Lillicrap, Karen Simonyan, and Demis Hassabis. Mastering chess and shogi by self-play with a general reinforcement learning algorithm, 2017. [51] Yifan Song, Guoyin Wang, Sujian Li, and Bill Yuchen Lin."},{"citing_arxiv_id":"2407.16216","ref_index":67,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Reinforcement Learning for LLM Post-Training: A Survey","primary_cat":"cs.CL","submitted_at":"2024-07-23T06:45:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey deriving a unified policy gradient framework for LLM post-training methods and providing technical comparisons of PPO, GRPO, DPO variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.17557","ref_index":20,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale","primary_cat":"cs.CL","submitted_at":"2024-06-25T13:50:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FineWeb is a curated 15T-token web dataset that produces stronger LLMs than prior open collections, while its educational subset sharply improves performance on MMLU and ARC benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"subset that uses fastText for language classification, heuristic rules from MassiveText and C4 for quality filtering, rules- and classifier-based toxicity filtering, and URL, document and paragraph-level deduplication using a Bloom filter. Apart from public datasets, the technical reports accompanying the announcement of closed LLMs occasionally discuss pretraining datasets. WebText[20] (used to train GPT-2) involves only those non-Wikipedia webpages that were linked to from Reddit posts with at least 3 karma, with text extracted using Dragnet [39] and Newspaper1 [40] and an unspecified deduplication pipeline. GPT- 3's Dataset[3] includes content from Common Crawl that has been filtered using a classifier trained on WebText, Wikipedia, and Books, and deduplicated using MinHash."},{"citing_arxiv_id":"2309.14509","ref_index":31,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models","primary_cat":"cs.LG","submitted_at":"2023-09-25T20:15:57+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeepSpeed-Ulysses keeps communication volume constant for sequence-parallel attention when sequence length and device count scale together, delivering 2.5x faster training on 4x longer sequences than prior SOTA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2304.10573","ref_index":40,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"IDQL: Implicit Q-Learning as an Actor-Critic Method with Diffusion Policies","primary_cat":"cs.LG","submitted_at":"2023-04-20T18:04:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"IDQL generalizes IQL into an actor-critic framework and uses diffusion policies for robust policy extraction, outperforming prior offline RL methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Diffusion models have also been used in behavioral cloning and ofﬂine RL. Florence et al. [10] and Pearce et al. [36] use energy-based models and diffusion models, respectively, for behavioral cloning. Janner et al. [22] and Ajay et al. [1] use diffusion to directly model and sample the trajectory space; samples are guided with gradient guidance or reward conditioning. Reuss et al. [40] uses diffusion policies for goal-conditioned imitation learning. Closest to our work are prior methods that represent the actor with a diffusion model in ofﬂine RL. Diffusion Q-learning (DQL) [44] incorporates diffusion to parameterize the actor in a TD3+BC-style algorithm [12]. Select from Behavior Candidates (SfBC) [ 7] uses importance reweighting from a"},{"citing_arxiv_id":"2303.11156","ref_index":42,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Can AI-Generated Text be Reliably Detected?","primary_cat":"cs.CL","submitted_at":"2023-03-17T17:53:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Recursive paraphrasing attacks substantially lower detection rates for multiple AI text detectors with only minor quality loss, while a theoretical analysis ties best-case AUROC to total variation distance between human and AI distributions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.00593","ref_index":18,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Interpretability in the Wild: a Circuit for Indirect Object Identification in GPT-2 small","primary_cat":"cs.LG","submitted_at":"2022-11-01T17:08:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"GPT-2 small solves indirect object identification via a circuit of 26 attention heads organized into seven functional classes discovered through causal interventions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2201.11990","ref_index":52,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model","primary_cat":"cs.CL","submitted_at":"2022-01-28T08:59:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Trained the largest monolithic 530B-parameter transformer language model to date and reported new state-of-the-art zero- and few-shot results on multiple NLP benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}