{"total":76,"items":[{"citing_arxiv_id":"2605.22939","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learnability-Informed Fine-Tuning of Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-21T18:16:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LIFT is a learnability-informed SFT algorithm for diffusion LMs that aligns token difficulty with diffusion time steps, yielding up to 3x gains on AIME'24 and AIME'25 over standard SFT baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22373","ref_index":28,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Boundary-targeted Membership Inference Attacks on Safety Classifiers","primary_cat":"cs.LG","submitted_at":"2026-05-21T12:05:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A boundary-targeted MIA strategy recovers 19% of distress-flagged conversations from a safety classifier at 5% false-positive rate, 3.5 times better than prior methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22263","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tailoring Teaching to Aptitude: Direction-Adaptive Self-Distillation for LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T10:07:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASD improves math reasoning in LLMs by adaptively directing self-distillation based on per-token entropy to balance exploration and step accuracy, outperforming prior self-distillation and RLVR baselines on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21468","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"You Only Need Minimal RLVR Training: Extrapolating LLMs via Rank-1 Trajectories","primary_cat":"cs.LG","submitted_at":"2026-05-20T17:53:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RELEX extrapolates LLM checkpoints from short RLVR prefixes by projecting deltas onto a rank-1 subspace and fitting a linear trend, matching full training performance at 15% of the steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21467","ref_index":65,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DelTA: Discriminative Token Credit Assignment for Reinforcement Learning from Verifiable Rewards","primary_cat":"cs.LG","submitted_at":"2026-05-20T17:53:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DelTA estimates token coefficients to amplify discriminative directions in token-gradient vectors, reweighting the RLVR surrogate to produce more contrastive side-wise centroids and yielding 3.26 and 2.62 point gains on math benchmarks for 8B and 14B Qwen3 models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20613","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HRM-Text: Efficient Pretraining Beyond Scaling","primary_cat":"cs.CL","submitted_at":"2026-05-20T01:59:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A 1B-parameter hierarchical recurrent model pretrained on 40B instruction-response tokens achieves 60.7% MMLU and strong results on ARC-C, DROP, GSM8K, and MATH while using 100-900x fewer tokens than standard baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20382","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do as I Say, Not as I Do: Instruction-Induction Conflict in LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-19T18:32:20+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Experiments reveal that LLMs follow instructions at rates from 1% to 99% when opposed by hardcoded conflicting patterns, with robustness tied to output diversity and alignment with model priors rather than general capability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19815","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LP-Eval: Rubric and Dataset for Measuring the Quality of Legal Proposition Generation","primary_cat":"cs.CL","submitted_at":"2026-05-19T13:10:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LP-Eval is a new expert-co-designed rubric and annotated dataset showing that LLMs mostly produce well-formed legal propositions from EU court decisions, with higher expert-rated quality for established cases and improved LLM-as-judge alignment when using the rubric.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19377","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Evaluation Game: Beyond Static LLM Benchmarking","primary_cat":"cs.LG","submitted_at":"2026-05-19T05:22:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents a game-theoretic model with group actions for data augmentation in LLM adversarial evaluation, demonstrating local generalization from fine-tuning on three model families and redefining benchmarks as orbits under group actions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20258","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"It Takes Two: Complementary Self-Distillation for Contextual Integrity in LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-18T13:57:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SELFCI uses complementary self-distillation with two reverse KL divergences to align LLMs to contextual integrity while preserving utility, outperforming RL baselines like GRPO in agentic settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17000","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BoLT: A Benchmark to Democratize Black-box Optimization Research for Expensive LLM Tasks","primary_cat":"cs.LG","submitted_at":"2026-05-16T13:53:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"BoLT is a benchmark of surrogate models fitted to real LLM experiment data that enables evaluation of Bayesian and black-box optimization methods on multi-fidelity, multi-objective, high-dimensional LLM tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16928","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Full Attention Strikes Back: Transferring Full Attention into Sparse within Hundred Training Steps","primary_cat":"cs.CL","submitted_at":"2026-05-16T10:51:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RTPurbo exploits intrinsic sparsity in full-attention LLMs to achieve near-lossless sparse inference after only a few hundred training steps via retrieval-head identification and a lightweight token indexer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15726","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Nudging Beyond the Comfort Zone: Efficient Strategy-Guided Exploration for RLVR","primary_cat":"cs.AI","submitted_at":"2026-05-15T08:22:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NudgeRL conditions RLVR rollouts on strategy-level contexts to drive diverse trajectories and applies an inter/intra-context reward decomposition plus distillation objective, outperforming GRPO and oracle baselines on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15172","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MetaBackdoor: Exploiting Positional Encoding as a Backdoor Attack Surface in LLMs","primary_cat":"cs.CR","submitted_at":"2026-05-14T17:56:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MetaBackdoor shows that LLMs can be backdoored using positional triggers like sequence length, enabling stealthy activation on clean inputs to leak system prompts or trigger malicious behavior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15113","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning from Language Feedback via Variational Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:27:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VPD frames language feedback learning as variational EM so the teacher policy refines itself via trust-region updates on outcomes while the student learns dense token distributions on its own rollouts, outperforming fixed-teacher baselines on reasoning and code tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14258","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamics of the Transformer Residual Stream: Coupling Spectral Geometry to Network Topology","primary_cat":"cs.LG","submitted_at":"2026-05-14T01:57:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Training installs a depth-dependent spectral gradient and low-rank bottleneck in LLM residual streams whose amplification or suppression of graph communities is predicted by local operator type.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13329","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tracing Persona Vectors Through LLM Pretraining","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:44:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Persona vectors form within the first 0.22% of LLM pretraining and remain effective for steering post-trained models, with continued refinement and transfer to other models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13045","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Large Language Models Lack Temporal Awareness of Medical Knowledge","primary_cat":"cs.LG","submitted_at":"2026-05-13T06:04:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"LLMs lack temporal awareness of medical knowledge, showing gradual performance decline on up-to-date facts, much lower accuracy on historical knowledge (25-54% relative), and inconsistent year-to-year predictions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12888","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seed Bank, Co-op, Stoop Swap: Metaphors for Governing Language Model Data for Creative Writing","primary_cat":"cs.HC","submitted_at":"2026-05-13T02:04:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Workshops with over 100 creative writers produced metaphors and four themes for language model governance that favor consent-driven, smaller open models encoding community values.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"reflects the voice or values of a particular community [20, 67]. Given that small communities likely mean small datasets, future work should also ask how much and what type of data will shift model properties. Work on fine-tuning LLMs with limited data could be extended to artistic settings [14, 58]. Open-source and nonprofit models with more transparent data practices may provide useful foundations [48], but writers still need ways to understand tradeoffs between size and capabilities. Alternatives and tradeoffs.Writers also do not need to wait to start experimenting; small groups can already begin experimenting with datasets, continuing a longer history of writers building and training their own models [54]. At the same time, we acknowledge meaningful participation requires some level of understanding of"},{"citing_arxiv_id":"2605.12798","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Emergent and Subliminal Misalignment Through the Lens of Data-Mediated Transfer","primary_cat":"cs.LG","submitted_at":"2026-05-12T22:27:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Emergent and subliminal misalignment in LLMs arise from data structure interactions and transfer via benign distillation data, with stronger effects under shared functional structure and on-policy settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12726","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Before the Last Token: Diagnosing Final-Token Safety Probe Failures","primary_cat":"cs.LG","submitted_at":"2026-05-12T20:30:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Final-token probes miss distributed unsafe evidence in jailbreaks, but a PCA-HMM model on prefill trajectories recovers many misses without naive pooling's false positives.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12715","ref_index":25,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scaling Laws for Mixture Pretraining Under Data Constraints","primary_cat":"cs.LG","submitted_at":"2026-05-12T20:22:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical study shows mixture pretraining tolerates higher target data repetition than single-source training, with a new repetition-aware scaling law enabling principled mixture selection based on data size, compute, and model scale.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12705","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Early Data Exposure Improves Robustness to Subsequent Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-12T20:08:00+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Early mixing of post-training data into pretraining improves retention of acquired capabilities after subsequent fine-tuning in language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12382","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pretraining Exposure Explains Popularity Judgments in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-12T16:45:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"LLM popularity judgments align more closely with pretraining data exposure counts than with Wikipedia popularity, with stronger effects in pairwise comparisons and larger models.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"336 0.393 0.450 0.602 0.696 0.427 0.389 0.434 0.509 seven days. Approximately 100 TB of storage is used to store the raw corpora, indexed shards, Wikidata dumps, and auxiliary files. Model inference experiments are conducted on four GPU nodes, each containing four NVIDIA H100 GPUs with 94 GB of mem- ory. All experiments are implemented using PyTorch [15] and the Transformers library [27]. Two OLMo model variants are evaluated: OLMo-3-7B12 and OLMo-3.1-32B13, both trained on the indexed corpora. Other LLMs are not considered, as their pretraining data are usually not publicly available, preventing direct measurement of pretraining exposure. 5 Results and Analysis Table 1 reports Spearman's rank correlations [22] between each"},{"citing_arxiv_id":"2605.12227","ref_index":79,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Combining On-Policy Optimization and Distillation for Long-Context Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-12T15:04:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"dGRPO merges outcome-based policy optimization with dense teacher guidance from on-policy distillation, yielding more stable long-context reasoning on the new LongBlocks synthetic dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11609","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Anti-Self-Distillation for Reasoning RL via Pointwise Mutual Information","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anti-Self-Distillation reverses self-distillation signals via PMI to fix overconfidence on structural tokens, matching GRPO baseline accuracy 2-10x faster with up to 11.5 point gains across 4B-30B models.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"to its pre-collapse baselineH warm (a Schmitt trigger to avoid chatter): g←    1ifg= 0andH≥H warm, 0ifg= 1andH < τ down, gotherwise, λ=g·λ max.(7) τdown is auto-calibrated from W warmup steps at λ= 0 (concrete values in Section 4 Setup). Algorithm 1 (Appendix B) summarizes the resulting update. 4 Experiments Setup.We train five language models from the Qwen3 [ 29] and Olmo-3 [ 20] families (4B-30B parameters) on DAPO-Math-17k [32] for 200 on-policy steps, comparing four conditions per model: the un-trained base, +GRPO (Equation (2) with λ= 0 ), +SD (default self-distillation, δt = +ut), and +AntiSD (Algorithm 1). The privileged context c is a verified solution sampled from the rollout group when at least one rollout is correct, else from the dataset, concatenated with a binary"},{"citing_arxiv_id":"2605.11388","ref_index":145,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Reasoning in General Purpose Agents via Structured Meta-Cognition","primary_cat":"cs.CL","submitted_at":"2026-05-12T01:21:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DOLORES, an agent using a formal language for meta-reasoning to construct adaptive scaffolds on the fly, outperforms prior scaffolding methods by 24.8% on average across four hard benchmarks and multiple model sizes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10876","ref_index":88,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AssayBench: An Assay-Level Virtual Cell Benchmark for LLMs and Agents","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:27:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AssayBench is a new gene-ranking benchmark for phenotypic CRISPR screens that shows zero-shot generalist LLMs outperform both biology-specific LLMs and trainable baselines on adjusted nDCG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10805","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reasoning Is Not Free: Robust Adaptive Cost-Efficient Routing for LLM-as-a-Judge","primary_cat":"cs.AI","submitted_at":"2026-05-11T16:30:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RACER routes between reasoning and non-reasoning LLM judges via constrained distributionally robust optimization to achieve better accuracy-cost trade-offs under distribution shift.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10414","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Remember to Forget: Gated Adaptive Positional Encoding","primary_cat":"cs.LG","submitted_at":"2026-05-11T11:52:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GAPE augments RoPE with query- and key-dependent gates to stabilize attention and improve long-context performance in language models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"RoPE decomposes queries and keys into two-dimensional chunks, rotating each at a different frequencyg k ∈G, ranging fromg 1 = 1radian per token (highest frequency) tog d/2 ≈1/θradians per token (lowest), whereθis the base wavelength, defaulting to10,000[24]. Long-context extrapolation and the base-scaling deadlock.A natural response to RoPE's ex- trapolation failures is to scale θ. Position Interpolation [2], YaRN [19], and LongRoPE [8] remap rotary frequencies to reduce OOD phase angles at extended lengths. However, recent theoretical analyzes reveal that this exposes aninterpolation-extrapolation deadlock[13, 15, 29]: shrinking θ smooths extrapolation but harms long-range semantic discrimination, while inflating θ preserves local interpolation but devolves low-frequency channels into near-identity maps, ultimately colliding"},{"citing_arxiv_id":"2605.08472","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mid-Training with Self-Generated Data Improves Reinforcement Learning in Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-08T20:46:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mid-training LLMs on self-generated diverse reasoning paths improves subsequent RL performance on mathematical benchmarks and OOD tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"and matches or exceeds PPO-based methods, with several variants further modifying the training objective [25, 43, 1, 30]. Reinforcement Learning with Verifiable Rewards (RLVR) [26, 16, 12] has since driven substantial gains on challenging reasoning tasks. However, RL is more effective when applied to base models with strong priors [ 42, 9, 62], and several works [ 36] distill task-specific reasoning data before RL. Yuan et al. [59] further show that RL enables models to learn novel 2 compositions of atomic skills not explicitly taught during prior training. Our work differs by mid- training on diverse self-generated responses produced via Pólya-style problem-solving heuristics, and theoretically analyzing how subsequent RL combines these approaches."},{"citing_arxiv_id":"2605.07865","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"KL for a KL: On-Policy Distillation with Control Variate Baseline","primary_cat":"cs.LG","submitted_at":"2026-05-08T15:24:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"vOPD stabilizes on-policy distillation gradients by subtracting a closed-form per-token negative reverse KL baseline as a detached control variate, preserving unbiasedness while lowering variance and matching expensive full-vocabulary methods.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Rather than relying on a sparse terminal reward, OPD minimizes the reverse KL divergence between the student and the teacher via dense, token-level signals, enabling faster training [21]. Because it is on-policy and reward-driven, OPD can naturally be implemented using standard RL pipelines with a single-sample Monte Carlo estimator [ 23], and empirically matches RLVR accuracy with a fraction of the compute [ 29]. Its effectiveness has been demonstrated in industrial-level post-training such as Qwen3, GLM-5, Nemotron-Cascade2, and DeepSeek-V4 [4, 43, 44, 46]. Despite this success, OPD's optimization recipe remains underdeveloped: training is unstable in practice, and stabilization techniques are still immature relative to the recipes that drive successful"},{"citing_arxiv_id":"2605.07632","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Post-training makes large language models less human-like","primary_cat":"cs.CL","submitted_at":"2026-05-08T11:59:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Post-training reduces LLMs' behavioral alignment with humans across families and sizes, with the misalignment increasing in newer generations while persona induction fails to improve individual-level predictions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Rogers, M. Strohmaier, The prompt makes the person (a): A systematic evaluation of sociodemographic persona prompting for large language models. arXiv preprint arXiv:2507.16076(2025). 16 34. V. Cheung, M. Maier, F. Lieder, Large language models show amplified cognitive biases in moral decision-making.Proceedings of the National Academy of Sciences122(25), e2412015122 (2025). 35. E. Shapira, M. Tennenholtz, R. Reichart, Alignment Makes Language Models Normative, Not Descriptive.arXiv preprint arXiv:2603.17218(2026). 36. A. Reinhart,et al., Do LLMs write like humans? Variation in grammatical and rhetorical styles. Proceedings of the National Academy of Sciences122(8), e2422455122 (2025). 37. T. Kuribayashi, Y."},{"citing_arxiv_id":"2605.07315","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LaTER: Efficient Test-Time Reasoning via Latent Exploration and Explicit Verification","primary_cat":"cs.CL","submitted_at":"2026-05-08T06:23:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LaTER reduces LLM token usage 16-33% on reasoning benchmarks by exploring in latent space then switching to explicit CoT verification, with gains like 70% to 73.3% on AIME 2025 in the training-free version.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"We compare standard discrete CoT decoding with training-free LaTER under the same prompts and decoding settings. We report accuracy and total token usage. For LaTER, token usage counts both latent steps and emitted explicit tokens, so reductions are not an artifact of ignoring latent computation. We evaluate Qwen3-14B, DeepSeek-R1-Distill-Llama-8B, and OLMo3-32B-Think on AIME 2025 [15], MATH-500 [16], GSM8K [17], GPQA [18], ARC-Challenge [19], HumanEval+, and MBPP+ [20, 21]. 2.4 Fixed-steps switching results For Qwen3-14B, we follow the official decoding recommendations: temperature= 0.6, top-p= 0.95 , top-k= 20 , and max_new_tokens= 38192. Under this setup, the standard discrete CoT baseline reaches 70.0% accuracy on AIME 2025 with roughly 16K tokens on average."},{"citing_arxiv_id":"2605.07307","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Dense Sequential Chains: Reasoning Language Models Can Extract Answers from Sparse, Order-Shuffling Chain-of-Thoughts","primary_cat":"cs.CL","submitted_at":"2026-05-08T06:15:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Reasoning language models extract answers from sparse, order-shuffled chain-of-thought traces with little accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06901","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reflections and New Directions for Human-Centered Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-07T20:02:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Model developers must address human concerns, preferences, values, and goals with rigor at every stage of the LLM pipeline rather than only in post-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06327","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Measuring Evaluation-Context Divergence in Open-Weight LLMs: A Paired-Prompt Protocol with Pilot Evidence of Alignment-Pipeline-Specific Heterogeneity","primary_cat":"cs.CL","submitted_at":"2026-05-07T14:23:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new paired-prompt protocol reveals alignment-pipeline-specific heterogeneity in how open-weight LLMs respond to evaluation versus deployment framings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05365","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ZAYA1-8B Technical Report","primary_cat":"cs.AI","submitted_at":"2026-05-06T18:44:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZAYA1-8B is a reasoning MoE model with 700M active parameters that matches larger models on math and coding benchmarks and reaches 91.9% on AIME'25 via Markovian RSA test-time compute.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"ZAY A1-8B is shown at 0.7B active parameters and compared against larger open-weight and frontier models where available. Bubble area denotes total parameter count where available. II. MODEL A. Architecture ZAY A1-8B uses an MoE architecture with three changes relative to contemporary MoE models: (1) CCA for the atten- tion block, (2) the ZAY A1 router, and (3) residual scaling. In our ablations, these changes improve per-parameter perplexity relative to classical MoE architectures (Shazeer et al., 2016; Fedus et al., 2022) using MLA or GQA attention and a linear router (Dai et al., 2024). CCA also improves training speed relative to GQA and MLA and reduces prefill FLOPs while maintaining comparable KV-cache compression rates."},{"citing_arxiv_id":"2605.05197","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Implicit Representations of Grammaticality in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-06T17:57:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Linear probes on LM hidden states detect grammaticality better than string probabilities, generalize to human benchmarks and other languages, and correlate weakly with likelihood.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01640","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prescriptive Scaling Laws for Data Constrained Training","primary_cat":"cs.LG","submitted_at":"2026-05-02T23:14:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A one-parameter scaling law models excess loss from data repetition as an additive overfitting penalty, recommending model capacity increases over excessive repetition and showing that strong weight decay reduces the penalty coefficient by ~70%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01158","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Hidden Cost of Thinking: Energy Use and Environmental Impact of LMs Beyond Pretraining","primary_cat":"cs.CY","submitted_at":"2026-05-01T23:24:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Full development of 7B and 32B Olmo 3 models used 12.3 GWh datacenter energy and emitted 4,251 tCO2eq, with development overheads accounting for 82% of compute and reasoning models costing 17x more to post-train than instruction-tuned ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00817","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When LLMs Stop Following Steps: A Diagnostic Study of Procedural Execution in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-01T17:55:47+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27251","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Compliance versus Sensibility: On the Reasoning Controllability in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-29T22:55:40+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"3 to determine whether the responses are compliant. In order to make sure that only the reasoning compliance is contrasted, we marginalize the effects of final answer correctness and instructed type by only pairing those responses with the same correctness and instructed type. 4 Experimental Setup 4.1 Models We use three family of open LLMs: OLMO3.1-32B-IT, OLMO3-7B-IT(Olmo, 2025), LLAMA3.3-70B-IT, LLAMA3.1-8B-IT(AI@Meta, 2024), QWEN3-32B, and QWEN3-8B. We also employ two frontier proprietary models: GPT-5.1 (OpenAI, 2025) and GEMINI3-FLASH (Gemini, 2025). 4.2 Data We evaluate the LLMs across four datasets specifically selected to represent the three fundamental reasoning types: deduction, induction, and abduction. FOLIO(Deduction): A first-order logic dataset containing textual premises and claims"},{"citing_arxiv_id":"2604.25872","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Errors Can Be Beneficial: A Categorization of Imperfect Rewards for Policy Gradient","primary_cat":"cs.LG","submitted_at":"2026-04-28T17:10:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Certain errors in proxy rewards for policy gradient methods can be benign or beneficial by preventing policies from stalling on outputs with mediocre ground truth rewards, enabling improved RLHF metrics and reward design insights.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Based on our categorization of reward errors, we develop ranking accuracy variants that account for the harmfulness of an incorrect ranking to policy gradient optimization. Experiments show that these harm-aware metrics typically correlate better with the performance of a language model after RLHF, across datasets and model families (Llama [24], OLMo [57], and Qwen [95]). Yet, despite these gains, the correlation can still be weak, showcasing challenges in robustly evaluating reward models. Beyond reward model evaluation, we explore implications of our theory for reward design in settings with verifiable rule-based rewards (Section 5). Namely, we demonstrate that rewarding partially correct outputs (cf."},{"citing_arxiv_id":"2604.22709","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking Without Words: Efficient Latent Reasoning with Abstract Chain-of-Thought","primary_cat":"cs.CL","submitted_at":"2026-04-24T16:45:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Abstract-CoT lets models reason with short discrete latent token sequences from a reserved vocabulary, using warm-up training and RL to match verbal CoT performance with up to 11.6x fewer tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22167","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Estimating Tail Risks in Language Model Output Distributions","primary_cat":"cs.LG","submitted_at":"2026-04-24T02:30:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Importance sampling with unsafe model variants estimates tail probabilities of harmful language model outputs using 10-20x fewer samples than brute-force Monte Carlo.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"There are several possible definitions of risk over unseen queries that can be estimated using query-level bQRISK (ci). We consider the following measure of risk: If we seennew queries at deployment, what is the probability that there will be a worst-query probability above a thresholdτ? We can define this formally as: PDquery \u0014 max 1≤i≤n QRISK (ci)> τ \u0015 .(15) Note that: PDquery \u0014 max 1≤i≤n QRISK (ci)> τ \u0015 = 1−P Dquery \u0014 max 1≤i≤n QRISK (ci)≤τ \u0015 ,reversing the inequality (16) = 1−P Dquery [∀i≤n,Q RISK (ci)≤τ]as allQ RISK (ci)are bounded their max (17) = 1− nY i=1 PDquery [QRISK (ci)≤τ],asc i are sampled independently (18) = 1− \u0002 PDquery [QRISK (ci)≤τ] \u0003n ,asc i are identically distributed (19) where the last termP Dquery [QRISK (ci)≤τ]is the cumulative density function of the random variableQ RISK (ci)where"},{"citing_arxiv_id":"2604.21637","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multilinguality at the Edge: Developing Language Models for the Global South","primary_cat":"cs.CL","submitted_at":"2026-04-23T12:53:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey of 232 papers on the intersection of multilingual language modeling and edge deployment identifies the 'last mile' challenge for Global South communities and offers recommendations for more inclusive NLP.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21254","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hyperloop Transformers","primary_cat":"cs.LG","submitted_at":"2026-04-23T03:46:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Hyperloop Transformers outperform standard and mHC Transformers with roughly 50% fewer parameters by looping a middle block of layers and applying hyper-connections only after each loop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19295","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TEMPO: Scaling Test-time Training for Large Reasoning Models","primary_cat":"cs.LG","submitted_at":"2026-04-21T10:01:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TEMPO scales test-time training for large reasoning models by interleaving policy refinement on unlabeled data with critic recalibration on labeled data via an EM formulation, yielding large gains on AIME tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18473","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Train Separately, Merge Together: Modular Post-Training with Mixture-of-Experts","primary_cat":"cs.LG","submitted_at":"2026-04-20T16:24:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BAR trains independent domain experts via separate mid-training, SFT, and RL pipelines then composes them with a MoE router to match monolithic retraining performance at lower cost and without catastrophic forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}