{"total":14,"items":[{"citing_arxiv_id":"2606.12273","ref_index":53,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond Fully Random Masking: Attention-Guided Denoising and Optimization for Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-10T16:14:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AGDO improves dLLM reasoning performance by determining denoising order and emphasizing tokens based on attention-derived dependencies rather than random masking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11470","ref_index":149,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Periodic Table of LLM Reasoning: A Structured Survey of Reasoning Paradigms, Methods, and Failure Modes","primary_cat":"cs.CL","submitted_at":"2026-06-09T21:59:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A literature survey that introduces a taxonomy for LLM reasoning paradigms, analyzes methodological trends, and synthesizes failure modes from over 300 papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09926","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Sample Where You Struggle: Sharpening Base Model Reasoning via Entropy-Guided Power Sampling","primary_cat":"cs.LG","submitted_at":"2026-06-07T14:06:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EGPS localizes MCMC moves to high-entropy decision points using forward-pass entropy, yielding up to 12.6× wall-clock speedup and best-or-tied accuracy on MATH500, HumanEval, and GPQA for Qwen2.5-Math-7B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06188","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Tell-Tale Norm: $\\ell_2$ Magnitude as a Signal for Reasoning Dynamics in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-04T13:59:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The L2 norm of LLM hidden states signals reasoning intensity, with a theoretical bound on SAE feature activations, enabling three new test-time scaling techniques that boost performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01168","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Thinking Economically: A Hierarchical Framework for Adaptive-Complexity Reasoning in LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-31T11:20:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HAB applies coarse-to-fine budgeting to LLM reasoning, predicting per-problem depth and learning intra-step token budgets via PPL comparisons and adaptive Pareto optimization, yielding higher accuracy and lower token use than standard CoT on GSM8K and MATH500.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30789","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Smaller Models are Natural Explorers for Policy-Level Diversity in GRPO","primary_cat":"cs.LG","submitted_at":"2026-05-29T03:25:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Smaller models provide temporally correlated policy-level diversity that serves as structured exploration for training larger models in GRPO, yielding accuracy gains such as +8.8% on AIME 24 with reduced compute via the S2L-PO framework.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21883","ref_index":1,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Token-weighted Direct Preference Optimization with Attention","primary_cat":"cs.CL","submitted_at":"2026-05-21T01:43:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AttentionPO weights tokens in DPO using LLM attention as a pairwise judge, yielding better results on AlpacaEval, MT-Bench, and ArenaHard than prior preference optimization methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14457","ref_index":17,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Stateful Reasoning via Insight Replay","primary_cat":"cs.AI","submitted_at":"2026-05-14T06:52:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InsightReplay improves long CoT reasoning by extracting critical insights from the trace and replaying them near the active frontier, delivering +1.65 average accuracy gain across 24 model-benchmark settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07579","ref_index":17,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Your Language Model is Its Own Critic: Reinforcement Learning with Value Estimation from Actor's Internal States","primary_cat":"cs.LG","submitted_at":"2026-05-08T10:49:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"POISE trains a lightweight probe on the actor's internal states to predict expected rewards for RLVR, matching DAPO performance on math benchmarks with lower compute by avoiding extra rollouts or critic models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Nevertheless, our estimator closely tracks the critic while requiring only a lightweight probe over features already produced by the policy forward pass. Generalizability across domains and models.Next, we evaluate the estimator's generalizability across multiple RLVR domains and policy models. For datasets, we include two mathematical reasoning tasks from DAPO-Math 17K [37] and DeepScaleR [17], coding tasks from AceCoder [38], tool-calling dialogues from ToolDial [26], and instruction-following tasks from IF-RLVR [22]. For policy models, we consider Qwen3-4B and DeepSeek-R1-Distill-Qwen-1.5B/7B. For each domain- model pair, we train both our estimator and a critic model using the same data, and compare their predictions against the actual avg@8 scores of the target policy model (For detailed settings, refer"},{"citing_arxiv_id":"2605.08283","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HTPO: Towards Exploration-Exploitation Balanced Policy Optimization via Hierarchical Token-level Objective Control","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:38:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HTPO introduces hierarchical token-level objective control in RLVR to balance exploration and exploitation by grouping tokens according to difficulty, correctness, and entropy, yielding up to 8.6% gains on AIME benchmarks over DAPO.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[12] Seungjae Jung, Gunsoo Han, Daniel Wontae Nam, and Kyoung-Woon On. Binary classifier optimization for large language model alignment.In ACL, 2025. [13] Ang Li, Zhihang Yuan, Yang Zhang, Shouda Liu, and Yisen Wang. Know when to explore: Difficulty-aware certainty as a guide for llm reinforcement learning.arXiv preprint arXiv: 2509.00125, 2025. [14] Zicheng Lin, Tian Liang, Jiahao Xu, Xing Wang, Ruilin Luo, Chufan Shi, Siheng Li, Yujiu Yang, and Zhaopeng Tu. Critical tokens matter: Token-level contrastive estimation enhence llm's reasoning capability.arXiv preprint arXiv: 2411.19943, 2024. [15] Yu Meng, Mengzhou Xia, and Danqi Chen. Simpo: Simple preference optimization with a reference-free reward."},{"citing_arxiv_id":"2605.07260","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"When Are Experts Misrouted? Counterfactual Routing Analysis in Mixture-of-Experts Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-08T05:26:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Standard top-k routers in MoE language models often select suboptimal routes for difficult tokens, and updating only the final router layer raises pass@K on AIME and HMMT benchmarks across multiple models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26940","ref_index":11,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Select to Think: Unlocking SLM Potential with Local Sufficiency","primary_cat":"cs.CL","submitted_at":"2026-04-29T17:51:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Select to Think reframes LLM help as ranking among SLM top-K candidates and distills the ranking ability back into the SLM for improved single-pass reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16158","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AtManRL: Towards Faithful Reasoning via Differentiable Attention Saliency","primary_cat":"cs.CL","submitted_at":"2026-04-17T15:27:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AtManRL learns an additive attention mask on CoT traces to produce a saliency reward that, when combined with outcome rewards in GRPO, trains LLMs to generate reasoning that genuinely influences final predictions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.01939","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond the 80/20 Rule: High-Entropy Minority Tokens Drive Effective Reinforcement Learning for LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2025-06-02T17:54:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"High-entropy minority tokens drive RLVR gains, so restricting gradients to the top 20% maintains or improves performance over full updates on Qwen3 models, especially larger ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}