{"total":51,"items":[{"citing_arxiv_id":"2606.18056","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ConSA: Controllable Sparsity in Hybrid Attention via Learnable Allocation","primary_cat":"cs.CL","submitted_at":"2026-06-16T15:33:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ConSA learns FA/SWA allocation via L0 masks and augmented Lagrangian constraints, outperforming rule-based baselines on 0.6B and 1.7B models with consistent layer patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01476","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniOPD: Logit-Free On-Policy Distillation via Speculative Verification","primary_cat":"cs.LG","submitted_at":"2026-05-31T22:31:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniOPD replaces token-level logit matching in on-policy distillation with Monte Carlo chunk-level semantic verification and a peak-entropy scheduler.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31494","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Consolidating Rewarded Perturbations for LLM Post-Training","primary_cat":"cs.CL","submitted_at":"2026-05-29T16:16:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoRP consolidates reward-weighted perturbations into a single model via low-rank structure, improving base LLMs by 8.1 points on average while using one-tenth the budget of prior ensembles and one forward pass.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29522","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepSurvey: Enhancing Analytical Depth and Citation Reliability in Automated Survey Generation","primary_cat":"cs.AI","submitted_at":"2026-05-28T07:40:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeepSurvey introduces an agentic system for automated survey generation that improves depth through full-text keynotes, cross-paper clustering, and code analysis, while boosting citation reliability via graph expansion, hybrid filtering, and evidence-constrained assignment, with reported gains over ","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22166","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adapting the Interface, Not the Model: Runtime Harness Adaptation for Deterministic LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-21T08:36:49+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22156","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"One-Way Policy Optimization for Self-Evolving LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-21T08:25:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OWPO decouples optimization direction from magnitude via asymmetric reweighting (Accelerated Alignment for inferior deviations, Gain Locking for superior) plus iterative references to create a ratchet effect for continuous LLM improvement.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20744","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hack-Verifiable Environments: Towards Evaluating Reward Hacking at Scale","primary_cat":"cs.LG","submitted_at":"2026-05-20T05:46:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Presents Hack-Verifiable TextArena, a benchmark that embeds verifiable reward hacking opportunities into environments to enable deterministic measurement of exploitation by language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19141","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GRASP: Deterministic argument ranking in interaction graphs","primary_cat":"cs.LG","submitted_at":"2026-05-18T21:49:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GRASP aggregates stable local LLM interaction judgments into global argument rankings via a convergent attack-defense propagation operator on interaction graphs, yielding higher reproducibility than holistic judging and no correlation with human convincingness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18109","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TaskGround: Structured Executable Task Inference for Full-Scene Household Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-18T09:19:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TaskGround introduces a Ground-Infer-Execute framework for full-scene household reasoning that improves success rates on the FullHome benchmark and enables compact models to match larger ones at up to 18x lower token cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17937","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"BacktestBench: Benchmarking Large Language Models for Automated Quantitative Strategy Backtesting","primary_cat":"cs.CL","submitted_at":"2026-05-18T06:52:08+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17862","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"$\\boldsymbol{f}$-OPD: Stabilizing Long-Horizon On-Policy Distillation with Freshness-Aware Control","primary_cat":"cs.LG","submitted_at":"2026-05-18T05:14:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"f-OPD decomposes on-policy distillation drift into rollout and supervision components, then applies a sample-level freshness score to adaptively limit stale data influence and stabilize long-horizon agent training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17497","ref_index":112,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Supervised On-Policy Distillation for Reasoning Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-17T15:14:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SSOPD converts intra-group correct-wrong contrast into process supervision by distilling a teacher distribution from the shortest correct completion into prefixes of the longest wrong completion, improving GRPO on AIME and HMMT benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16826","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decoupling KL and Trajectories: A Unified Perspective for SFT, DAgger, Offline RL, and OPD in LLM Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-16T06:05:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Decoupling prefix source from token-level KL direction in autoregressive sequence KL yields four objectives unifying SFT, DAgger, offline RL and OPD, with KL mixing and entropy-gated curriculum improving math reasoning accuracy and shortening responses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12741","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning with Rare Success but Rich Feedback via Reflection-Enhanced Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T20:46:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RESD turns failure trajectories into token-level supervision via retrospective reflections and a persistent global playbook, enabling faster improvement than standard self-distillation or GRPO with only one rollout per prompt.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12483","ref_index":20,"ref_count":4,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond GRPO and On-Policy Distillation: An Empirical Sparse-to-Dense Reward Principle for Language-Model Post-Training","primary_cat":"cs.LG","submitted_at":"2026-05-12T17:57:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sparse rewards on capable teachers for exploration followed by dense distillation to students outperforms direct sparse reward application like GRPO on the deployment model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12070","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Missing Old Logits in Asynchronous Agentic RL: Semantic Mismatch and Repair Methods for Off-Policy Correction","primary_cat":"cs.LG","submitted_at":"2026-05-12T12:57:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Missing old logits in async agentic RL entangle discrepancy and staleness terms in PPO off-policy correction; exact acquisition methods and revised PPO-EWMA restore decoupled updates with reported gains in speed and performance.","context_count":1,"top_context_role":"method","top_context_polarity":"background","context_text":"and multiple actor updates make the behavior policy stale with respect to the current policy. We call this effectpolicy staleness. A natural choice for correction is to decompose the total ratio into two terms: a discrepancy-repair ratio that compares the training-side and inference-side distributions at the same old version, and a staleness-correction ratio that compares the current training policy with that old training-side policy [28, 23, 32, 25, 24]. Let µold denote the inference-side rollout policy, and let πold denote the corresponding training-side forward policy. The desired decomposition is r(θ) = πθ(y|x) µold(y|x) =r drs, r d = πold(y|x) µold(y|x) , r s = πθ(y|x) πold(y|x) .(1) Here rd measures training-inference discrepancy, while rs measures policy staleness. This decompo-"},{"citing_arxiv_id":"2605.11739","ref_index":22,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning to Foresee: Unveiling the Unlocking Efficiency of On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-12T08:19:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"On-policy distillation gains efficiency from early foresight in module allocation and update directions, which the proposed EffOPD method exploits for 3x faster training with comparable performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11609","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Anti-Self-Distillation for Reasoning RL via Pointwise Mutual Information","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anti-Self-Distillation reverses self-distillation signals via PMI to fix overconfidence on structural tokens, matching GRPO baseline accuracy 2-10x faster with up to 11.5 point gains across 4B-30B models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Neural networks, 22(5-6):544-557, 2009. [26] Peiyi Wang, Lei Li, Zhihong Shao, Runxin Xu, Damai Dai, Yifei Li, Deli Chen, Yu Wu, and Zhifang Sui. Math-shepherd: Verify and reinforce llms step-by-step without human annotations. InProceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 9426-9439, 2024. [27] Bangjun Xiao, Bingquan Xia, Bo Yang, Bofei Gao, Bowen Shen, Chen Zhang, Chenhong He, Chiheng Lou, Fuli Luo, Gang Wang, et al. Mimo-v2-flash technical report.arXiv preprint arXiv:2601.02780, 2026. [28] Yuanda Xu, Hejian Sang, Zhengze Zhou, Ran He, and Zhipeng Wang. Paced: Distillation and on- policy self-distillation at the frontier of student competence."},{"citing_arxiv_id":"2605.10889","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Unmasking On-Policy Distillation: Where It Helps, Where It Hurts, and Why","primary_cat":"cs.LG","submitted_at":"2026-05-11T17:33:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Distillation signals align better with ideal updates on incorrect student rollouts than correct ones, with optimal teacher context depending on student capacity and task.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16379","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"An Information-Theoretic Criterion for Efficient Data Synthesis","primary_cat":"cs.LG","submitted_at":"2026-05-11T01:27:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Synthetic data improves models only in information-open generation-training loops with external signals, and coarser signals like binary correctness enable better generalization by converging to the most information-efficient component.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09725","ref_index":44,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Policy Distillation with Best-of-N Teacher Rollout Selection","primary_cat":"cs.CV","submitted_at":"2026-05-10T19:49:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BRTS improves on-policy distillation by sampling multiple teacher rollouts and selecting the best one via a correctness-first then alignment priority rule, yielding gains on AIME and AMC math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09253","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Cornerstones or Stumbling Blocks? Deciphering the Rock Tokens in On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-10T01:41:43+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"On-Policy Distillation (OPD) has established itself as a cornerstone of the modern post-training pipeline for Large Language Models (LLMs) [1]. By aligning a student model with a superior teacher through trajectories sampled from its own policy, OPD enables a token level of reasoning refinement that goes beyond static Supervised Fine-Tuning (SFT) [25]. This effectiveness has been validated by industrial works such as DeepSeek-V4 [4], MiMo [30], and Qwen-3 [33], where OPD serves as a vital component alongside SFT or Reinforcement Learning with Verifiable Rewards (RLVR) to further squeeze out reasoning performance. Recent studies in Reinforcement Learning with Verifiable Rewards (RLVR) [6, 7, 11] have revealed that not all tokens contribute equally to model learning: a small subset of critical tokens, such"},{"citing_arxiv_id":"2605.08762","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Omni-DeepSearch: A Benchmark for Audio-Driven Omni-Modal Deep Search","primary_cat":"cs.SD","submitted_at":"2026-05-09T07:47:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Omni-DeepSearch is a 640-sample benchmark for audio-driven omni-modal search where the best model reaches only 43.44% accuracy, exposing bottlenecks in audio inference, tool use, and cross-modal reasoning.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"LLM judges, GPT-5.4 [31], Gemini-3-pro [32], and Claude-Sonnet-4.6 [33], independently assess semantic equivalence, with the final label determined by majority vote. We evaluate both closed-source and open-source models, including Gemini-3-Pro [32], Gemini-3-Flash [32], Gemini-2.5-Pro [34], Gemini-2.5-Flash-Lite [34], Qwen3.5-Omni-Plus/Flash [35], Mimo-V2-Omni [36], Mimo- V2.5 [37], Qwen3-Omni-30B-A3B [38], and Qwen2.5-Omni [39]. We report the overall accuracy on 640 data instances, as well as accuracy by retrieval target modality and audio content type. Implementation details are provided in Appendix E. 4.3 Main Results Table 2 presents the main results on Omni-DeepSearch. Frontier models show clear advantages, but overall performance remains far from saturated."},{"citing_arxiv_id":"2605.08761","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Beyond the All-in-One Agent: Benchmarking Role-Specialized Multi-Agent Collaboration in Enterprise Workflows","primary_cat":"cs.MA","submitted_at":"2026-05-09T07:47:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EntCollabBench shows that today's LLM agents still struggle with delegation, context transfer, parameter grounding, workflow closure, and decision commitment when tested in a simulated enterprise with 11 role-specialized agents.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"1 Experimental Setup We evaluated both closed-source and open-source models, including Claude-Sonnet-4.6 [18], Gemini-3.1- Pro-Preview [19], Gemini-3.1-Flash-Lite-Preview [19], GPT-5.4 [20], GPT-5-mini[20], DeepSeek-V4-Pro [21], DeepSeek-V4-Flash [21], Qwen3.5-122B-A10B [ 22], Qwen3.5-35B-A3B [ 22], Qwen3.5-9B [ 22], MiniMax- M2.7 [23], and MiMo-V2-Flash [24]. We report the overall accuracy, as well as the step-wise and multi-step accuracy for workflow and approval. In addition, we recorded the accuracy of each agent and computed both the average token cost per task and the average token cost for successfully completed tasks. Details of the settings are provided in Appendix E.2. 5.2 Main Results EntCollabBench remains challenging even for the strongest models."},{"citing_arxiv_id":"2605.08741","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training with Harnesses: On-Policy Harness Self-Distillation for Complex Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-09T07:06:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPHSD uses harness-augmented models as teachers to distill reasoning capabilities into base LLMs, yielding strong standalone performance on classification and math tasks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"OPHSD + Harness on the memory bank built online from the test data. 4.1 Experimental Setup Model and datasets.All experiments start from Qwen3-8B[ 36]. For online text classification, we conduct two independent training runs (sampling 10k training examples each) to evaluate the following tasks: (1) Legal Charge Prediction: We train on the CAIL-2018 [34] dataset and evaluate on LawBench [8], reporting the F1 score for predicting criminal charges from case descriptions (215 classes). (2) Chemical Reaction Prediction: We train on the USPTO-50k [ 25] dataset and evaluate on the USPTO test set, reporting accuracy for predicting reaction types from chemical equations (10 classes). All training samples pass a strict contamination filter against both test sets."},{"citing_arxiv_id":"2605.08063","ref_index":20,"ref_count":4,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Flow-OPD: On-Policy Distillation for Flow Matching Models","primary_cat":"cs.CV","submitted_at":"2026-05-08T17:50:15+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":", OCR) inevitably degrades aesthetics via reward hacking. This necessitates a shift to dense, trajectory-level distillation to provide uncoupled expert supervision. This issue has recently found a compelling solution in the field of Large Language Models (LLMs): On-Policy Distillation (OPD). Benefiting from OPD, models such as DeepSeek-V4 [ 9], Mimo v2 [20], and GLM-5 [19] successfully harmonize complex, multi-domain capabilities by distilling from specialized experts. This paradigm shift raises a pivotal question for the vision community: Can Flow Matching models similarly leverage OPD to integrate the diverse strengths of multiple teacher models into a single, robust student model?To address this pivotal question, we introduce"},{"citing_arxiv_id":"2605.07442","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GameGen-Verifier: Parallel Keypoint-Based Verification for LLM-Generated Games via Runtime State Injection","primary_cat":"cs.LG","submitted_at":"2026-05-08T08:46:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GameGen-Verifier decomposes game specifications into keypoints, injects runtime states for targeted checks, and achieves 92.2% accuracy on 100 games while running up to 16.6x faster than agent-based baselines.","context_count":1,"top_context_role":"background","top_context_polarity":"unclear","context_text":"video games: A case study for runtime monitoring.Computers in Entertainment, 15(1):1-28, 2017. [42] Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Beibin Li, Erkang Zhu, Li Jiang, Xiaoyun Zhang, Shaokun Zhang, Jiale Liu, Ahmed Hassan Awadallah, Ryen W White, Doug Burger, and Chi Wang. Autogen: Enabling next-gen llm applications via multi-agent conversation, 2023. [43] Xiaomi LLM-Core Team. Mimo-v2-flash technical report.arXiv preprint arXiv:2601.02780, 2026. [44] Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Thomas L. Griffiths, Yuan Cao, and Karthik Narasimhan. Tree of thoughts: Deliberate problem solving with large language models. In Advances in Neural Information Processing Systems, 2023. [45] Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang,"},{"citing_arxiv_id":"2605.07396","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rubric-based On-policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-08T07:52:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Rubric-based on-policy distillation allows training student models using only teacher responses by generating scoring rubrics from contrasts and using them for on-policy optimization, achieving superior performance and up to 10x better sample efficiency than logit-based approaches.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"On-policy distillation of language models: Learning from self-generated mistakes. In International Conference on Learning Representations, 2024. [7] Kevin Lu and Thinking Machines Lab. On-policy distillation.Thinking Machines Lab: Connec- tionism, 2025. [8] Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. Minillm: Knowledge distillation of large language models. InInternational Conference on Learning Representations, 2024. [9] Bangjun Xiao, Bingquan Xia, Bo Yang, Bofei Gao, Bowen Shen, Chen Zhang, Chenhong He, Chiheng Lou, Fuli Luo, Gang Wang, et al. Mimo-v2-flash technical report.arXiv preprint arXiv:2601.02780, 2026. [10] DeepSeek-AI. Deepseek-v4: Towards highly efficient million-token context intelligence, 2026. [11] David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien"},{"citing_arxiv_id":"2605.06221","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"UniPrefill: Universal Long-Context Prefill Acceleration via Block-wise Dynamic Sparsification","primary_cat":"cs.CL","submitted_at":"2026-05-07T13:18:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"UniPrefill accelerates LLM prefill via block-wise dynamic sparsification, achieving up to 2.1x TTFT speedup while supporting hybrid architectures and native vLLM continuous batching.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Uszkoreit, L. Jones, A. N. Gomez, Ł. Kaiser, and I. Polosukhin. Attention is all you need. Advances in Neural Information Processing Systems, 30, 2017. [27] Y. Wang, H. He, S. Bao, H. Wu, H. Wang, Q. Zhu, and W. Che. Proxyattn: Guided sparse attention via representative heads. arXiv preprint arXiv:2509.24745, 2025. URL https://arxiv.org/abs/2509.24745. [28] B. Xiao, B. Xia, B. Yang, et al. MiMo-V2-Flash technical report. arXiv preprint arXiv:2601.02780, 2026. [29] G. Xiao, Y. Tian, B. Chen, S. Han, and M. Lewis. Efficient streaming language models with attention sinks. In ICLR, 2024. [30] G. Xiao, J. Guo, K. Mazaheri, and S. Han. Optimizing mixture of block attention. arXiv preprint arXiv:2511.11571, 2025."},{"citing_arxiv_id":"2605.03677","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Uni-OPD: Unifying On-Policy Distillation with a Dual-Perspective Recipe","primary_cat":"cs.LG","submitted_at":"2026-05-05T12:15:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Uni-OPD unifies on-policy distillation across LLMs and MLLMs with dual-perspective strategies that promote student exploration and enforce order-consistent teacher supervision based on outcome rewards.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02971","ref_index":11,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Multilingual Safety Alignment via Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-03T14:22:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MSD enables cross-lingual safety transfer in LLMs via self-distillation with Dual-Perspective Safety Weighting, improving safety in low-resource languages without target response data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.27083","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Co-Evolving Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-29T18:24:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoPD integrates multiple expert capabilities by running parallel RLVR training with bidirectional online policy distillation among experts, outperforming mixed RLVR and sequential OPD while surpassing domain-specific experts on text-image-video reasoning.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"We evaluateCoPDon its ability to co-evolve text, image, and video reasoning capabilities through parallel branch training. Our main analysis focuses on the two-branch setting with text and image reasoning; we additionally evaluate a three-branch setting that incorporates video reasoning to demonstrate scalability. For text reasoning, we use Polaris-Dataset-53K [17], filtered from DeepScaleR-Preview-Dataset [18] and AReal-boba-Data [19] to retain high-quality mathematical reasoning problems. For image reasoning, we use MMFineReason-123K [20], a collection of image reasoning samples with verifiable answers. For video reasoning, we collect training data from OneThinker [21], VideoChat-R1 [22], and Video-R1 [9], and filter with Qwen3-8B-VL by removing samples with a pass rate of either 0% or 100%,"},{"citing_arxiv_id":"2604.21850","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"OptiMat Alloys: a FAIR, living database of multi-principal element alloys enabled by a conversational agent","primary_cat":"cond-mat.mtrl-sci","submitted_at":"2026-04-23T16:40:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OptiMat Alloys is a conversational AI system that maintains a living FAIR database of multi-principal element alloy calculations and enables natural-language, on-demand computations with built-in uncertainty checks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15039","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Prefill-as-a-Service: KVCache of Next-Generation Models Could Go Cross-Datacenter","primary_cat":"cs.DC","submitted_at":"2026-04-16T14:07:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PrfaaS enables practical cross-datacenter prefill-decode disaggregation for hybrid-attention models via selective offloading, bandwidth-aware scheduling, and cache-aware placement, yielding 54% higher throughput and 64% lower P90 TTFT than homogeneous baselines in a 1T-parameter case study.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13016","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking On-Policy Distillation of Large Language Models: Phenomenology, Mechanism, and Recipe","primary_cat":"cs.LG","submitted_at":"2026-04-14T17:54:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-policy distillation works when student and teacher models share thinking patterns and the teacher adds new capabilities, with success tied to alignment on a small set of high-probability tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13010","ref_index":5,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Lightning OPD: Efficient Post-Training for Large Reasoning Models with Offline On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-14T17:44:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lightning OPD is an offline on-policy distillation method that matches standard OPD performance at 4x efficiency by enforcing teacher consistency between SFT and distillation phases.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Openai gpt-5 system card.arXiv preprint arXiv:2601.03267, 2025. [3] Kimi Team, Tongtong Bai, Yifan Bai, Yiping Bao, SH Cai, Yuan Cao, Y Charles, HS Che, Cheng Chen, Guanduo Chen, et al. Kimi k2. 5: Visual agentic intelligence.arXiv preprint arXiv:2602.02276, 2026. [4] NVIDIA. Nvidia nemotron 3: Efficient and open intelligence.arXiv preprint arXiv:2512.20856, 2025. [5] Bangjun Xiao, Bingquan Xia, Bo Yang, Bofei Gao, Bowen Shen, Chen Zhang, Chenhong He, Chiheng Lou, Fuli Luo, Gang Wang, et al. Mimo-v2-flash technical report.arXiv preprint arXiv:2601.02780, 2026. [6] Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. Training language models to follow instructions with human feedback."},{"citing_arxiv_id":"2604.11912","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"How Transformers Learn to Plan via Multi-Token Prediction","primary_cat":"cs.LG","submitted_at":"2026-04-13T18:04:09+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multi-token prediction induces a two-stage reverse reasoning process in Transformers via gradient decoupling, improving planning on synthetic and realistic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10098","ref_index":54,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Attention Sink in Transformers: A Survey on Utilization, Interpretation, and Mitigation","primary_cat":"cs.LG","submitted_at":"2026-04-11T08:41:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The first survey on Attention Sink in Transformers structures the literature around fundamental utilization, mechanistic interpretation, and strategic mitigation.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"systematically reviewed, resulting in the absence of a definitive and unified reference for the field. 4 Attention Sink in Transformers: A Survey on Utilization, Interpretation, and Mitigation Attention Sink in Transformers: A Survey on Utilization, Interpretation, and Mitigation Attention Sink in Transformers (§2) Classical Language Models (§2.3.1)e.g.,[46], [47], [48], [29], [49], [50], [51], [52], [53]. Large Language Models (§2.3.2) e.g.,[54], [55], [56], [57], [58], [59], [41], [60], [61], [62], [33], [63], [64], [36], [37],[38], [44], [7], [65], [39], [28], [66], [46], [47], [67], [68], [43], [69], [70], [71], [72],[42], [73], [74], [26], [75], [76], [77], [27], [78], [79], [34], [80], [81], [82], [83], [84],[85], [86], [87], [88], [89], [90], [45], [91], [91], [92], [93], [94], [95], [96], [97],etc."},{"citing_arxiv_id":"2604.08527","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Demystifying OPD: Length Inflation and Stabilization Strategies for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-09T17:58:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OPD for LLMs suffers length inflation and repetition collapse; StableOPD uses reference divergence and rollout mixing to prevent it and improve math reasoning performance by 7.2% on average.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07941","ref_index":121,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Large Language Model Post-Training: A Unified View of Off-Policy and On-Policy Learning","primary_cat":"cs.CL","submitted_at":"2026-04-09T08:00:37+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LLM post-training is unified as off-policy or on-policy interventions that expand support for useful behaviors, reshape policies within reachable states, or consolidate behavior across training stages.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"\"On-Policy RL Meets Off-Policy Experts: Harmonizing Supervised Fine-Tuning and Reinforcement Learning via Dynamic Weighting\". In:The Fourteenth International Conference on Learning Representations. 2026.url: https://openreview.net/forum?id=dCm9bBrk5d. [120] H. Wang et al. \"Learning to Align, Aligning to Learn: A Unified Approach for Self-Optimized Alignment\". In:arXiv preprint arXiv:2508.07750(2025). [121] B. Xiao et al. \"MiMo-V2-Flash Technical Report\". In:arXiv preprint arXiv:2601.02780(2026). [122] Z. Yang et al. \"Nemotron-Cascade 2: Post-Training LLMs with Cascade RL and Multi-Domain On-Policy Distillation\". In:arXiv preprint arXiv:2603.19220(2026). [123] A. Dixit, T. Liang, and J. Telang. \"Project Aletheia: Verifier-Guided Distillation of Backtracking for Small"},{"citing_arxiv_id":"2604.07054","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Sell More, Play Less: Benchmarking LLM Realistic Selling Skill","primary_cat":"cs.CL","submitted_at":"2026-04-08T13:06:37+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SalesLLM provides an automatic evaluation framework for LLM sales dialogues that correlates 0.98 with human experts and shows top models approaching human performance while weaker ones lag.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05688","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Attention Editing: A Versatile Framework for Cross-Architecture Attention Conversion","primary_cat":"cs.CL","submitted_at":"2026-04-07T10:40:16+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Attention Editing converts pre-trained LLMs to new attention architectures through layer-wise teacher-forced optimization and model-level distillation, preserving performance with efficiency gains.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"For causal self-attention, the output of thei-th head at steptis αt,i,j= exp ( q⊤ t,ikj,i/√dh ) ∑t s=1 exp ( q⊤ t,iks,i/√dh ),1≤j≤t,(7) ot,i= t∑ j=1 αt,i,jvj,i,(8) 3 Attention Editing: A Versatile Framework for Cross-Architecture Attention Conversion and the final attention output is obtained by concatenating all heads and applying the output projection, ut =W O[ot,1;o t,2;···;ot,nh ].(9) Equivalently, if we stack all positions into matrices Qi, Ki, and Vi for the i-th head, then Oi = softmax(QiK⊤ i /√dh)Vi, which matches the standard scaled dot-product attention form. 3.2 Efficient Attention Architectures To reduce the decoding-time memory cost of KV-cache, several efficient attention variants modify the above computation while retaining the same head-level notation."},{"citing_arxiv_id":"2604.05623","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DetailVerifyBench: A Benchmark for Dense Hallucination Localization in Long Image Captions","primary_cat":"cs.CV","submitted_at":"2026-04-07T09:27:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DetailVerifyBench supplies 1,000 images and densely annotated long captions to evaluate precise hallucination localization in multimodal large language models.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"For open-source models, we include GLM-4.6V-Flash [41], Step3-VL-10B [16], Qwen3- VL-8B-Thinking [5], Qwen3.5 series (9B, 35B-A3B, 397B-A17B) [2], KIMI-K2.5 [ 39]. We also evaluate a decoding-based halluci- nation mitigation plugin, visual contrastive decoding [19], applied on top of Qwen3-VL-8B-Thinking. For closed-source models, we test Seed2.0-pro [40], Mimo-v2-pro [37], GPT-5.2 [3], GPT-5.4 [4], Gemini-3-Pro-Preview, Gemini-3.1-Pro-Preview [1], and Claude- Opus-4.6. All models are prompted with a unified instruction that requires them to reproduce the input caption while wrapping hal- lucinated tokens with <HALLUCINATION></HALLUCINATION> tags. For overall evaluation, we adopt Precision (𝑃), Recall (𝑅), and F1-"},{"citing_arxiv_id":"2604.03128","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Distilled RLVR","primary_cat":"cs.LG","submitted_at":"2026-04-03T15:50:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RLSD mixes self-distillation for token-level policy difference magnitudes with RLVR for reliable update directions from response correctness to reach higher convergence and better training stability.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"On-policy distillation (OPD) [4, 5] complements this by leveraging a stronger teacher model to provide dense, token-level logits as learning signals along the student's own sampled trajectories, enriching the trajectory-level supervision to the token level and thereby achieving faster convergence. Recent work has shown that OPD from advanced teachers can match or even outperform RLVR [6], establishing it as an equally compelling paradigm (see Table 1 for a systematic comparison). Despite its effectiveness, OPD relies on a separate, typically much larger teacher model, incurring substantial computational overhead. Moreover, since OPD requires computing token-level distributions over a shared vocabulary, the teacher and student models must share the same vocabulary, significantly reducing the"},{"citing_arxiv_id":"2604.03044","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"JoyAI-LLM Flash: Advancing Mid-Scale LLMs with Token Efficiency","primary_cat":"cs.CL","submitted_at":"2026-04-03T13:52:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"JoyAI-LLM Flash delivers a 48B MoE LLM with 2.7B active parameters per token via FiberPO RL and dense multi-token prediction, released with checkpoints on Hugging Face.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"Table 4 evaluates the MTP performance of JoyAI-LLM Flash on SpechBench [94] under the MTP 3 layers and concurrency 64 configuration. The performance is evaluated by the acceptance length, ratio and speedup over the non-MTP counterpart. We compare JoyAI-LLM Flash with a suite of MTP-optimized LLMs, including Qwen3.5- 35B-A3B [6], Step-3.5-Flash [7], MiMo-V2-Flash [15], GLM-5 [95], GLM-4.7-Flash [4], DeepSeek-V3.2 [36], and DeepSeek-V3 [8]. JoyAI-LLM Flash achieves the highest speedup of 1.87×, representing a 3% improvement over the closest competitor, GLM-5 (1.82×), and a 72% improvement over the slowest model, GLM-4.7-Flash (1.09×). Table 4: SpecBench MTP-3 Speculative Decoding Performance. Best results are marked in bold."},{"citing_arxiv_id":"2604.01496","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From SWE-ZERO to SWE-HERO: Execution-free to Execution-based Fine-tuning for Software Engineering Agents","primary_cat":"cs.SE","submitted_at":"2026-04-02T00:11:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A two-stage SFT pipeline distills execution-free then execution-based trajectories from a 480B model into smaller Qwen2.5-Coder agents, yielding 62.2% resolution on SWE-bench Verified and 44.1% zero-shot on the multilingual version.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.00626","ref_index":6,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey of On-Policy Distillation for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-04-01T08:32:34+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.12125","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning beyond Teacher: Generalized On-Policy Distillation with Reward Extrapolation","primary_cat":"cs.LG","submitted_at":"2026-02-12T16:14:29+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Generalized on-policy distillation with reward scaling above one (ExOPD) lets student models surpass teacher performance when merging domain experts on math and code tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09557","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPEED-Bench: A Unified and Diverse Benchmark for Speculative Decoding","primary_cat":"cs.DC","submitted_at":"2026-02-10T16:19:56+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.02994","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Video-OPD: Efficient Post-Training of Multimodal Large Language Models for Temporal Video Grounding via On-Policy Distillation","primary_cat":"cs.CV","submitted_at":"2026-02-03T02:05:48+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}