{"total":101,"items":[{"citing_arxiv_id":"2606.30626","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DOPD: Dual On-policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-06-29T17:55:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DOPD is an advantage-aware dual distillation method that dynamically assigns token supervision from either privileged teacher or student to transfer capability while mitigating non-replicable information asymmetry in on-policy distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30445","ref_index":68,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Does Online Imitation Learning Help in LLM Post-Training? The Role of (Non-)Realizability Beyond Horizon","primary_cat":"cs.LG","submitted_at":"2026-06-29T15:17:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Online IL overcomes an information-theoretic bottleneck that offline IL faces in non-realizable settings even at horizon 1, under a new structural characterization of reward-relative misspecification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.30345","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIFT: Difficulty Routing Self-DIstillation with Rhythm-Gated Exploration and Success BuFfer Training","primary_cat":"cs.LG","submitted_at":"2026-06-29T14:20:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DRIFT is an online self-evolution policy optimization framework using Difficulty Routing, Rhythm Gating, success buffers, and two-stage curriculum learning that reports new SOTA results on five reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.29502","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UCOB: Learning to Utilize and Evolve Agentic Skills via Credit-Aware On-Policy Bidirectional Self-Distillation","primary_cat":"cs.AI","submitted_at":"2026-06-28T17:02:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UCOB improves agentic RL by using return-to-go comparisons between skill-conditioned and no-skill prompts as local teachers for bidirectional self-distillation and skill memory updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27814","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ATOD: Annealed Turn-aware On-policy Distillation for Multi-turn Autonomous Agents","primary_cat":"cs.AI","submitted_at":"2026-06-26T07:56:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ATOD anneals from on-policy distillation to RL with turn-level reweighting to improve multi-turn agent success rates on ALFWorld, WebShop, and Search-QA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22600","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Position Bias of On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-06-21T17:20:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Position bias in on-policy distillation degrades later-token supervision; IW-OPD weights tokens by accumulated discrepancy, yielding faster convergence and up to 6.9 point gains on AIME-2025.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01476","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniOPD: Logit-Free On-Policy Distillation via Speculative Verification","primary_cat":"cs.LG","submitted_at":"2026-05-31T22:31:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniOPD replaces token-level logit matching in on-policy distillation with Monte Carlo chunk-level semantic verification and a peak-entropy scheduler.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00755","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Internalize the Temperature: On-Policy Self-Distillation as Policy Reheater for Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:44:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"TS-OPSD internalizes temperature via on-policy self-distillation to reheat entropy-collapsed RL policies in LLMs, providing stronger initialization for further training than continued RL or rollout temperature adjustment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00523","ref_index":119,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ProactiveLLM: Learning Active Interaction for Streaming Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-30T04:31:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ProactiveLLM enables active interaction in streaming LLMs by learning semantic sufficiency cues from partial inputs through mask-based modeling and synchronized privileged self-distillation without external supervision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30712","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ExpGraph: Model-Agnostic Experience Learning with Graph-Structured Memory for LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-05-29T01:04:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ExpGraph builds a graph of summarized agent experiences and uses graph diffusion plus an RL-trained retrieval copilot to improve frozen LLM executors on QA, math, code, and agentic tasks without parameter updates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30070","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Predictive Law for On-Policy Self-Distillation From World Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-28T15:17:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A linear relationship between initial student-self-teacher performance gap and OPSD improvement provides a predictive law across contexts and model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29548","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Larger Models Learn More: Effects of Capacity, Interference, and Rare-Task Retention","primary_cat":"cs.LG","submitted_at":"2026-05-28T08:02:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Larger models succeed on rare and complex tasks by reducing gradient interference from common tasks, allowing rare-task features to accumulate, as shown via synthetic task mixtures and OLMo pretraining from 4M to 4B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29495","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Policy Replay for Continual Supervised Fine-Tuning","primary_cat":"cs.LG","submitted_at":"2026-05-28T07:19:47+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-Policy Replay filters model rollouts on historical prompts by task reward and replays them as ordinary SFT examples, reducing backward transfer degradation on the TRACE benchmark across three 7-8B models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29089","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OISD: On-Policy Internal Self-Distillation of Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-27T20:43:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OISD improves mathematical reasoning in language models by using the final layer as an internal teacher to align logits and attention patterns in selected intermediate layers via signed advantage-weighted Jensen-Shannon divergence during GRPO optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28600","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Transformers Provably Learn to Internalize Chain-of-Thought","primary_cat":"cs.LG","submitted_at":"2026-05-27T15:17:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"L-layer transformers under Log-ICoT curriculum provably learn k-parity with poly(n) samples and log k stages, matching explicit CoT efficiency without inference overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28396","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ADWIN: Adaptive Windows for Horizon-Aware On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-27T12:33:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ADWIN adaptively selects training horizons in on-policy distillation via prefix alignment checks, cutting end-to-end cost by up to 4.1x while matching or exceeding full-rollout accuracy on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28303","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Fact Overwriting to Knowledge Evolution: Causal Editing via On-Policy Self-Distillation","primary_cat":"cs.AI","submitted_at":"2026-05-27T10:59:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper proposes CODE for causal knowledge editing in LLMs via on-policy self-distillation, reducing self-refutation to 1.8% and achieving up to 83.5% multi-hop accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27899","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SKILLC: Learning Autonomous Skill Internalization in LLM Agents via Contrastive Credit Assignment","primary_cat":"cs.AI","submitted_at":"2026-05-27T03:21:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillC converts skill-helpfulness contrast into a policy learning signal via paired rollouts and dual-stream advantage estimation, outperforming prior internalization baselines by 5.5% and 4.4% on ALFWorld and WebShop without runtime skill access.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27186","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MAIGO: Mitigating Lost-in-Conversation with History-Cleaned On-Policy Self-Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-26T15:38:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAIGO uses history-cleaned references from the model's own policy to distill better behavior on middle and answer turns, raising Qwen2.5-7B-Instruct sharded accuracy from 52.8 to 66.1 while preserving full-view performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26844","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Not All Disagreement Is Learnable: Token Teachability in On-Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-26T10:56:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Token teachability, based on local compatibility of teacher and student distributions, predicts on-policy distillation gains better than raw KL disagreement and enables TA-OPD to match or exceed full-token performance with 5% tokens across Qwen models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26293","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CroCo: Cross-Lingual Contrastive Preference Tuning on Self-Generations","primary_cat":"cs.CL","submitted_at":"2026-05-25T19:30:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CroCo applies English-reward-ranked self-generations for contrastive preference tuning that improves two LLMs on structured and open-ended tasks across 14 languages without language-specific annotations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25381","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Not only where, But when: Temporal Scheduling for RLVR","primary_cat":"cs.LG","submitted_at":"2026-05-25T03:10:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Temporal scheduling of credit allocation criteria over RLVR training, using trajectory percentiles to target heterogeneous behaviors, yields more stable policy entropy and better reasoning benchmark results than static allocation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23493","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EDGE-OPD: Internalizing Privileged Context with Evidence Guided On-Policy Distillation","primary_cat":"cs.AI","submitted_at":"2026-05-22T10:55:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EDGE-OPD adds guided rollouts and evidence masking to on-policy self-distillation, enabling successful learning of target identities where standard OPSD and RLSD fail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22263","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tailoring Teaching to Aptitude: Direction-Adaptive Self-Distillation for LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T10:07:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DASD improves math reasoning in LLMs by adaptively directing self-distillation based on per-token entropy to balance exploration and step accuracy, outperforming prior self-distillation and RLVR baselines on six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22240","ref_index":24,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unlocking Proactivity in Task-Oriented Dialogue","primary_cat":"cs.AI","submitted_at":"2026-05-21T09:46:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces a Cognitive User Simulator modeling stratified personas with hidden concerns and Simulator-Induced Asymmetric-View Policy Optimization to unlock proactive behavior in task-oriented dialogue agents.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22166","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Adapting the Interface, Not the Model: Runtime Harness Adaptation for Deterministic LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-21T08:36:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Life-Harness evolves reusable interventions from training trajectories to enhance frozen LLM agents on unseen tasks across seven deterministic environments, yielding 88.5% average relative improvement in 116 of 126 model-environment settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21851","ref_index":37,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OPPO: Bayesian Value Recursion for Token-Level Credit Assignment in LLM Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-21T00:55:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPPO derives token-level advantages for LLM RL via Bayesian recursion on oracle signals, recovering prior distillation methods as a special case and showing gains on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21834","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On-Policy Consistency Training Improves LLM Safety with Minimal Capability Degradation","primary_cat":"cs.LG","submitted_at":"2026-05-20T23:56:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-Policy Consistency Training (OPCT) improves LLM safety metrics over supervised fine-tuning while largely preserving capabilities across three model families.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21606","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Are Teacher Tokens Reliable? Position-Weighted On-Policy Self-Distillation for Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-20T18:14:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Position-Weighted On-Policy Self-Distillation (PW-OPSD) weights later tokens more heavily after a diagnostic shows position predicts teacher reliability better than entropy, yielding +1.0 and +1.1 Avg@12 gains on AIME 2024/2025.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21605","ref_index":52,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GenEvolve: Self-Evolving Image Generation Agents via Tool-Orchestrated Visual Experience Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-20T18:12:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GenEvolve introduces a self-evolving agent framework for image generation using tool-orchestrated trajectories and Visual Experience Distillation to achieve claimed SOTA results on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19447","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What and When to Distill: Selective Hindsight Distillation for Multi-Turn Agents","primary_cat":"cs.AI","submitted_at":"2026-05-19T07:00:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SERL selectively reweights learning using task success and environment feedback to reach 90.0% success on ALFWorld and 80.1% on WebShop, outperforming RL and distillation baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19436","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CEPO: RLVR Self-Distillation using Contrastive Evidence Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-19T06:46:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CEPO sharpens token credit in RLVR by requiring tokens to be favored by the correct answer and disfavored by wrong answers drawn from rejected rollouts, delivering accuracy gains on five multimodal math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19354","ref_index":55,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Next-Acceleration-Scale Prediction for Autoregressive MRI Reconstruction","primary_cat":"eess.IV","submitted_at":"2026-05-19T04:40:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Next-acceleration-scale autoregressive prediction in discrete latent space with on-policy privileged information distillation yields improved MRI reconstructions from sparse measurements on the fastMRI benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18740","ref_index":66,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Vision-OPD: Learning to See Fine Details for Multimodal LLMs via On-Policy Self-Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-18T17:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Vision-OPD transfers an MLLM's privileged regional perception to its full-image policy through on-policy token-level self-distillation, yielding competitive results on fine-grained visual benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18643","ref_index":36,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Post-Trained MoE Can Skip Half Experts via Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-18T16:50:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZEDA turns post-trained static MoE models into dynamic ones via zero-output expert injection and two-stage self-distillation, cutting over 50% expert FLOPs on Qwen3-30B-A3B and GLM-4.7-Flash with small accuracy drops across 11 benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18529","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AMR-SD: Asymmetric Meta-Reflective Self-Distillation for Token-Level Credit Assignment","primary_cat":"cs.AI","submitted_at":"2026-05-18T15:14:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AMR-SD adds a reflection bottleneck to compress diagnostic signals into self-generated hints and uses asymmetric Causal Information Gain to create sparse token-level advantage signals, outperforming baselines and preventing late-stage collapse in RLVR.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20258","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"It Takes Two: Complementary Self-Distillation for Contextual Integrity in LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-18T13:57:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SELFCI uses complementary self-distillation with two reverse KL divergences to align LLMs to contextual integrity while preserving utility, outperforming RL baselines like GRPO in agentic settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18299","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SD-Search: On-Policy Hindsight Self-Distillation for Search-Augmented Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-18T12:18:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SD-Search derives step-level supervision for search queries in reasoning agents via on-policy hindsight self-distillation using the policy as both student and teacher.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18141","ref_index":61,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Brief Overview: On-Policy Self-Distillation In Large Language Models","primary_cat":"cs.HC","submitted_at":"2026-05-18T09:47:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"This overview paper explains the conceptual foundations and design principles of On-Policy Self-Distillation for large language models from a beginner's perspective.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17862","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"$\\boldsymbol{f}$-OPD: Stabilizing Long-Horizon On-Policy Distillation with Freshness-Aware Control","primary_cat":"cs.LG","submitted_at":"2026-05-18T05:14:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"f-OPD decomposes on-policy distillation drift into rollout and supervision components, then applies a sample-level freshness score to adaptively limit stale data influence and stabilize long-horizon agent training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16865","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MixSD: Mixed Contextual Self-Distillation for Knowledge Injection","primary_cat":"cs.CL","submitted_at":"2026-05-16T07:57:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MixSD uses dynamic mixing of the model's expert and naive conditionals to create distribution-aligned supervision that improves the memorization-retention tradeoff over standard SFT.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15604","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VSPO: Vector-Steered Policy Optimization for Behavioral Control","primary_cat":"cs.LG","submitted_at":"2026-05-15T04:31:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VSPO samples rollouts at varying steering intensities to improve behavioral control in LLMs while preserving task accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15181","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Plans to Pixels: Learning to Plan and Orchestrate for Open-Ended Image Editing","primary_cat":"cs.CV","submitted_at":"2026-05-14T17:58:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A planner-orchestrator system learns long-horizon image editing by maximizing outcome-based rewards from a vision-language judge and refining plans from successful trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15113","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning from Language Feedback via Variational Policy Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:27:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VPD frames language feedback learning as variational EM so the teacher policy refines itself via trust-region updates on outcomes while the student learns dense token distributions on its own rollouts, outperforming fixed-teacher baselines on reasoning and code tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15239","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reducing the Safety Tax in LLM Safety Alignment with On-Policy Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-14T03:40:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"On-policy self-distillation with teacher flip rate yields better safety-reasoning tradeoffs than off-policy or external-teacher baselines across model scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13724","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation","primary_cat":"cs.CV","submitted_at":"2026-05-13T16:06:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"AnyFlow enables any-step video diffusion by distilling flow-map transitions over arbitrary time intervals with on-policy backward simulation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13643","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prefix Teach, Suffix Fade: Local Teachability Collapse in Strong-to-Weak On-Policy Distillation","primary_cat":"cs.CL","submitted_at":"2026-05-13T15:05:30+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13255","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Respecting Self-Uncertainty in On-Policy Self-Distillation for Efficient LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-13T09:38:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EGRSD and CL-EGRSD advance the accuracy-length frontier in LLM reasoning by entropy-guided weighting of token-level distillation signals from the teacher.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12913","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting DAgger in the Era of LLM-Agents","primary_cat":"cs.LG","submitted_at":"2026-05-13T02:40:28+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DAgger-style training with turn-level policy interpolation raises 4B and 8B LLM agents to 27.3% and 29.8% on SWE-bench Verified, beating several larger published systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15220","ref_index":45,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Always Learning, Always Mixing: Efficient and Simple Data Mixing All The Time","primary_cat":"cs.CL","submitted_at":"2026-05-13T02:29:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OP-Mix is an on-policy data mixing method that uses low-rank adapter interpolation to find near-optimal data mixtures throughout language model training with reduced compute.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}