{"total":28,"items":[{"citing_arxiv_id":"2605.22211","ref_index":30,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"CLORE: Content-Level Optimization for Reasoning Efficiency","primary_cat":"cs.AI","submitted_at":"2026-05-21T09:16:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLORE augments correct on-policy rollouts by deleting repetitive and irrelevant segments then optimizes with auxiliary DPO to improve accuracy-efficiency trade-off on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22177","ref_index":22,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Maestro: Reinforcement Learning to Orchestrate Hierarchical Model-Skill Ensembles","primary_cat":"cs.LG","submitted_at":"2026-05-21T08:47:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Maestro uses outcome-based RL to train a lightweight policy that orchestrates ensembles of frozen expert models and skills, reporting 70.1% average accuracy across ten multimodal benchmarks and outperforming GPT-5 and Gemini-2.5-Pro while generalizing to unseen components.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21792","ref_index":22,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Residual Skill Optimization for Text-to-SQL Ensembles","primary_cat":"cs.CL","submitted_at":"2026-05-20T22:36:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Residual skill optimization creates complementary Text-to-SQL agents by training each new skill on prior ensemble failures, yielding accuracy gains on Spider2-Lite and transfer to other dialects and tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20865","ref_index":13,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Multi-Step Likelihood-Ratio Correction for Reinforcement Learning with Verifiable Rewards","primary_cat":"cs.LG","submitted_at":"2026-05-20T08:01:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"NFPO augments the PPO surrogate with N-step forward traces to bridge local approximations and exact policy gradients, delivering tighter policy-improvement bounds and improved results on reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20833","ref_index":26,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"MemGym: a Long-Horizon Memory Environment for LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-05-20T07:25:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemGym unifies agent gyms into a memory benchmark with isolated scoring across tool-use, research, coding, and computer-use regimes plus a lightweight reward model for tractable coding evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22875","ref_index":28,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"RMA: an Agentic System for Research-Level Mathematical Problems","primary_cat":"cs.AI","submitted_at":"2026-05-20T04:54:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RMA, a multi-agent system with structured memory and iterative feedback loops, solves 8 out of 10 research-level math problems on the new First Proof benchmark and outperforms GPT-5.2R and Aletheia according to expert evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20548","ref_index":36,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"What Do Agents Communicate? Characterizing Information Exchange in Multi-Agent Systems","primary_cat":"cs.MA","submitted_at":"2026-05-19T22:51:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Systematic study of inter-agent communication in LLM multi-agent systems shows reasoning and verification are critical for performance, with a new augmentation technique recovering 86.2% of failures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20061","ref_index":20,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Rewarding Beliefs, Not Actions: Consistency-Guided Credit Assignment for Long-Horizon Agents","primary_cat":"cs.CL","submitted_at":"2026-05-19T16:19:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReBel uses belief-consistency supervision and belief-aware grouping to improve credit assignment in long-horizon RL for LLM agents, achieving up to 20.4 percentage points higher success and 2.1x better sample efficiency than GRPO on ALFWorld and WebShop.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19436","ref_index":9,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"CEPO: RLVR Self-Distillation using Contrastive Evidence Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-19T06:46:19+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CEPO sharpens token credit in RLVR by requiring tokens to be favored by the correct answer and disfavored by wrong answers drawn from rejected rollouts, delivering accuracy gains on five multimodal math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18165","ref_index":13,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Elastic-dLLM: Position Preserving Context Compression and Augmentation of Diffusion LLMs","primary_cat":"cs.LG","submitted_at":"2026-05-18T10:09:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Position-preserving MASK token compression reduces redundancy in diffusion LLMs to accelerate parallel decoding and enable context folding for longer sequences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17877","ref_index":8,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"PAIR: Prefix-Aware Internal Reward Model for Multi-Turn Agent Optimization","primary_cat":"cs.AI","submitted_at":"2026-05-18T05:39:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PAIR combines a hidden-state probe with an attention correction to deliver robust step-level rewards for GRPO-based optimization of multi-turn LLM agents, achieving high AUROC on contaminated trajectories at low cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17648","ref_index":8,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"SAPO: Step-Aligned Policy Optimization for Reasoning-Based Generative Recommendation","primary_cat":"cs.AI","submitted_at":"2026-05-17T20:53:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SAPO computes per-reasoning-step group-relative advantages in RL to improve credit assignment for structured generation of semantic identifiers in recommendation systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15513","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"CAPS: Cascaded Adaptive Pairwise Selection for Efficient Parallel Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-15T01:16:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CAPS is a four-stage inference-only cascade that adapts how much of each solution the verifier sees and how comparisons are distributed, halving per-candidate verifier tokens while outperforming uniform pairwise verification on most benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13369","ref_index":16,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Query-Conditioned Test-Time Self-Training for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-13T11:27:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"QueST adapts LLMs at test time by generating query-specific problem-solution pairs for self-supervised fine-tuning, improving reasoning performance without external data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11882","ref_index":20,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"On-Policy Self-Evolution via Failure Trajectories for Agentic Safety Alignment","primary_cat":"cs.AI","submitted_at":"2026-05-12T09:56:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FATE lets LLM agents self-evolve safer behaviors by generating and filtering repairs from their own failure trajectories using verifiers and Pareto optimization.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Cola: A choice leakage attack framework to expose privacy risks in subset training.arXiv preprint arXiv:2604.12342, 2026. [19] Yu Li, Haoyu Luo, Yuejin Xie, Yuqian Fu, Zhonghao Yang, Shuai Shao, Qihan Ren, Wanying Qu, Yanwei Fu, Yujiu Yang, et al. Atbench: A diverse and realistic trajectory benchmark for long-horizon agent safety.arXiv preprint arXiv:2604.02022, 2026. [20] Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let's verify step by step. InThe twelfth international conference on learning representations, 2023. [21] Alexander H Liu, Kartik Khandelwal, Sandeep Subramanian, Victor Jouault, Abhinav Rastogi, Adrien Sadé, Alan Jeffares, Albert Jiang, Alexandre Cahill, Alexandre Gavaudan, et al."},{"citing_arxiv_id":"2605.11853","ref_index":10,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"GEAR: Granularity-Adaptive Advantage Reweighting for LLM Agents via Self-Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T09:38:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GEAR adaptively reweights GRPO advantages in LLM RL by using divergence spikes from self-distillation to define semantic segments and modulate local credit.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[8] Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Yang Wu, et al. Deepseekmath: Pushing the limits of mathematical reasoning in open language models.arXiv preprint arXiv:2402.03300, 2024. [9] Richard S Sutton, Andrew G Barto, et al.Reinforcement learning: An introduction, volume 1. MIT press Cambridge, 1998. [10] Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let's verify step by step. InThe twelfth international conference on learning representations, 2023. [11] Siliang Zeng, Quan Wei, William Brown, Oana Frunza, Yuriy Nevmyvaka, Yang Katie Zhao, and Mingyi Hong."},{"citing_arxiv_id":"2605.11613","ref_index":5,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"From Generic Correlation to Input-Specific Credit in On-Policy Self Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:43:17+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-distillation token rewards measure input-response-feedback pointwise mutual information, and CREDIT extracts the input-specific component with contrastive baselines to improve LLM reasoning performance.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"This sparse signal provides no information about which tokens contributed to success and which were irrelevant or harmful, leading to high gradient variance and inefficient learning. Several lines of work address this bottleneck by providing denser reward signals. Process Reward Models (PRMs) train a separate model to score intermediate reasoning steps [5, 6, 7, 8], but require step-level annotations or extensive Monte Carlo rollouts. On-policy distillation (OPD) uses a stronger teacher model to provide token-level supervision on the student's own trajectories [9, 10, 11], offering dense on-policy signals but requiring access to a separate, often larger, teacher model whose quality upper-bounds the student."},{"citing_arxiv_id":"2605.11609","ref_index":14,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Anti-Self-Distillation for Reasoning RL via Pointwise Mutual Information","primary_cat":"cs.LG","submitted_at":"2026-05-12T06:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Anti-Self-Distillation reverses self-distillation signals via PMI to fix overconfidence on structural tokens, matching GRPO baseline accuracy 2-10x faster with up to 11.5 point gains across 4B-30B models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"The reward signal in RLVR, however, is typically a sparse, trajectory-level scalar: a single bit per rollout that does not indicate which intermediate step was responsible, leaving credit assignment to individual reasoning steps as an open problem. To address this, two main directions have emerged: training a separate process reward model (PRM) to score intermediate steps [14; 26; 18], or applying on-policy distillation (OPD) to provide a token-level imitation signal from a stronger teacher [1; 4; 17]. Both, however, depend on an external model. Can the model itself supply this credit? On-policy self-distillation answers this in the affirmative. It specializes OPD by taking the teacher to be the student itself, conditioned on privileged context: typically a verified solution and any feedback"},{"citing_arxiv_id":"2605.09536","ref_index":22,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"TAD: Temporal-Aware Trajectory Self-Distillation for Fast and Accurate Diffusion LLM","primary_cat":"cs.CL","submitted_at":"2026-05-10T13:38:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TAD improves the accuracy-parallelism trade-off in diffusion LLMs via temporal-aware self-distillation that applies hard labels to soon-to-be-decoded tokens and soft supervision to future tokens.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"aggressively expanding the near subset to maximize parallel throughput at a slight cost to accuracy. 4 Experiment 4.1 Experimental Details Training Dataset.As a trajectory-distillation method, we use prompts and ground-truth answers from public datasets, and let the model generate its own responses to construct the training data. For LLaDA-8B-Instruct [5], we sample prompts and ground-truth answers from the training splits of GSM8K [21], PRM12K [22] and a subset of KodCode [23]. We generate target trajectories with a sequence length of 256 and a block length of 32. We adopt a low-confidence remasking strategy, in which the model generates only one token at each step. We record the intermediate state at every decoding step. We then filter out a number of incorrect trajectories, most of which are caused by"},{"citing_arxiv_id":"2605.09188","ref_index":30,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"DARE: Difficulty-Adaptive Reinforcement Learning with Co-Evolved Difficulty Estimation","primary_cat":"cs.LG","submitted_at":"2026-05-09T22:05:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DARE co-evolves difficulty estimation and policy in RL for LLMs to improve training efficiency, final performance, and inference speed by using tailored strategies for different difficulty levels.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Knapsack RL: Unlocking exploration of LLMs via optimizing budget allocation. arXiv preprint arXiv:2509.25849, 2025. URLhttps://arxiv.org/abs/2509.25849. [29] Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let's verify step by step. InThe twelfth international conference on learning representations, 2023. [30] Jiawei Liu, Chunqiu Steven Xia, Yuyao Wang, and Lingming Zhang. Is your code generated by ChatGPT really correct? rigorous evaluation of large language models for code generation. In Advances in Neural Information Processing Systems, 2023. URL https://arxiv.org/abs/ 2305.01210. [31] Shicheng Liu and Minghui Zhu. Leveraging explanation to improve generalization of meta rein-"},{"citing_arxiv_id":"2605.08817","ref_index":24,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"How You Begin is How You Reason: Driving Exploration in RLVR via Prefix-Tuned Priors","primary_cat":"cs.AI","submitted_at":"2026-05-09T09:10:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"IMAX trains soft prefixes with an InfoMax reward to drive diverse exploration in RLVR, yielding up to 11.60% gains in Pass@4 over standard RLVR across model scales.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"00 23.1663.55 51.76 60.20 41.10 11.40 32.24 30.45 optimization, we use AdamW with a learning rate of 5×10 −4. The weight for the InfoMax reward is set to be 0.01. More implementation details are reported in Appendix D. Evaluation Details.For evaluation, we use lighteval 2 to test the algorithms on mathematical reasoning benchmarks including MATH-500 [24], GSM8K [8], Minerva Math [22], and AMC [15], and to show that our tuned prefixes do not degrade performance on general tasks, we also evaluate on an instruction following task-IFEval [53]. For prefix-tuned methods, evaluation uses the same stratified prefix assignment paradigm as in training: each prompt is paired with distinct C learned prefixes and the same number of rollouts Neval is generated for each prompt-prefix pair."},{"citing_arxiv_id":"2605.08715","ref_index":28,"ref_count":4,"confidence":0.55,"is_internal_anchor":false,"paper_title":"AgentForesight: Online Auditing for Early Failure Prediction in Multi-Agent Systems","primary_cat":"cs.CL","submitted_at":"2026-05-09T05:55:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AgentForesight introduces an online auditor model that predicts decisive errors in multi-agent trajectories at the earliest step using a coarse-to-fine reinforcement learning recipe on a new curated dataset AFTraj-2K.","context_count":2,"top_context_role":"background","top_context_polarity":"background","context_text":"In-the-flow agentic system optimization for effective planning and tool use.arXiv preprint arXiv:2510.05592, 2025. [27] Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let's verify step by step. InThe twelfth international conference on learning representations, 2023. [28] Bang Liu, Xinfeng Li, Jiayi Zhang, Jinlin Wang, Tanjin He, Sirui Hong, Hongzhang Liu, Shaokun Zhang, Kaitao Song, Kunlun Zhu, et al. Advances and challenges in foundation agents: From brain-inspired intelligence to evolutionary, collaborative, and safe systems.arXiv preprint arXiv:2504.01990, 2025. [29] Jiawei Liu, Chunqiu Steven Xia, Yuyao Wang, and Lingming Zhang."},{"citing_arxiv_id":"2605.08037","ref_index":20,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Beyond Pairs: Your Language Model is Secretly Optimizing a Preference Graph","primary_cat":"cs.LG","submitted_at":"2026-05-08T17:26:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GraphDPO generalizes pairwise DPO to a graph-structured Plackett-Luce objective over DAGs induced by rollout rankings, enforcing transitivity with linear complexity and recovering DPO as a special case.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"claims: (i) respecting partial orders through pref- erence graph construction improves over strict listwise objectives; (ii) modeling full transitive structure yields stronger alignment than pair- wise reductions; and (iii) ground-truth anchoring provides additional stability, particularly under limited rollout budgets. 5.2 MATH-500: Advanced Reasoning We evaluate on MATH-500 [ 20] using exact-match accuracy, focusing on generalization from GSM8K-style supervision to substantially harder multi-step problems. While the supervision signal remains binary, the distribution of rollouts becomes significantly more diverse due to longer reasoning chains and more complex failure modes. Comparison to baselines.As shown in Table 1, GraphDPO achieves 87."},{"citing_arxiv_id":"2605.06785","ref_index":17,"ref_count":2,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Distributional Process Reward Models: Calibrated Prediction of Future Rewards via Conditional Optimal Transport","primary_cat":"cs.LG","submitted_at":"2026-05-07T18:00:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Conditional optimal transport is used to turn raw PRM outputs into monotonic quantile functions that improve calibration and downstream Best-of-N performance on MATH-500 and AIME.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Nicholas Schiefer, Zac Hatfield-Dodds, Nova DasSarma, Eli Tran-Johnson, et al. Language models (mostly) know what they know.arXiv preprint arXiv:2207.05221, 2022. 10 [15] Diederik P Kingma and Jimmy Ba. Adam: A method for stochastic optimization.arXiv preprint arXiv:1412.6980, 2014. [16] Roger Koenker and Gilbert Bassett Jr. Regression quantiles.Econometrica: journal of the Econometric Society, pages 33-50, 1978. [17] Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let's verify step by step. InThe twelfth international conference on learning representations, 2023. [18] Hao Liu, Zi-Yi Dou, Yixin Wang, Nanyun Peng, and Yisong Yue. Uncertainty calibration for tool-using language agents."},{"citing_arxiv_id":"2605.06200","ref_index":15,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"A$^2$TGPO: Agentic Turn-Group Policy Optimization with Adaptive Turn-level Clipping","primary_cat":"cs.CL","submitted_at":"2026-05-07T13:09:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A²TGPO improves RL policy optimization for multi-turn agentic LLMs by normalizing information gain within same-depth turn groups, rescaling cumulative advantages by sqrt of term count, and modulating clipping ranges per turn's normalized IG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05007","ref_index":36,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Uno-Orchestra: Parsimonious Agent Routing via Selective Delegation","primary_cat":"cs.AI","submitted_at":"2026-05-06T15:07:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A learned orchestration policy for LLM agents that jointly optimizes task decomposition and selective routing to (model, primitive) pairs, delivering 77% macro pass@1 at 10x lower cost than strong baselines across 13 benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13602","ref_index":7,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Reward Hacking in the Era of Large Models: Mechanisms, Emergent Misalignment, Challenges","primary_cat":"cs.LG","submitted_at":"2026-04-15T08:11:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper introduces the Proxy Compression Hypothesis as a unifying framework explaining reward hacking in RLHF as an emergent result of compressing high-dimensional human objectives into proxy reward signals under optimization pressure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10701","ref_index":23,"ref_count":1,"confidence":0.55,"is_internal_anchor":false,"paper_title":"Bringing Value Models Back: Generative Critics for Value Modeling in LLM Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-12T15:54:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GenAC introduces generative critics with chain-of-thought reasoning and in-context conditioning to improve value approximation and downstream RL performance in LLMs compared to value-based and value-free baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}