{"total":13,"items":[{"citing_arxiv_id":"2606.30775","ref_index":180,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Single Rewrite Suffices: Empirical Lessons from Production Skill Description Optimization","primary_cat":"cs.CL","submitted_at":"2026-06-29T18:06:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A single LLM rewrite of skill descriptions using false positive and negative cases matches manual optimization performance in production, with most other pipeline components adding little value.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28187","ref_index":5,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"GBC: Gradient-Based Connections for Optimizing Multi-Agent Systems","primary_cat":"cs.MA","submitted_at":"2026-06-26T15:32:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GBC treats multi-agent LLM workflows as differentiable graphs to enable token-level attribution and targeted optimization, with reported gains on MultiWOZ and τ-bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.27243","ref_index":18,"ref_count":2,"confidence":0.88,"is_internal_anchor":false,"paper_title":"NOVA: A Verification-Aware Agent Harness for Architecture Evolution in Industrial Recommender Systems","primary_cat":"cs.IR","submitted_at":"2026-06-25T16:30:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NOVA introduces a level-aware agent harness with architecture gradient and verification cascade to automate recommender architecture evolution while reducing silent failures and human effort.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18889","ref_index":46,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Improving Medical Communication using Rubric-Guided Counterfactual Recommendations","primary_cat":"cs.CL","submitted_at":"2026-06-17T10:07:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"An LM-guided counterfactual pipeline recommends minimal ordinal changes to communication features like tone and actionability, yielding a mean +6.41% gain in predicted positive feedback under independent auditor models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09421","ref_index":11,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"What Should a Skill Remember? Quality--Cost Trade-offs in Cost-Aware Skill Rewriting for Language Model Agents","primary_cat":"cs.CL","submitted_at":"2026-06-08T12:36:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Empirical study demonstrates that cost-aware skill rewriting for LLM agents can achieve 7% total cost reduction and 6% agent-token cost reduction with preserved quality on SkillsBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04661","ref_index":15,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CRAFT: Cost-aware Refinement And Front-aware Tuning of Prompts","primary_cat":"cs.CL","submitted_at":"2026-06-03T09:40:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CRAFT is a Pareto-front prompt optimizer that allocates scarce LLM validation calls to candidates near the current front using accuracy- and cost-oriented generators plus NSGA-II retention.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28882","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"GrowLoop: Self-Evolving Conversation Evaluation Seeded by Human","primary_cat":"cs.CL","submitted_at":"2026-05-26T16:53:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GrowLoop proposes a human-seeded self-evolving framework that co-evolves rubrics and cases to evaluate conversational human-likeness with differentiated agreement rules.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18113","ref_index":34,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"iPOE: Interpretable Prompt Optimization via Explanations","primary_cat":"cs.CL","submitted_at":"2026-05-18T09:21:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"iPOE generates and optimizes annotation guidelines from explanations to produce interpretable prompts, reporting up to 39% gains over baselines on four datasets with LLM explanations substituting for human ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16551","ref_index":2,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"PQR: A Framework to Generate Diverse and Realistic User Queries that Elicit QA Agent Failures","primary_cat":"cs.CL","submitted_at":"2026-05-15T18:50:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PQR framework generates diverse realistic queries to elicit QA agent failures, uncovering 23-78% more unhelpful responses than prior methods in e-commerce agent tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04107","ref_index":19,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"TSCG: Deterministic Tool-Schema Compilation for Agentic LLM Deployments","primary_cat":"cs.SE","submitted_at":"2026-05-04T15:35:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TSCG compiles JSON tool schemas into token-efficient structured text, raising tool-use accuracy for small LLMs from 0% to 84.4% on benchmarks while cutting tokens by 52-57%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19821","ref_index":13,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"JTPRO: A Joint Tool-Prompt Reflective Optimization Framework for Language Agents","primary_cat":"cs.AI","submitted_at":"2026-04-20T05:37:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"JTPRO co-optimizes prompts and tool descriptions via reflection to raise overall success rate by 5-20% over baselines on multi-tool benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.20249","ref_index":17,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Unified Multimodal Brain Decoding via Cross-Subject Soft-ROI Fusion","primary_cat":"cs.LG","submitted_at":"2025-12-23T11:04:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BrainROI achieves leading cross-subject brain-captioning results on NSD by combining multi-atlas soft-ROI fusion with interpretable prompt optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.07691","ref_index":106,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ORPO: Monolithic Preference Optimization without Reference Model","primary_cat":"cs.CL","submitted_at":"2024-03-12T14:34:08+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"ORPO performs preference alignment during supervised fine-tuning via a monolithic odds ratio penalty, allowing 7B models to outperform larger state-of-the-art models on alignment benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}