{"total":37,"items":[{"citing_arxiv_id":"2605.21748","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RankJudge: A Multi-Turn LLM-as-a-Judge Synthetic Benchmark Generator","primary_cat":"cs.CL","submitted_at":"2026-05-20T21:20:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RankJudge creates paired multi-turn conversations with isolated single-turn flaws to generate unambiguous benchmarks for LLM-as-a-judge systems across ML, biomedicine, and finance domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18630","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCICONVBENCH: Benchmarking LLMs on Multi-Turn Clarification for Task Formulation in Computational Science","primary_cat":"cs.AI","submitted_at":"2026-05-18T16:34:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SCICONVBENCH is a new benchmark evaluating LLMs on multi-turn disambiguation and inconsistency resolution for task formulation in computational science, with frontier models reaching only 52.7% success on fluid mechanics disambiguation cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17193","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-LLM Systems Exhibit Robust Semantic Collapse","primary_cat":"cs.MA","submitted_at":"2026-05-16T23:29:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Closed-loop multi-LLM systems exhibit robust semantic collapse across model families and interventions, consistent with intrinsic properties of autoregressive generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14558","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Resolving Action Bottleneck: Agentic Reinforcement Learning Informed by Token-Level Energy","primary_cat":"cs.LG","submitted_at":"2026-05-14T08:33:02+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ActFocus resolves the action bottleneck in agentic RL by reweighting token gradients toward action tokens using observed reward variance and an energy-based uncertainty term, outperforming PPO and GRPO by up to 65 percentage points.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12922","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Attention Closes: How LLMs Lose the Thread in Multi-Turn Interaction","primary_cat":"cs.AI","submitted_at":"2026-05-13T02:58:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Attention to goal tokens declines in multi-turn LLM interactions while residual representations often retain decodable goal information, and the gap between these predicts whether goal-conditioned behavior survives.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.12357","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"$\\delta$-mem: Efficient Online Memory for Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-12T16:31:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"δ-mem augments frozen LLMs with an 8x8 online memory state updated by delta-rule learning to generate low-rank attention corrections, delivering 1.10x average gains over the backbone and larger improvements on memory-heavy tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11317","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SOMA: Efficient Multi-turn LLM Serving via Small Language Model","primary_cat":"cs.CL","submitted_at":"2026-05-11T23:07:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SOMA estimates a local response manifold from early turns and adapts a small surrogate model via divergence-maximizing prompts and localized LoRA fine-tuning for efficient multi-turn serving.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Lora: Low-rank adaptation of large language models.ICLR, 1(2):3, 2022. [24] Jinwoo Jeong and Jeongseob Ahn. Accelerating llm serving for multi-turn dialogues with efficient resource management. InProceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2, pages 1-15, 2025. [25] Philippe Laban, Hiroaki Hayashi, Yingbo Zhou, and Jennifer Neville. Llms get lost in multi-turn conversation.arXiv preprint arXiv:2505.06120, 2025. [26] Huayang Li, Tian Lan, Zihao Fu, Deng Cai, Lemao Liu, Nigel Collier, Taro Watanabe, and Yixuan Su. Repetition in repetition out: Towards understanding neural text degeneration from the data perspective."},{"citing_arxiv_id":"2605.10481","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Safe Multi-Agent Behavior Must Be Maintained, Not Merely Asserted: Constraint Drift in LLM-Based Multi-Agent Systems","primary_cat":"cs.MA","submitted_at":"2026-05-11T12:43:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Safety constraints in LLM-based multi-agent systems commonly weaken during execution through memory, communication, and tool use, requiring them to be maintained as explicit state rather than asserted once.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[16] LangChain. Workflows and agents. https://docs.langchain.com/oss/python/ langgraph/workflows-agents, 2026. LangGraph documentation. Accessed: 2026-05-06. [17] Lauro Langosco, Jack Koch, Lee Sharkey, Jacob Pfau, Laurent Orseau, and David Krueger. Goal misgeneralization in deep reinforcement learning, 2023. URL https://arxiv.org/ abs/2105.14111. [18] Donghyun Lee and Mo Tiwari. Prompt infection: Llm-to-llm prompt injection within multi- agent systems, 2024. URLhttps://arxiv.org/abs/2410.07283. [19] Hunter Lightman, Vineet Kosaraju, Yura Burda, Harri Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let's verify step by step, 2023. URL https://arxiv.org/abs/2305."},{"citing_arxiv_id":"2605.18799","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReCrit: Transition-Aware Reinforcement Learning for Scientific Critic Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-11T09:22:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ReCrit frames critic interaction as a correctness-transition problem and uses quadrant-based RL rewards to improve LLM performance on scientific reasoning benchmarks by rewarding corrections and robustness while penalizing sycophancy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09808","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Quantifying the Utility of User Simulators for Building Collaborative LLM Assistants","primary_cat":"cs.CL","submitted_at":"2026-05-10T23:06:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Fine-tuned simulators grounded in real human data produce LLM assistants that win more often against real users than those trained against role-playing simulators.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"with other simulators at test time, while the one trained against fine-tuned simulator does. Together, these results argue for grounding user simulators in real human behavior and measuring their quality by their downstream effect on real users. 2 1 Introduction A good AI assistant should interact effectively with human users across multiple turns of conversation [1, 2]. Training and evaluating such assistants require multi-turn human-AI interactions, but access to human users is often limited [ 3]. Recent works have therefore explored building LLM-based user simulators, where the simulator produces the user's turns in a conversation with the assistant [4]. Simulation offers data collection that is scalable, cost-effective, and free from safety risks of"},{"citing_arxiv_id":"2605.04180","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MedFabric and EtHER: A Data-Centric Framework for Word-Level Fabrication Generation and Detection in Medical LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-05T18:19:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MedFabric dataset and EtHER detector achieve over 15% better word-level fabrication detection in medical LLMs than prior methods by generating stylistically faithful errors and using decomposition-based checking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03261","ref_index":133,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Can AI Help You Get Over Your Breakup? One Session with a Belief-Reframing Chatbot Shows Sustained Distress Reduction","primary_cat":"cs.HC","submitted_at":"2026-05-05T01:22:20+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A pre-registered RCT found that one session with a belief-reframing AI chatbot produced significantly greater reductions in breakup distress than a survey-only control at 7 days, with a smaller effect persisting at 1 month.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23051","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Evaluating Temporal Consistency in Multi-Turn Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-24T22:44:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Language models frequently violate temporal scope stability in multi-turn dialogues by drifting toward present-day assumptions even when they possess the correct facts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22452","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Superminds Test: Actively Evaluating Collective Intelligence of Agent Society via Probing Agents","primary_cat":"cs.AI","submitted_at":"2026-04-24T11:11:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Large-scale experiments on two million agents reveal that collective intelligence does not emerge from scale alone due to sparse and shallow interactions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19656","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Pause or Fabricate? Training Language Models for Grounded Reasoning","primary_cat":"cs.CL","submitted_at":"2026-04-21T16:45:29+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRIL uses stage-specific RL rewards to train LLMs to detect missing premises, pause proactively, and resume grounded reasoning after clarification, yielding up to 45% better premise detection and 30% higher task success on insufficient math datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18543","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ClawEnvKit: Automatic Environment Generation for Claw-Like Agents","primary_cat":"cs.AI","submitted_at":"2026-04-20T17:36:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ClawEnvKit automates generation of diverse verified environments for claw-like agents from natural language, producing the Auto-ClawEval benchmark of 1,040 environments that matches human-curated quality at 13,800x lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17377","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AnchorMem: Anchored Facts with Associative Contexts for Building Memory in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-19T11:02:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AnchorMem decouples atomic fact anchors and associative event graphs for retrieval from preserved raw interaction contexts, outperforming prior memory methods on the LoCoMo benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17301","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"RoTRAG: Rule of Thumb Reasoning for Conversation Harm Detection with Retrieval-Augmented Generation","primary_cat":"cs.CL","submitted_at":"2026-04-19T07:35:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RoTRAG retrieves Rules of Thumb to ground LLM reasoning for harm detection and severity classification in multi-turn dialogues, reporting roughly 40% relative F1 gains and 8.4% lower distributional error on two safety benchmarks while cutting redundant retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17091","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"GenericAgent: A Token-Efficient Self-Evolving LLM Agent via Contextual Information Density Maximization (V1.0)","primary_cat":"cs.CL","submitted_at":"2026-04-18T17:59:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GenericAgent outperforms other LLM agents on long-horizon tasks by maximizing context information density with fewer tokens via minimal tools, on-demand memory, trajectory-to-SOP evolution, and compression.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[10] Lost in the middle: How language models use long contexts.https://arxiv.org/abs/2307.03172, 2023. [11] Chenxin An, Jun Zhang, Ming Zhong, Lei Li, Shansan Gong, Yao Luo, Jingjing Xu, and Lingpeng Kong. Why does the effective context length of llms fall short? InThe Thirteenth International Conference on Learning Representations, 2025. ICLR 2025 Poster. [12] Llms get lost in multi-turn conversation.https://arxiv.org/abs/2505.06120, 2025. [13] Aditya Rajasekaran et al. Effective context engineering for ai agents.https://www.anthropic.com/engineering/ effective-context-engineering-for-ai-agents, September 2025. [14] Prateek Chhikara, Dev Khant, Saket Aryan, Taranjeet Singh, and Deshraj Yadav. Mem0: Building production-"},{"citing_arxiv_id":"2604.16804","ref_index":59,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AutoOR: Scalably Post-training LLMs to Autoformalize Operations Research Problems","primary_cat":"cs.LG","submitted_at":"2026-04-18T03:24:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AutoOR uses synthetic data generation and RL post-training with solver feedback to enable 8B LLMs to autoformalize linear, mixed-integer, and non-linear OR problems, matching larger models on benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"AutoOR: Scalably Post-training LLMs to Autoformalize Operations Research Problems language models, 2024. URL https:// arxiv.org/abs/2406.10305. [58] Yuxiang Wei, Olivier Duchenne, Jade Copet, Quentin Carbonneaux, Lingming Zhang, Daniel Fried, Gabriel Synnaeve, Rishabh Singh, and Sida I. Wang. Swe- rl: Advancing llm reasoning via reinforce- ment learning on open software evolution, 2025. URL https://arxiv.org/abs/ 2502.18449. [59] Cansu Sancaktar, David Zhang, Gabriel Syn- naeve, and Taco Cohen. A deep dive into scaling rl for code generation with synthetic data and curricula, 2026. URL https: //arxiv.org/abs/2603.24202. [60] Dulhan Jayalath, Shashwat Goel, Thomas Foster, Parag Jain, Suchin Gururangan, Cheng Zhang, Anirudh Goyal, and Alan Schelten. Compute as teacher: Turning in-"},{"citing_arxiv_id":"2604.15625","ref_index":22,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ZORO: Active Rules for Reliable Vibe Coding","primary_cat":"cs.HC","submitted_at":"2026-04-17T02:06:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZORO integrates rules directly into AI coding workflows by enriching plans, enforcing compliance with proof requirements, and evolving rules via user feedback, resulting in better rule adherence and shifts in user behavior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15597","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLMs Corrupt Your Documents When You Delegate","primary_cat":"cs.CL","submitted_at":"2026-04-17T00:33:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs corrupt an average of 25% of document content during long delegated editing workflows across 52 domains, even frontier models, and agentic tools do not mitigate the issue.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18768","ref_index":165,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClinQueryAgent: A Conversational Agent for Population Health Management","primary_cat":"cs.IR","submitted_at":"2026-04-13T10:07:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper introduces ClinQueryAgent, a conversational agent that converts natural language queries into database queries for population health management while keeping patient data secure, and reports its use by 128 staff across 15 NHS practices covering 148,319 patients.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08782","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MT-OSC: Path for LLMs that Get Lost in Multi-Turn Conversation","primary_cat":"cs.CL","submitted_at":"2026-04-09T21:39:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MT-OSC condenses chat history via a one-off sequential process with a few-shot Condenser and lightweight Decider to reduce tokens and preserve LLM accuracy in multi-turn settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07864","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ZeroCoder: Can LLMs Improve Code Generation Without Ground-Truth Supervision?","primary_cat":"cs.SE","submitted_at":"2026-04-09T06:24:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ZeroCoder co-evolves coder and tester LLMs via self-generated code-test execution feedback to improve code generation up to 21.6% without ground-truth supervision.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"pirically effective, matrix rank is only an approximation and may not fully characterize all aspects of supervision quality. For exam- ple, two interaction matrices with the same rank may still differ substantially in the usefulness of the rewards they induce. Future work could explore more expressive characterizations of interac- tion structure, such as covariance-based statistics [18] and spectral properties [31] of the passing matrix. 8 Conclusion We present ZeroCoder, a label-free co-evolutionary framework that improves code generation and test generation from self-generated code-test interactions. By converting passing matrices into role- specific rewards, ZeroCoder enables the coder and tester to improve without relying on ground-truth supervision."},{"citing_arxiv_id":"2604.07549","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"EMSDialog: Synthetic Multi-person Emergency Medical Service Dialogue Generation from Electronic Patient Care Reports via Multi-LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-04-08T19:52:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EMSDialog is a dataset of 4,414 synthetic multi-speaker EMS dialogues generated by a multi-LLM agent pipeline grounded in ePCR reports, annotated with diagnoses, roles, and topics, and shown to improve accuracy, timeliness, and stability in conversational diagnosis prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.05552","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Context-Agent: Dynamic Discourse Trees for Non-Linear Dialogue","primary_cat":"cs.CL","submitted_at":"2026-04-07T07:54:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Context-Agent represents dialogue history as a dynamic tree to handle non-linear topic shifts and introduces the NTM benchmark for evaluating long-horizon non-linear dialogues.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22773","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Trace Mutation in Human-LLM Dialogue: The Transcript as Forensic and Mitigation Surface","primary_cat":"cs.HC","submitted_at":"2026-03-31T03:05:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Trace mutations are a class of context failures in LLM conversations consisting of utterance effacement and genitive dissociation that distort the shared record while resisting ordinary repair.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13061","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Token Statistics Reveal Conversational Drift in Multi-turn LLM Interaction","primary_cat":"cs.CL","submitted_at":"2026-03-18T18:10:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Bipredictability from token statistics monitors structural consistency in multi-turn LLM interactions, showing 85% alignment with structure but only 44% with semantics and 100% sensitivity to tested drifts across 4574 turns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.16746","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SWE-Pruner: Self-Adaptive Context Pruning for Coding Agents","primary_cat":"cs.SE","submitted_at":"2026-01-23T13:51:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SWE-Pruner trains a lightweight neural skimmer to perform task-aware pruning of code contexts for LLM agents, delivering 23-54% token reduction on SWE-Bench Verified with improved success rates and up to 14.84x compression on LongCodeQA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.23578","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Style Amnesia: Investigating Speaking Style Degradation and Mitigation in Multi-Turn Spoken Language Models","primary_cat":"cs.CL","submitted_at":"2025-12-29T16:23:54+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Spoken language models exhibit style amnesia and fail to maintain instructed paralinguistic styles across multi-turn conversations, with explicit recall offering partial mitigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.10931","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Asynchronous Reasoning: Training-Free Interactive Thinking LLMs","primary_cat":"cs.LG","submitted_at":"2025-12-11T18:57:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Using properties of positional embeddings, reasoning LLMs can be made to think, listen, and generate outputs asynchronously without any additional training, cutting time to first token to under 5 seconds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.05921","ref_index":2,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Prompt reinforcing for long-term planning of large language models","primary_cat":"cs.CL","submitted_at":"2025-10-07T13:30:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A prompt optimization framework uses turn-by-turn feedback and experience replay to rewrite task instructions, yielding reported gains on multi-turn text-to-SQL and task-oriented dialogue tasks across different LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.11295","ref_index":75,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Prompt Engineering Report Distilled: Quick Start Guide for Life Sciences","primary_cat":"cs.CL","submitted_at":"2025-09-14T14:39:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"The paper reduces a broad set of prompt engineering techniques to six core approaches and applies them to life sciences use cases while addressing common LLM pitfalls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.15815","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"User-Assistant Bias in LLMs","primary_cat":"cs.CL","submitted_at":"2025-08-16T20:33:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs show strong user bias in role-tagged contexts that is amplified by preference alignment and can be reduced or controlled through targeted fine-tuning and DPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.11198","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Temperature and Persona Shape LLM Agent Consensus With Minimal Accuracy Gains in Qualitative Coding","primary_cat":"cs.CL","submitted_at":"2025-07-15T11:06:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Temperature and persona variations shape consensus speed in LLM multi-agent coding but produce no robust accuracy gains over single agents on human-annotated tutoring transcripts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.19630","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Real-World Doctor Agent with Proactive Consultation through Multi-Agent Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2025-05-26T07:48:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DoctorAgent-RL trains a Qwen2.5-7B doctor agent via multi-agent RL on the new MTMedDialog dataset to conduct dynamic, question-driven consultations, reaching 70% exact diagnostic match in real-patient trials.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}