{"total":210,"items":[{"citing_arxiv_id":"2606.27251","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Advancing Omnimodal Embodied Agents from Isolated Skills to Everyday Physical Autonomy","primary_cat":"cs.RO","submitted_at":"2026-06-25T16:36:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"OmniAct framework integrates planning, memory, and verification to enable persistent autonomy in omnimodal embodied agents, showing improved success and stable context in 40 real-world tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19847","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AtomMem: Building Simple and Effective Memory System for LLM Agents via Atomic Facts","primary_cat":"cs.CL","submitted_at":"2026-06-18T06:56:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"AtomMem introduces atomic-fact extraction, hierarchical event structures, and an associative memory graph to build stable long-term memory for LLM agents, claiming SOTA results on the LoCoMo benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19144","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Human-AI Coevolution Dynamics: A Formal Theory of Social Intelligence Emergence Through Long-Term Interaction","primary_cat":"cs.AI","submitted_at":"2026-06-17T14:47:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes HACD-H framework integrating emotional adaptation, relational organization, memory and personality into a dynamical system and reports empirical patterns from a 14,700-turn dataset linking social intelligence to reduced social cognitive energy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18406","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CoreMem: Riemannian Retrieval and Fisher-Guided Distillation for Long-Term Memory in Dialogue Agents","primary_cat":"cs.CL","submitted_at":"2026-06-16T18:56:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoreMem replaces cosine retrieval with Fisher-Rao Riemannian matching and introduces Fisher-guided discrete token distillation for syntax-aware compression, reporting +4.51 pp open-domain and +4.17 pp temporal gains on LOCOMO and LongMemEval-S while staying inside an 8 GB VRAM budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17162","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemSlides: A Hierarchical Memory Driven Agent Framework for Personalized Slide Generation with Multi-turn Local Revision","primary_cat":"cs.CL","submitted_at":"2026-06-15T18:02:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MemSlides introduces a three-part memory hierarchy (user profile, working, tool) with scoped local revision for multi-turn personalized slide generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13177","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemRefine: LLM-Guided Compression for Long-Term Agent Memory","primary_cat":"cs.CL","submitted_at":"2026-06-11T10:46:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MemRefine uses LLM factual judgments to iteratively compress agent memory to target budgets while preserving downstream task performance better than rule-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11051","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Making Software Meaningful","primary_cat":"cs.SE","submitted_at":"2026-06-09T16:16:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Committing to explicit meaning via a domain-grounded vocabulary of individuals, actions, facts, and concepts improves software usability, enables modular LLM code generation, and supports accountable agent behavior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10749","ref_index":138,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Toward Secure LLM Agents: Threat Surfaces, Attacks, Defenses, and Evaluation","primary_cat":"cs.CR","submitted_at":"2026-06-09T12:01:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A synthesis of 247 papers on LLM agent security identifies prompt injection and tool hijacking as dominant threats, notes weakly compositional defenses, and argues for trust boundaries and realistic evaluations.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[137] Zhenyu Pan, Yiting Zhang, Zhuo Liu, Yolo Yunlong Tang, Zeliang Zhang, Haozheng Luo, Yuwei Han, Jianshu Zhang, Dennis Wu, Hong-Yu Chen, Haoran Lu, Haoyang Fang, Manling Li, Chenliang Xu, Philip S. Yu, and Han Liu. 2025. AdvEvo-MARL: Shaping Internalized Safety through Adversarial Co-Evolution in Multi-Agent Reinforcement Learning. arXiv:2510.01586 [cs.AI] doi:10.48550/arXiv.2510.01586 [138] Vaidehi Patil, Elias Stengel-Eskin, and Mohit Bansal. 2025. The Sum Leaks More Than Its Parts: Compositional Privacy Risks and Mitigations in Multi-Agent Collaboration. arXiv:2509.14284 [cs.CR] doi:10.48550/arXiv.2509.14284 [139] Atharv Singh Patlan, Peiyao Sheng, S. Ashwin Hebbar, Prateek Mittal, and Pramod Viswanath. 2025. Real AI Agents with Fake Memories: Fatal Context Manipulation Attacks on Web3 Agents."},{"citing_arxiv_id":"2606.10532","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ActiveMem: Distributed Active Memory for Long-Horizon LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-06-09T08:03:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ActiveMem proposes a heterogeneous distributed memory framework for LLM agents that separates planning from active memory management, reporting SOTA accuracy with lower overhead on BrowseComp-Plus and GAIA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10106","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What makes a harness a harness: necessary and sufficient conditions for an agent harness","primary_cat":"cs.SE","submitted_at":"2026-06-08T19:35:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proposes and tests a constitutive definition of 'agent harness' via conceptual analysis of literature and six real systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08950","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When More Cores Hurts: The Vector Database Scaling Paradox in HPC","primary_cat":"cs.DC","submitted_at":"2026-06-08T02:51:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Large-scale HPC evaluation of Qdrant, Milvus, and Weaviate reveals that workload patterns limit scaling and extra cores can reduce throughput, exposing a cloud-to-HPC design mismatch.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05023","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Scaling Expert Feedback with Reflective Edit Propagation in Compositional Knowledge Bases","primary_cat":"cs.HC","submitted_at":"2026-06-03T15:47:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RAID is a reflective agent system that infers intent from single expert edits and propagates corrections across compositional knowledge bases through a three-step architecture.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01139","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillRevise: Improving LLM-Authored Agent Skills via Trace-Conditioned Skill Revision","primary_cat":"cs.AI","submitted_at":"2026-05-31T10:19:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillRevise iteratively refines initial LLM-generated agent skills using execution traces to diagnose defects and apply repairs, raising success rates from 36.05% to 61.63% on SkillsBench across three benchmarks and five LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00832","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Momento: Evaluating Persistent Memory and Reasoning with Multi-Session Agentic Conversations","primary_cat":"cs.CL","submitted_at":"2026-05-30T18:08:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Momento benchmark reveals current agents fail at multi-session tasks mainly by misestimating user state and treating old session history as current context instead of stale data needing re-validation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00728","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Empathy to Personalized Empathy: Adapting Empathetic Strategies to Individual Users","primary_cat":"cs.CL","submitted_at":"2026-05-30T13:49:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces personalized empathy task, PersonaEmp dataset from long-term interactions, and PereGRM reward framework that combines empathy evaluation with dynamic criteria for improved adaptation to user personas.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00619","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemPro: Agentic Memory Systems as Evolvable Programs","primary_cat":"cs.CL","submitted_at":"2026-05-30T08:47:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MemPro evolves the entire MCR pipeline as runnable programs via failure-guided refinement on a version tree and outperforms static baselines on LongMemEval, LoCoMo, HotpotQA, and NarrativeQA.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30771","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Eywa: Provenance-Grounded Long-Term Memory for AI Agents","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:56:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Eywa introduces a provenance-grounded memory system for persistent AI agents featuring evidence-first storage, typed validation, and deterministic multi-route retrieval, reporting 90.19% accuracy on LoCoMo and 88.2% on LongMemEval-S.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30434","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LongDS-Bench: On the Failure of Long-Horizon Agentic Data Analysis","primary_cat":"cs.LG","submitted_at":"2026-05-28T18:00:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LongDS benchmark shows state-of-the-art agents achieve only 48.45% accuracy on long-horizon data analysis tasks, with performance dropping 47 points from early to late turns and state-maintenance errors causing most failures.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30260","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How LoRA Remembers? A Parametric Memory Law for LLM Finetuning","primary_cat":"cs.CL","submitted_at":"2026-05-28T17:22:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces Parametric Memory Law as power law for LoRA memory capacity and MemFT threshold-guided optimization for better memory fidelity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30159","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Meta-Cognitive Memory Policy Optimization for Long-Horizon LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-28T16:17:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MMPO introduces Belief Entropy as a self-supervised signal to provide fine-grained supervision for memory policies in LLM agents, outperforming outcome-based RL on long-horizon tasks up to 1.75M tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30087","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Selective QA over Conflicting Multi-Source Personal Memory: A Diagnostic Testbed and Method Comparison","primary_cat":"cs.AI","submitted_at":"2026-05-28T15:33:39+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a benchmark with 34,560 instances for selective QA over conflicting multi-source personal memory and compares fusion methods against LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29960","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hijacking Agent Memory: Stealthy Trojan Attacks Through Conversational Interaction","primary_cat":"cs.CR","submitted_at":"2026-05-28T14:02:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemPoison enables stealthy memory poisoning in LLM agents via dialogue by using semantic relational bridges, entity masquerading, and joint embedding optimization to bypass selective extraction and rewriting, achieving up to 0.95 attack success rate.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29734","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HTAM: Hierarchical Transition-Attended Memory for Operator Optimization","primary_cat":"cs.CL","submitted_at":"2026-05-28T10:29:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HTAM builds a Hierarchical Transition Graph to organize coarse global directions and detailed local strategies for guiding LLM-based CUDA kernel optimization, improving results on KernelBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29668","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GRASP: Gated Regression-Aware Skill Proposer for Self-Improving LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-28T09:30:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GRASP adds a regression-aware acceptance gate to skill proposal for LLM agents, producing large gains on clinical benchmarks while preventing silent regressions on prior behavior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29640","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VikingMem: A Memory Base Management System for Stateful LLM-based Applications","primary_cat":"cs.AI","submitted_at":"2026-05-28T09:07:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"VikingMem implements the Memory Base paradigm via event-centric extraction and entity updates on VikingDB with temporal compression, claiming up to 30% better retrieval effectiveness on long-term memory benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29630","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Entity-Collision: A Stratified Protocol for Attributing Retrieval Lift in Agent Memory","primary_cat":"cs.CL","submitted_at":"2026-05-28T09:02:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Entity-collision protocol stratifies agent-memory retrieval tests by tag and pins BM25 floor via shared entity tokens to attribute lift specifically to embedders.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29368","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SURGENT: A Surgical Multi-Agent Assistance System Across the Perioperative Workflow","primary_cat":"cs.CL","submitted_at":"2026-05-28T05:12:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SURGENT is a multi-agent surgical assistance system with novel memory management that outperforms baseline LLMs on case analysis, plan simulation, safety monitoring, risk assessment, and rehabilitation guidance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29313","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PatchBoard: Schema-Grounded State Mutation for Reliable and Auditable LLM Multi-Agent Collaboration","primary_cat":"cs.CL","submitted_at":"2026-05-28T03:43:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PatchBoard introduces schema-grounded JSON Patch state mutations with an Architect agent and validation kernel, reporting 84.6% success and lower token use on 630 ALFWorld episodes versus LangGraph and Flock baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28773","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Rethinking Memory as Continuously Evolving Connectivity","primary_cat":"cs.CL","submitted_at":"2026-05-27T17:35:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FluxMem evolves memory as a heterogeneous graph via three refinement stages and reports consistent state-of-the-art results on LoCoMo, Mind2Web, and GAIA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28062","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ConvMemory: A Lightweight Learned Memory Reranker, a Negative Attribution Result, and a Research-Preview Conflict Editor","primary_cat":"cs.CL","submitted_at":"2026-05-27T07:14:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ConvMemory delivers competitive recall at far lower latency than larger rerankers for long-term conversational memory while a multi-seed ablation refutes temporal-structure exploitation as the operative mechanism.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27864","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FundaPod: A Multi-Persona Agent Pod Platform with Knowledge Graph Memory for AI-Assisted Fundamental Investment Research","primary_cat":"cs.AI","submitted_at":"2026-05-27T02:26:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"FundaPod presents a multi-persona AI agent architecture with knowledge-graph memory to support human-adjudicated fundamental investment research through independent agent work and verifiable evidence links.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27787","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Long Live the Librarian! A Persistent Search Sub-Agent for Energy-Efficient Multi-Agent Software Engineering Systems","primary_cat":"cs.MA","submitted_at":"2026-05-27T00:10:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Librarian reduces per-episode GPU energy use by up to 25% in existing multi-agent SWE systems on SWE-Bench Verified by tracking search history and minimizing redundant output tokens while preserving task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23574","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Push Your Agent: Measuring and Enforcing Quantitative Goal Persistence in Long-Horizon LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-05-22T12:44:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces QGP and PushBench to evaluate LLM agent persistence on quantitative goals, showing specialized controllers outperform baselines on verifier-checked artifact collection tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23067","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Training Data Teaches RL Memory Agents: An Empirical Study of Curriculum Effects in Memory-Augmented QA","primary_cat":"cs.CL","submitted_at":"2026-05-21T21:58:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Controlled study shows mixed training curricula improve aggregate F1 on memory QA benchmarks while out-of-domain data transfers targeted skills like temporal reasoning, with per-question-type effects exceeding aggregate differences.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22566","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GraphFlow: A Graph-Based Workflow Management for Efficient LLM-Agent Serving","primary_cat":"cs.LG","submitted_at":"2026-05-21T14:45:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"GraphFlow uses a unified wGraph to dynamically instantiate workflows and manage KV caches for LLM agents, reporting 4.95 pp average gains and 4x memory reduction on five benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22411","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeferMem: Query-Time Evidence Distillation via Reinforcement Learning for Long-Term Memory QA","primary_cat":"cs.CL","submitted_at":"2026-05-21T12:36:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeferMem decouples memory QA into high-recall retrieval and RL-based query-conditioned evidence distillation, outperforming baselines on LoCoMo and LongMemEval-S with highest accuracy, fastest runtime, and zero API token cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22343","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sibyl-AutoResearch: Autonomous Research Needs Self-Evolving Trial-and-Error Harnesses, Not Paper Generators","primary_cat":"cs.MA","submitted_at":"2026-05-21T11:29:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Sibyl-AutoResearch introduces self-evolving trial-and-error harnesses with auditable conversion units that link trial signals to updated research behaviors and harness repairs in autonomous systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22148","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Ratchet: A Minimal Hygiene Recipe for Self-Evolving LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-21T08:20:38+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Ratchet provides a minimal hygiene recipe for self-managing skill libraries in frozen LLM agents, delivering +0.328 rolling-mean pass@1 gain on MBPP+ hard-100 and +0.22 peak lift on SWE-bench Verified.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21997","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"The Log is the Agent: Event-Sourced Reactive Graphs for Auditable, Forkable Agentic Systems","primary_cat":"cs.AI","submitted_at":"2026-05-21T04:55:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ActiveGraph inverts traditional agent frameworks by treating the append-only event log as the primary source of truth, from which the reactive graph is projected, yielding deterministic replay, forking, and lineage tracking.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21768","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Memory-R2: Fair Credit Assignment for Long-Horizon Memory-Augmented LLM Agents","primary_cat":"cs.LG","submitted_at":"2026-05-20T22:02:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Memory-R2 proposes LoGo-GRPO to fix unfair trajectory comparisons in RL training of memory-augmented LLM agents by combining global end-to-end rewards with local rerollouts from identical memory states.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21463","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mem-$\\pi$: Adaptive Memory through Learning When and What to Generate","primary_cat":"cs.CL","submitted_at":"2026-05-20T17:51:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mem-π is a framework using a dedicated model and decision-content decoupled RL to generate context-specific guidance on demand for LLM agents, outperforming retrieval baselines by over 30% on web navigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20926","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemConflict: Evaluating Long-Term Memory Systems Under Memory Conflicts","primary_cat":"cs.IR","submitted_at":"2026-05-20T09:11:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MemConflict provides a benchmark for testing LLM long-term memory systems under dynamic, static, and conditional conflicts involving temporal validity, factual correctness, and contextual applicability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20724","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CALMem : Application-Layer Dual Memory for Conversational AI","primary_cat":"cs.IR","submitted_at":"2026-05-20T05:23:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CALMem delivers virtually unbounded effective context for LLM conversations via an application-layer dual memory architecture with intra-session retrieval and token-adaptive injection.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20616","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Auto-Dreamer: Learning Offline Memory Consolidation for Language Agents","primary_cat":"cs.CL","submitted_at":"2026-05-20T02:03:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Auto-Dreamer trains an offline memory consolidator via GRPO on agent performance to abstract cross-session patterns, outperforming baselines by 7 points on ScienceWorld with 12x smaller memory and generalizing to ALFWorld and WebArena.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20315","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mix-Quant: Quantized Prefilling, Precise Decoding for Agentic LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-19T17:50:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Mix-Quant quantizes prefilling to NVFP4 and keeps BF16 for decoding in agentic LLMs, achieving up to 3x prefilling speedup while largely preserving task performance on long-context and agentic benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19514","ref_index":37,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position: The Turing-Completeness of Autoregressive Transformers Relies Heavily on Context Management","primary_cat":"cs.AI","submitted_at":"2026-05-19T08:12:33+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18747","ref_index":210,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Code as Agent Harness","primary_cat":"cs.CL","submitted_at":"2026-05-18T17:59:03+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey that organizes existing work on LLM-based agents around code as the central harness, structured in three layers of interfaces, mechanisms, and multi-agent scaling, with applications across domains and listed open challenges.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"memory supports multi-agent planning, testing, reviewing, and trajectory coordination. In this setting, the central challenge is no longer only retrieving relevant content, but controlling the granularity of sharing, preventing information flooding, and supporting bidirectional access between high-level decisions and fine- grained execution traces [210]. Accordingly, memory in multi-agent code generation increasingly resembles a shared blackboard or collaborative state graph rather than a purely individual storage unit [212, 213]. 3.2.6. Context Compaction and State Offloading Context compaction and state offloading are cross-cutting context-engineering mechanisms for memory in code-agent harnesses [214]."},{"citing_arxiv_id":"2605.18636","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPIKE: An Adaptive Dual Controller Framework for Cost-Efficient Long-Horizon Game Agents","primary_cat":"cs.CV","submitted_at":"2026-05-18T16:43:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPIKE dual-controller framework raises success rates 5-9 points and cuts tokens 55% in StarDojo agents by reusing strategic plans across stable segments and escalating only at detected events.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18284","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CommitDistill: A Lightweight Knowledge-Centric Memory Layer for Software Repositories","primary_cat":"cs.SE","submitted_at":"2026-05-18T12:14:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"CommitDistill is a deterministic, local-only prototype that extracts typed knowledge from git commits and evaluates retrieval performance against baselines on public repositories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17998","ref_index":10,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Verify-Gated Completion as Admission Control in a Governed Multi-Agent Runtime: A Bounded Architecture Case Study","primary_cat":"cs.SE","submitted_at":"2026-05-18T07:52:13+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"In a bounded multi-agent runtime case study, verify-gated completion produced 99.5% success on invoked verification events with packetized records, supporting only a narrow claim of inspectable and fail-closed decisions under observed conditions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}