{"total":10,"items":[{"citing_arxiv_id":"2605.22542","ref_index":20,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Scene Abstraction for Lexical Semantics: Structured Representations of Situated Meaning","primary_cat":"cs.CL","submitted_at":"2026-05-21T14:26:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Scene Abstraction framework builds structured scene representations for lexical meaning via LLM prompting, with COCA-Scenes dataset and human experiments showing 82.4% identification accuracy and 86.4% preference over ATOMIC baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20128","ref_index":49,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"MixRea: Benchmarking Explicit-Implicit Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-19T17:15:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MixRea benchmark reveals LLMs achieve at most 42.8% consistency on explicit-implicit reasoning tasks, with PRCP prompting proposed to recover overlooked relations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19723","ref_index":104,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Mathematical Reasoning in Large Language Models: Benchmarks, Architectures, Evaluation, and Open Challenges","primary_cat":"cs.CL","submitted_at":"2026-05-19T11:56:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A literature survey synthesizing benchmarks, architectures, training strategies, and evaluation methods for mathematical reasoning in LLMs, based on roughly 120 papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15393","ref_index":49,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"LPDS: Evaluating LLM Robustness Through Logic-Preserving Difficulty Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-14T20:26:59+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LPDS quantifies difficulty of logic-preserving problem variations and searches for the hardest ones, producing up to 5x larger performance drops than random sampling and better robustness gains from fine-tuning on difficult examples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.05643","ref_index":15,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Text-Graph Synergy: A Bidirectional Verification and Completion Framework for RAG","primary_cat":"cs.AI","submitted_at":"2026-05-07T03:49:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"TGS-RAG adds graph-to-text re-ranking with global voting and text-to-graph orphan path bridging to improve precision and efficiency in multi-hop RAG over prior baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.22868","ref_index":59,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Probing Visual Planning in Image Editing Models","primary_cat":"cs.CV","submitted_at":"2026-04-23T19:00:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Image editing models fail zero-shot visual planning on abstract mazes and queen puzzles but generalize after finetuning, yet still cannot match human zero-shot efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19278","ref_index":52,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Explicit Trait Inference for Multi-Agent Coordination","primary_cat":"cs.AI","submitted_at":"2026-04-21T09:48:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ETI lets LLM agents infer and track partners' psychological traits (warmth and competence) from histories, cutting payoff loss 45-77% in games and boosting performance 3-29% on MultiAgentBench versus CoT baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17574","ref_index":77,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Beyond Fine-Tuning: In-Context Learning and Chain-of-Thought for Reasoned Distractor Generation","primary_cat":"cs.CL","submitted_at":"2026-04-19T18:29:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs prompted with few-shot examples and rationales generate better reasoned distractors for MCQs than fine-tuned contrastive models across six benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.20857","ref_index":117,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"Evo-Memory: Benchmarking LLM Agent Test-time Learning with Self-Evolving Memory","primary_cat":"cs.CL","submitted_at":"2025-11-25T21:08:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Evo-Memory is a new streaming benchmark and evaluation framework for self-evolving memory in LLM agents, unifying over ten memory modules and introducing the ReMem pipeline for continual improvement on multi-turn and reasoning datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.21074","ref_index":21,"ref_count":1,"confidence":0.35,"is_internal_anchor":false,"paper_title":"CODI: Compressing Chain-of-Thought into Continuous Space via Self-Distillation","primary_cat":"cs.CL","submitted_at":"2025-02-28T14:07:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CODI compresses explicit CoT into continuous space via self-distillation and is the first implicit method to match explicit CoT performance on GSM8k at GPT-2 scale with 3.1x compression and 28.2% higher accuracy than prior implicit approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}