{"total":93,"items":[{"citing_arxiv_id":"2606.01168","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking Economically: A Hierarchical Framework for Adaptive-Complexity Reasoning in LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-31T11:20:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HAB applies coarse-to-fine budgeting to LLM reasoning, predicting per-problem depth and learning intra-step token budgets via PPL comparisons and adaptive Pareto optimization, yielding higher accuracy and lower token use than standard CoT on GSM8K and MATH500.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00726","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latent Reward Steering: An Adaptive Inference-Time Framework that Implicitly Promotes Cognitive Behaviors in Reasoning LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-30T13:38:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LRS trains a latent reward model on final-answer correctness to steer SAE states during inference, improving reasoning performance and implicitly encouraging better cognitive behaviors.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29511","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DynaGraph: Lightweight Multi-Model Interaction Framework via Dynamic Topological Reconfiguration","primary_cat":"cs.MA","submitted_at":"2026-05-28T07:33:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DynaGraph is a multi-model framework that multiplexes PEFT adapters on a shared base model with evaluator-driven dynamic topology reconfiguration and hierarchical self-healing to achieve near-72B performance on reasoning benchmarks using an 8B model while reducing latency and tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28600","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Transformers Provably Learn to Internalize Chain-of-Thought","primary_cat":"cs.LG","submitted_at":"2026-05-27T15:17:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"L-layer transformers under Log-ICoT curriculum provably learn k-parity with poly(n) samples and log k stages, matching explicit CoT efficiency without inference overhead.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28070","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bridging the Detection-to-Abstention Gap in Reasoning Models under Insufficient Information","primary_cat":"cs.AI","submitted_at":"2026-05-27T07:28:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"JTS trains reasoning models via supervised warm-up and missing-premise RL to make an explicit answerability commitment that triggers early termination on unanswerable inputs, raising Abstention@Detection near saturation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18597","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Latent Action Reparameterization for Efficient Agent Inference","primary_cat":"cs.AI","submitted_at":"2026-05-18T16:07:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LAR learns a compact latent action space from trajectories that shortens the effective decision horizon for LLM agents, reducing token count and inference time while preserving task success.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20254","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient Table QA via TableGrid Navigation and Progressive Inference Prompting","primary_cat":"cs.IR","submitted_at":"2026-05-18T12:00:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces TableGrid Navigation (TGN) and Progressive Inference Prompting (PIP) as training-free structured prompting frameworks that improve LLM performance on table question answering over baselines on TableBench and achieve SOTA on FeTaQa.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16117","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SGR: A Stepwise Reasoning Framework for LLMs with External Subgraph Generation","primary_cat":"cs.CL","submitted_at":"2026-05-15T16:02:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"SGR enhances LLM reasoning accuracy by generating external subgraphs from knowledge bases and guiding progressive inference over them, yielding consistent gains over baselines on benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15425","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Runtime-Structured Task Decomposition for Agentic Coding Systems","primary_cat":"cs.SE","submitted_at":"2026-05-14T21:16:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Runtime-structured task decomposition reduces retry costs in agentic coding systems by up to 51.7% versus monolithic prompts by rerunning only failed subtasks on two software engineering workloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.09806","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LEAD: Length-Efficient Adaptive and Dynamic Reasoning for Large Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-10T23:05:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LEAD uses online adaptive mechanisms including Potential-Scaled Instability and symmetric efficiency rewards based on correct rollouts to achieve higher accuracy-efficiency scores with substantially shorter reasoning outputs than base models on math benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, 35:24824-24837, 2022. [2] Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, Sharan Narang, Aakanksha Chowdhery, and Denny Zhou. Self-consistency improves chain of thought reasoning in language models.arXiv preprint arXiv:2203.11171, 2022. [3] Denny Zhou, Nathanael Schärli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Claire Cui, Olivier Bousquet, Quoc Le, et al. Least-to-most prompting enables complex reasoning in large language models.arXiv preprint arXiv:2205.10625, 2022. [4] Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan."},{"citing_arxiv_id":"2605.11002","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MT-JailBench: A Modular Benchmark for Understanding Multi-Turn Jailbreak Attacks","primary_cat":"cs.CR","submitted_at":"2026-05-10T00:17:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MT-JailBench is a modular benchmark that standardizes evaluation of multi-turn jailbreaks to identify key success drivers and enable stronger combined attacks.","context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"Lima: Less is more for alignment.Advances in Neural Information Processing Systems, 36:55006-55021, 2023. [50] Denny Zhou, Nathanael Schärli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Claire Cui, Olivier Bousquet, Quoc Le, et al. Least-to-most prompting enables complex reasoning in large language models.arXiv preprint arXiv:2205.10625, 2022. [51] Weikang Zhou, Xiao Wang, Limao Xiong, Han Xia, Yingshuang Gu, Mingxu Chai, Fukang Zhu, Caishuang Huang, Shihan Dou, Zhiheng Xi, et al. Easyjailbreak: A unified framework for jailbreaking large language models.arXiv preprint arXiv:2403.12171, 2024. [52] Andy Zou, Long Phan, Justin Wang, Derek Duenas, Maxwell Lin, Maksym Andriushchenko, Rowan Wang,"},{"citing_arxiv_id":"2605.07461","ref_index":27,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Think-with-Rubrics: From External Evaluator to Internal Reasoning Guidance","primary_cat":"cs.CL","submitted_at":"2026-05-08T09:08:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Think-with-Rubrics has LLMs generate rubrics internally before responding, outperforming external rubric-as-reward baselines by 3.87 points on average across benchmarks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"in enterprise and api-driven llm applications.arXiv preprint arXiv:2603.04857, 2026. [26] Yaowei Zheng, Richong Zhang, Junhao Zhang, Yanhan Ye, and Zheyan Luo. Llamafactory: Unified efficient fine-tuning of 100+ language models. InProceedings of the 62nd annual meeting of the association for computational linguistics (volume 3: system demonstrations), pages 400-410, 2024. [27] Denny Zhou, Nathanael Schärli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Claire Cui, Olivier Bousquet, Quoc Le, et al. Least-to-most prompting enables complex reasoning in large language models.arXiv preprint arXiv:2205.10625, 2022. [28] Jeffrey Zhou, Tianjian Lu, Swaroop Mishra, Siddhartha Brahma, Sujoy Basu, Yi Luan, Denny Zhou, and"},{"citing_arxiv_id":"2605.07248","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PaT: Planning-after-Trial for Efficient Test-Time Code Generation","primary_cat":"cs.CL","submitted_at":"2026-05-08T05:09:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PaT defers planning until after failed trials in LLM code generation, enabling heterogeneous cheap-plus-powerful model setups that match large-model performance at roughly 69% lower cost.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"tion and shows that the PaT policy is a structured approach to allocating computational resources. Theorem 3(Optimal Generator cost under scaling laws).The cost of the optimal small model, c∗ s, that minimizes the asymptotic cost of the heterogeneous configuration is given by the following closed-form solution: c∗ s = min (\u0012 β·D L α \u0013 1 β+1 , cL ) (23) Proof. The asymptotic cost of the heterogeneous configuration is proportional to the coefficient cs + DL ps . Substituting the scaling law gives cs + DL αcβ s . To find the minimum, we take the derivative with respect toc s and set it to zero: 1− βDL α c−β−1 s = 0.(24) Solving for cs yields the unconstrained optimum. The final solution is capped at cL to respect the"},{"citing_arxiv_id":"2605.07244","ref_index":122,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Experience Sharing in Mutual Reinforcement Learning for Heterogeneous Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-08T05:01:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Mutual Reinforcement Learning allows heterogeneous LLMs to exchange experience through mechanisms like Peer Rollout Pooling, Cross-Policy GRPO Advantage Sharing, and Success-Gated Transfer, with outcome-level sharing identified as favorable on the stability-support trade-off.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07180","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Learning Agent Routing From Early Experience","primary_cat":"cs.CL","submitted_at":"2026-05-08T03:18:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BoundaryRouter routes queries to LLM or agent using early experience memory from a seed set, cutting inference time 60.6% versus always using agents and raising performance 28.6% versus always using direct LLM inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06638","ref_index":22,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Can RL Teach Long-Horizon Reasoning to LLMs? Expressiveness Is Key","primary_cat":"cs.AI","submitted_at":"2026-05-07T17:48:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RL training compute for logical reasoning follows a power law with horizon depth whose exponent rises with logical expressiveness, yielding better downstream transfer when models train on richer logics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06522","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic AIs Are the Missing Paradigm for Out-of-Distribution Generalization in Foundation Models","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:29:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Agentic AI systems are required to overcome the parameter coverage ceiling that prevents foundation models from handling certain out-of-distribution cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06365","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Agent Loops to Deterministic Graphs: Execution Lineage for Reproducible AI-Native Work","primary_cat":"cs.AI","submitted_at":"2026-05-07T14:39:37+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Execution lineage models AI-native work as a DAG of computations with explicit dependencies, achieving perfect state preservation in controlled update tasks where loop-based agents introduce churn and contamination.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"execution-lineage system decides what the unit of computation is, what its dependencies are, and what exactly must be rerun when some upstream state changes. 2.4 Reasoning Traces, Search, and Intermediate Steps Work on chain-of-thought and related prompting methods establishes an important empirical premise for our paper: intermediate reasoning steps often improve final-task performance. Chain-of-Thought [6], Self-Consistency [7], Least- to-Most prompting [8], and scratchpad-style methods [5] all show gains from exposing intermediate computation. Search-based extensions such as Tree-of-Thoughts [15], Language Agent Tree Search [18], Reflexion [16], Self- Refine [17], and structured reflection [19] push farther by revising and exploring alternative reasoning trajectories; earlier work on workflow-guided exploration [20] foreshadows the same interest in reusable action structure."},{"citing_arxiv_id":"2605.05737","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReFlect: An Effective Harness System for Complex Long-Horizon LLM Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-07T06:29:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReFlect is a harness that wraps LLMs to detect and recover from reasoning errors, achieving 7-29 pp gains over direct CoT on long-horizon tasks and improving code patch quality to 82-87%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08221","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NoisyCoconut: Counterfactual Consensus via Latent Space Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-06T13:58:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Injecting noise into LLM latent trajectories creates diverse reasoning paths whose agreement acts as a confidence signal for selective abstention, cutting error rates from 40-70% to under 15% on math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03344","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RAG over Thinking Traces Can Improve Reasoning Tasks","primary_cat":"cs.IR","submitted_at":"2026-05-05T04:03:28+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02035","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VIDA: A dataset for Visually Dependent Ambiguity in Multimodal Machine Translation","primary_cat":"cs.CL","submitted_at":"2026-05-03T19:55:06+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.01336","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Multi-View Media Profiling Suite: Resources, Evaluation, and Analysis","primary_cat":"cs.CL","submitted_at":"2026-05-02T09:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Presents MBFC-2025 dataset and multi-view embeddings with fusion methods for media bias and factuality, reporting SOTA results on ACL-2020 and new benchmarks on MBFC-2025.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.24464","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Incisor: Ex Ante Cloud Instance Selection for HPC Jobs","primary_cat":"cs.DC","submitted_at":"2026-04-27T13:33:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Incisor uses program analysis and frontier LLMs to select working AWS EC2 instances ex ante for 100% of first-time HPC runs of C/C++/Fortran and Python codes, cutting runtime 54% and costs 44% versus an expert-constrained SkyPilot baseline.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Implementation details are given in Section VI-A. Each agent receives an agent configuration that provides the driver prompt and output schema defining the task, and input data that provides the task-specific evidence. For constraint estimation, the input data comprises the submitted executable, invocation command, and environment variables. Inspired by least-to-most prompting [42], the decomposer first invokes an LLM to break the configured task into a sequence of subtask definitions to produce findings that inform later subtasks. For the constraint estimation agent, these corre- spond to individual constraint dimensions such as CPU count, memory capacity, and platform requirements. The subtask executor then processes each subtask through a subtask loop"},{"citing_arxiv_id":"2604.24176","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Explanation Quality Assessment as Ranking with Listwise Rewards","primary_cat":"cs.AI","submitted_at":"2026-04-27T08:35:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Explanation quality assessment is recast as ranking with listwise and pairwise losses that outperform regression, allow small models to match large ones on curated data, and enable stable convergence in reinforcement learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.21027","ref_index":129,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HypEHR: Hyperbolic Modeling of Electronic Health Records for Efficient Question Answering","primary_cat":"cs.AI","submitted_at":"2026-04-22T19:18:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HypEHR is a hyperbolic embedding model for EHR data that uses Lorentzian geometry and hierarchy-aware pretraining to answer clinical questions nearly as well as large language models but with much smaller size.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19716","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Discovering a Shared Logical Subspace: Steering LLM Logical Reasoning via Alignment of Natural-Language and Symbolic Views","primary_cat":"cs.CL","submitted_at":"2026-04-21T17:42:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Applying Canonical Correlation Analysis to paired residual activations from natural-language and symbolic reasoning chains in LLMs reveals a low-dimensional shared logical subspace that can steer the model's reasoning for up to 11 percentage point accuracy gains on logical benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15709","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Bilevel Optimization of Agent Skills via Monte Carlo Tree Search","primary_cat":"cs.AI","submitted_at":"2026-04-17T05:31:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Bilevel optimization with outer-loop MCTS for skill structure and inner-loop LLM refinement improves agent accuracy on an operations-research question-answering dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.15529","ref_index":56,"ref_count":3,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LACE: Lattice Attention for Cross-thread Exploration","primary_cat":"cs.AI","submitted_at":"2026-04-16T21:19:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LACE enables concurrent reasoning paths in LLMs to interact via lattice attention and a synthetic training pipeline, raising accuracy more than 7 points over independent parallel search.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13521","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"C-voting: Confidence-Based Test-Time Voting without Explicit Energy Functions","primary_cat":"cs.LG","submitted_at":"2026-04-15T06:10:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"C-voting improves recurrent reasoning models by selecting among multiple latent trajectories the one with highest average top-1 probability, achieving 4.9% better Sudoku-hard accuracy than energy-based voting and outperforming HRM on Sudoku-extreme and Maze when paired with the new ItrSA++ model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.00847","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"H-Probes: Extracting Hierarchical Structures From Latent Representations of Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-15T00:59:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"H-probes locate low-dimensional subspaces encoding hierarchy in LLM activations for synthetic tree tasks, show causal importance and generalization, and detect weaker signals in mathematical reasoning traces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.12088","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Structured Safety Auditing for Balancing Code Correctness and Content Safety in LLM-Generated Code","primary_cat":"cs.SE","submitted_at":"2026-04-13T21:52:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dual Reasoning with explicit safety audits improves the new SUDS metric by 1.32x to 3.42x over baselines on code generation benchmarks containing injected harmful keywords.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18768","ref_index":293,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ClinQueryAgent: A Conversational Agent for Population Health Management","primary_cat":"cs.IR","submitted_at":"2026-04-13T10:07:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper introduces ClinQueryAgent, a conversational agent that converts natural language queries into database queries for population health management while keeping patient data secure, and reports its use by 128 staff across 15 NHS practices covering 148,319 patients.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10990","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When Verification Fails: How Compositionally Infeasible Claims Escape Rejection","primary_cat":"cs.CL","submitted_at":"2026-04-13T04:48:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AI claim verification models rely on salient-constraint shortcuts instead of full compositional reasoning under the closed-world assumption, as revealed by their over-acceptance of claims with supported salient constraints but contradicted non-salient ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10734","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Self-Correcting RAG: Enhancing Faithfulness via MMKP Context Selection and NLI-Guided MCTS","primary_cat":"cs.CL","submitted_at":"2026-04-12T17:14:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-Correcting RAG formalizes retrieval as MMKP to maximize information density under token limits and uses NLI-guided MCTS to validate faithfulness, raising accuracy and cutting hallucinations on six multi-hop QA and fact-checking datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10693","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FACT-E: Causality-Inspired Evaluation for Trustworthy Chain-of-Thought Reasoning","primary_cat":"cs.AI","submitted_at":"2026-04-12T15:35:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FACT-E uses controlled perturbations as an instrumental signal to measure intra-chain faithfulness in CoT reasoning and combines it with answer consistency to select trustworthy trajectories.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Question: Convert the point(0,3)in rectangular coordinates to polar coordinates. Enter your answer in the form(r, θ),wherer >0and0≤θ <2π. 1 (3, π 2 ) ! To convert from rectangular coordinates (x, y) to polar coordinates (r, θ), we use the formulas r= p x2 +y 2 and θ= arctan y x \u0001 . Given the point (0,3), we have r= √ 02 + 32 = 3 and θ= arctan 3 0 \u0001 = π 2 . However, we need to adjust θ to be in the interval[0,2π),so the final answer is(3, π 2 ). 0.5992 2 (3, π 2 ) ! To convert a point from rectangular coordinates(x, y) to polar coordinates (r, θ), we use the formulas r= p x2 +y 2 and θ= arctan y x \u0001 . Given the point (0,3) , we have x= 0 and y= 3 . Calculating r= √ 02 + 32 = 3 and θ= arctan 3 0 \u0001 = π 2 (since arctan is undefined at x= 0 but we know the point lies on the positive y-axis). Therefore, the point (0,3) in rectangular coordinates is equivalent to the point(3, π 2 )in polar coordinates. 0.7995 Table 7: Question and Answer Evaluation Results, the flawed part of CoT is highlighted in red. The correct and wrong answers are shown in!and%, respectively. 5 Examples 10 Examples 15 Examples Method DeepSeek-V3 Qwen3-14B DeepSeek-V3 Qwen3-14B DeepSeek-V3 Qwen3-14B Standard CoT 92.35 90.16 92.35 91.53 92.62 92.62 Denoise 92.62 85.79 94.54 89.34 92.08 90.16 Polish 94.81 92.08 92.62 89.07 91.80 90.98 Reflect 93.44 88.25 94.54 87.98 90.44 90.71 Consistency 90.98 91.26 93.17 90.98 93.00 89.89 Ours 93.44 92.62 93.20 92.35 93.44 93.26 Table 8: Performance comparison of DeepSeek-V3 and Qwen3-14B on MATH-500 across different numbers of prompting examples. iments in § 5, specifically examining how perfor- mance changes as the number of demonstration examples increases, as illustrated in Table 8. Based on the experimental results, the proposed method (\"Ours\") demonstrates strong and consistent per- formance across both DeepSeek-V3 and Qwen3- 14B models, achieving the highest or competitive accuracy in all example-count settings. While in- creasing the number of in-con"},{"citing_arxiv_id":"2604.10517","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Perception to Planning: Evolving Ego-Centric Task-Oriented Spatiotemporal Reasoning via Curriculum Learning","primary_cat":"cs.AI","submitted_at":"2026-04-12T08:14:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EgoTSR applies a three-stage curriculum on a 46-million-sample dataset to build egocentric spatiotemporal reasoning, reaching 92.4% accuracy on long-horizon tasks and reducing chronological biases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10072","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Reason Only When Needed: Efficient Generative Reward Modeling via Model-Internal Uncertainty","primary_cat":"cs.CL","submitted_at":"2026-04-11T07:35:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"E-GRM triggers CoT reasoning in generative reward models only when parallel generations show high uncertainty, reducing inference cost and raising accuracy on reasoning benchmarks via a hybrid regression-ranking scorer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09015","ref_index":23,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Generative AI Agent Empowered Power Allocation for HAP Propulsion and Communication Systems","primary_cat":"cs.NI","submitted_at":"2026-04-10T06:25:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A generative AI agent creates a realistic HAP propulsion power model including aerodynamic interference and enables a Q3E beamforming algorithm that improves QoS and energy efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09741","ref_index":36,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ExecTune: Effective Steering of Black-Box LLMs with Guide Models","primary_cat":"cs.LG","submitted_at":"2026-04-09T23:27:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ExecTune trains guide models via acceptance sampling, supervised fine-tuning, and structure-aware RL to boost executability of strategies for black-box LLMs, yielding up to 9.2% higher accuracy and 22.4% lower cost on math and code tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.08299","ref_index":59,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SeLaR: Selective Latent Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-09T14:32:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SeLaR selectively applies latent soft reasoning in LLMs via entropy gating and contrastive regularization, outperforming standard CoT on five benchmarks without training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07321","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Syntax Is Easy, Semantics Is Hard: Evaluating LLMs for LTL Translation","primary_cat":"cs.LO","submitted_at":"2026-04-08T17:36:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLMs handle LTL syntax better than semantics, improve with detailed prompts, and perform substantially better when the task is reframed as Python code completion.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06902","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"iTAG: Inverse Design for Natural Text Generation with Accurate Causal Graph Annotations","primary_cat":"cs.CL","submitted_at":"2026-04-08T09:59:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"iTAG generates natural text paired with accurate causal graph annotations by framing concept assignment as an inverse problem and refining selections via chain-of-thought reasoning until the text's relations align with the target causal structure.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16421","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Measuring Representation Robustness in Large Language Models for Geometry","primary_cat":"cs.CL","submitted_at":"2026-04-03T11:36:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs display accuracy gaps of up to 14 percentage points on the same geometry problems solely due to representation choice, with vector forms consistently weakest and a convert-then-solve prompt helping only high-capacity models.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[37] Shunyu Yao, Dian Yu, Jeffrey Zhao, et al. 2023. Tree of thoughts: Deliberate problem solving with large language models. InProceedings of NeurIPS 2023. https://arxiv.org/abs/2305. 10601. [38] Minghao Zhang, Shuo Wang, Xiao Liu, et al. 2024. Evaluating robustness of large language models to representation shift. InProceedings of ACL 2024. https://arxiv.org/abs/2402. 01234. [39] Denny Zhou, Nino Sch¨ arli, Luheng He, et al. 2023. Least-to-most prompting enables complex reasoning in large language models. InProceedings of ICLR 2023. https://arxiv.org/abs/ 2205.10625. [40] Kaijie Zhu, Jindong Wang, Jiaheng Zhou, et al. 2023. PromptBench: Towards evaluating the robustness of large language models on adversarial prompts.arXiv preprint arXiv:2306."},{"citing_arxiv_id":"2604.02776","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Evaluating the Environmental Impact of using SLMs and Prompt Engineering for Code Generation","primary_cat":"cs.SE","submitted_at":"2026-04-03T06:37:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chain-of-Thought prompting balances high accuracy with low energy use in small language models for code generation, while multi-sampling strategies add high energy costs for small accuracy gains.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"quantified the sustainability trade-offs of prompting strategies in code generation using SLMs. This paper presents the first com- prehensive empirical study examining the impact of prompting approaches on both accuracy and environmental impact. We evalu- ate six prompting strategies-(1) Direct, (2) Chain-of-Thought [ 44], (3) Program-of-Thought [7], (4) Self-Consistency [42], (5) Least-to- Most [47], and (6) ReAct [46]-across 11 open-source models (1B- 34B parameters) on HumanEval+ [6] and MBPP+ [5] benchmarks, measuring Pass@1 accuracy alongside energy consumption (kWh), carbon emissions (kg CO2eq), inference latency, and token utiliza- tion across heterogeneous hardware and geographic regions. To ensure a realistic evaluation, all strategies are implemented within"},{"citing_arxiv_id":"2604.02761","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sustainability Analysis of Prompt Strategies for SLM-based Automated Test Generation","primary_cat":"cs.SE","submitted_at":"2026-04-03T06:10:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Prompt strategies for SLM-based automated test generation vary widely in energy consumption and carbon emissions, with simpler strategies delivering competitive coverage at markedly lower environmental cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.00131","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Oblivion: Self-Adaptive Agentic Memory Control through Decay-Driven Activation","primary_cat":"cs.CL","submitted_at":"2026-03-31T18:37:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Oblivion is a decay-driven memory framework that decouples read and write paths in LLM agents to enable adaptive forgetting and reinforcement for better long-horizon reasoning.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"2025a. Qwen3 embedding: Advancing text embedding and reranking through foundation models. arXiv preprint arXiv:2506.05176. Zeyu Zhang, Quanyu Dai, Xiaohe Bo, Chen Ma, Rui Li, Xu Chen, Jieming Zhu, Zhenhua Dong, and Ji-Rong Wen. 2025b. A survey on the memory mechanism of large language model-based agents.ACM Transac- tions on Information Systems, 43(6):1-47. Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-bench and chatbot arena. InAdvances in Neural Information Processing Sys- tems, volume 36. Curran Associates, Inc. Wanjun Zhong, Lianghong Guo, Qiqi Gao, He Ye, and"},{"citing_arxiv_id":"2604.16382","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LiFT: Does Instruction Fine-Tuning Improve In-Context Learning for Longitudinal Modelling by Large Language Models?","primary_cat":"cs.CL","submitted_at":"2026-03-25T20:54:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LiFT instruction fine-tunes LLMs with a temporal curriculum to improve in-context learning on longitudinal NLP tasks, yielding gains on out-of-distribution data and rare change events across multiple model sizes.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16359","ref_index":220,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLM4Log: A Systematic Review of Large Language Model-based Log Analysis","primary_cat":"cs.SE","submitted_at":"2026-03-18T20:34:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Systematic review of 145 papers on LLM-based log analysis, providing a unified taxonomy, common design patterns, evaluation practices, and challenges for deployment under drift and limited labels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.06870","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LEAD: Breaking the No-Recovery Bottleneck in Long-Horizon Reasoning","primary_cat":"cs.AI","submitted_at":"2026-03-06T20:42:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LEAD lets LLMs solve checkers jumping puzzles up to size 13 by using lookahead to recover from irreversible errors on hard steps that break extreme decomposition.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}