{"total":42,"items":[{"citing_arxiv_id":"2607.00399","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DriveVer: Lightweight Trajectory Evaluator as Test-Time Verifier for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-07-01T03:50:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DriveVer is a lightweight dual-head test-time verifier that predicts safety confidence scores and geometric refinement vectors for candidate trajectories, improving base planners on the NAVSIM benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28661","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When More Sampling Hurts: The Modal Ceiling and Correlation Ceiling of Test-Time Scaling","primary_cat":"cs.LG","submitted_at":"2026-06-27T00:37:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Test-time sampling improves coverage but stalls at modal and correlation ceilings for answer selection, with the effective number of samples as the practical limit.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.17890","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dynamic Rollout Editing for Reducing Overthinking in RL-Trained Reasoning Models","primary_cat":"cs.CL","submitted_at":"2026-06-16T13:10:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Dynamic Rollout Editing reduces overthinking in RL-trained LLMs by editing post-answer continuations in successful rollouts and preferring the edited versions within GRPO groups.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08231","ref_index":111,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Test-Time Scaling in Multimodal Foundation Models: A Comprehensive Survey of Generation and Reasoning","primary_cat":"cs.CV","submitted_at":"2026-06-06T15:39:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey of test-time scaling for multimodal foundation models that introduces a three-way taxonomy of sampling, feedback, and search approaches along with applications and benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06906","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EASE-TTT: Evidence-Aligned Selective Test-Time Training for Long-Context Question Answering","primary_cat":"cs.CL","submitted_at":"2026-06-05T04:49:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EASE-TTT creates a soft attention target from evidence chunks to guide query-side test-time adaptation, yielding higher macro-average scores than full-context, retrieval-only, and standard qTTT baselines on six LongBench QA tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03102","ref_index":95,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Small RL Controller, Large Language Model: RL-Guided Adaptive Sampling for Test-Time Scaling","primary_cat":"cs.CL","submitted_at":"2026-06-02T03:42:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RL-trained lightweight controller using answer statistics improves trade-offs among correctness, latency, and total samples in adaptive sampling for LLM test-time scaling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31561","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"What Am I Missing? Question-Answering as Hidden State Probing","primary_cat":"cs.CL","submitted_at":"2026-05-29T17:27:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Question generation produces a hidden-state signal that predicts final correctness before the answer is produced, yet gating interventions based on that signal do not reliably improve trajectories.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27596","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Can Hallucinations Be Useful? Solving Multi-Hop Questions With SLMs By Chaining System-I/II Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-26T19:09:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SLMs solve multi-hop QA by first producing a quick answer and then retrieving evidence based on that hypothesis for System-II reasoning, outperforming think-first baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27030","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Share More, Search Less: Collaborative Parallel Thinking for Efficient Test-Time Scaling","primary_cat":"cs.CL","submitted_at":"2026-05-26T13:52:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CPT shares deduplicated intermediate information across parallel search branches at inference time, yielding a stronger accuracy-latency Pareto frontier than isolated-branch baselines on HMMT and AIME.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.25547","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TapSampling: Inference-Time Sampling with a Task-Progress-Understanding Verifier for Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2026-05-25T08:03:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TapSampling improves generalist robotic manipulation policies at inference time via latent action sampling with an Action-VAE and selection by a task-progress outcome predictor.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18233","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Enhancing Train-Free Infinite-Frame Generation for Consistent Long Videos","primary_cat":"cs.CV","submitted_at":"2026-05-18T11:28:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MIGA introduces two-stage alignment to close train-inference gaps and dual consistency enhancement via self-reflection and long-range guidance to achieve SOTA temporal consistency in infinite-frame video generation on VBench and NarrLV.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11662","ref_index":67,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HSUGA: LLM-Enhanced Recommendation with Hierarchical Semantic Understanding and Group-Aware Alignment","primary_cat":"cs.IR","submitted_at":"2026-05-12T07:22:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HSUGA improves LLM-enhanced sequential recommendation via staged hierarchical semantic understanding for better preference extraction and group-aware alignment that varies intensity by user activity level.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11625","ref_index":51,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Nice Fold or Hero Call: Learning Budget-Efficient Thinking for Adaptive Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-12T06:51:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BET reduces reasoning tokens by about 55% on average while improving performance across benchmarks by learning to short-solve easy queries, fold early on unsolvable ones, and preserve budget for hard solvable queries.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"productive exploration and weaken complex reasoning on hard-but-solvable queries. Difficulty-conditioned reasoning control.A second line modulates reasoning depth with difficulty- related signals. DEER [47] uses confidence stabilization for early exit, while ThinkSwitcher [21] and DiffAdapt [24] route queries across reasoning modes. Other methods rely on proxy labels such as baseline token counts or predefined budgets [51, 43]. Such priors provide coarse control but do not track model-specific capability boundaries [7, 24]. More recent controllers use model-side signals: ASRR [49] relaxes compression when shortening harms correctness, DR.SAF [7] assigns incentives from grouped rollout pass rates, AdaCtrl [19] introduces difficulty-tagged controllable modes, and CODA [44] conditions compute on online difficulty scores."},{"citing_arxiv_id":"2605.10754","ref_index":49,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"The Agent Use of Agent Beings: Agent Cybernetics Is the Missing Science of Foundation Agents","primary_cat":"cs.AI","submitted_at":"2026-05-11T15:53:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Agent Cybernetics reframes foundation agent design by adapting classical cybernetics laws into three engineering desiderata for reliable, long-running, self-improving agents.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"transiently deviate from their local viability regions (Principle 3), a redundant verifier serves as a collective homeostatic mechanism, detecting individual deviations that the primary agent cannot self-monitor. This mirrors fault-tolerance patterns from classical systems engineering, now applied to LLM-based reasoning. We note that test-time scaling (e.g., majority voting and best-of-N) [49], multi- agent debate [9, 46] and LLM-as-a-Judge [13] can be specific implementations of this redundancy mechanism. Open research questions include how to assign agent roles to maximize collective variety without redundant overlap, how to aggregate conflicting reliability assessments, and how to bound the communication overhead of coordinated verification [12, 19]."},{"citing_arxiv_id":"2605.08905","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Forge: Quality-Aware Reinforcement Learning for NP-Hard Optimization in LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-09T11:57:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OPT-BENCH trains LLMs on NP-hard optimization via quality-aware RLVR, achieving 93.1% success rate and 46.6% quality ratio on Qwen2.5-7B while outperforming GPT-4o and transferring gains to other domains.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"a valid solution is[0,1,4] , since 2+3+5 = 10 , and the subset uses three elements, which is maximal. The difficulty of generated problem instances is categorized according to the number of integers available (|numbers|), the typical size of the opti- mal solution (|I|), and the range of integer values: •Easy: - Total numbers ∈[5,10] , solution size ∈[4,8], values in[1,5]. - Small input with low values, ensuring frequent feasible solutions. •Medium: - Total numbers ∈[8,12] , solution size ∈[4,8], values in[1,10]. - Moderate instance size and range, requir- ing more careful subset selection. •Hard: - Total numbers ∈[12,15] , solution size ∈[8,12], values in[1,15]. - Larger solution sizes and wider value"},{"citing_arxiv_id":"2605.08057","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CA-SQL: Complexity-Aware Inference Time Reasoning for Text-to-SQL via Exploration and Compute Budget Allocation","primary_cat":"cs.CL","submitted_at":"2026-05-08T17:44:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CA-SQL achieves 51.72% execution accuracy on the challenging tier of the BIRD benchmark using GPT-4o-mini by scaling exploration breadth according to estimated task difficulty, evolutionary prompt seeding, and candidate voting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07177","ref_index":42,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HyperEyes: Dual-Grained Efficiency-Aware Reinforcement Learning for Parallel Multimodal Search Agents","primary_cat":"cs.LG","submitted_at":"2026-05-08T03:16:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HyperEyes presents a parallel multimodal search agent using dual-grained efficiency-aware RL with a new TRACE reward and IMEB benchmark, claiming 9.9% higher accuracy and 5.3x fewer tool calls than prior open-source agents.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"these challenges, recent frameworks have actively embraced the \"Think-Act-Observe\" paradigm. For instance, DeepMMSearch-R1 [22] and DeepEyesV2 [10] introduce \"thinking with images\" by executing active visual manipulations (e.g., cropping, rotating, or marking via generated code) to extract fine-grained features before initiating web retrieval. Meanwhile, agents like WebWatcher [8] and Skywork-R1V4 [42] integrate diverse tools (e.g., code interpreters, text/image search) through Reinforcement Learning (RL) or high-fidelity supervised fine-tuning to facilitate in-depth information seeking. Taking a broader approach, Vision-DeepResearch (VDR) [ 11] tackles hit-rate issues in noisy web environments by formalizing a multi-turn, multi-scale trial-and-error retrieval paradigm,"},{"citing_arxiv_id":"2605.05561","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BitCal-TTS: Bit-Calibrated Test-Time Scaling for Quantized Reasoning Models","primary_cat":"cs.AI","submitted_at":"2026-05-07T01:10:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"BitCal-TTS raises exact-match accuracy by 3.7 points (7B) and 2.8 points (14B) on small GSM8K shards for 4-bit Qwen2.5 models while cutting premature-stop rates and retaining token savings versus fixed-budget decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04461","ref_index":52,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Stream-T1: Test-Time Scaling for Streaming Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-06T03:40:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Stream-T1 is a test-time scaling framework for streaming video generation using scaled noise propagation from history, reward pruning across short and long windows, and feedback-guided memory sinking to improve temporal consistency and visual quality.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"alignment, coherent motion, and long-term temporal consistency remains an open challenge. Furthermore, the traditional paradigm of scaling up models during the training phase is hitting a ceiling, heavily constrained by exorbitant costs and resource demands. Recently, inspired by successes in Large Language Models, pioneering works[9, 28, 41, 54] have introduced Test-Time Scaling (TTS)[52] to video generation and have empirically proven that dynamically scaling computational budgets during inference phase offers a highly effective and cost-efficient pathway to boost video generation quality. Despite this promising potential, approach like ImagerySearch[41] rely on video diffusion models to synthesize the entire video simultaneously."},{"citing_arxiv_id":"2605.01194","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VLA-ATTC: Adaptive Test-Time Compute for VLA Models with Relative Action Critic Model","primary_cat":"cs.RO","submitted_at":"2026-05-02T02:13:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VLA-ATTC equips VLA models with adaptive test-time compute via an uncertainty clutch and relative action critic, cutting failure rates by over 50% on LIBERO-LONG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26644","ref_index":40,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"When to Vote, When to Rewrite: Disagreement-Guided Strategy Routing for Test-Time Scaling","primary_cat":"cs.AI","submitted_at":"2026-04-29T13:11:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A disagreement-guided routing framework dynamically selects among resolution, voting, and rewriting strategies for test-time scaling, delivering 3-7% accuracy gains with lower sampling cost on mathematical benchmarks.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"Reported costs cover the entire pipeline: for SCoP and our method, rewriting costs are included; for BoN, reward model costs are included. Each rewriting step is counted as one sampling. 4 Experiments 4.1 Experimental Settings We evaluate our approach on seven widely used mathematical reasoning benchmarks, including GSM8K [8], Math500 [ 15], Gaokao2023en [ 40], Olympiadbench [ 14], AMC23, AIME24, and AIME25, using three base models (Qwen3-4B [30], Qwen3-8B, and DeepSeek-R1-Distill-Llama-8B [9]). We use accuracy of the final answers as the evaluation metric. To better understand the performance of our method, we compare with four representative methods: majority voting, dynamic voting [33], Best-of-N with an outcome reward model, and SCoP [43]."},{"citing_arxiv_id":"2604.19341","ref_index":172,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Evaluation-driven Scaling for Scientific Discovery","primary_cat":"cs.LG","submitted_at":"2026-04-21T11:24:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SimpleTES scales test-time evaluation in LLMs to discover state-of-the-art solutions on 21 scientific problems across six domains, outperforming frontier models and optimization pipelines with examples like 2x faster LASSO and new Erdos constructions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18356","ref_index":39,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"ComPASS: Towards Personalized Agentic Social Support via Tool-Augmented Companionship","primary_cat":"cs.CL","submitted_at":"2026-04-20T14:49:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ComPASS creates tool-augmented LLM agents for substantive social support, releases the first personalized benchmark ComPASS-Bench, and fine-tunes ComPASS-Qwen to outperform its base model while matching larger LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17353","ref_index":45,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Hive: A Multi-Agent Infrastructure for Algorithm- and Task-Level Scaling","primary_cat":"cs.AI","submitted_at":"2026-04-19T09:59:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hive is a multi-agent infrastructure with a logits cache for reducing cross-path redundancy in sampling and agent-aware scheduling for better compute and KV-cache allocation, shown to deliver 1.11x-1.76x speedups and 33%-51% lower hotspot miss rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17288","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Clover: A Neural-Symbolic Agentic Harness with Stochastic Tree-of-Thoughts for Verified RTL Repair","primary_cat":"cs.AR","submitted_at":"2026-04-19T07:04:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Clover fixes 96.8% of bugs on an RTL-repair benchmark using stochastic tree-of-thoughts and neural-symbolic agents, outperforming traditional and LLM baselines by 94% and 63% respectively with 87.5% pass@1.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.14853","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Adaptive Test-Time Compute Allocation for Reasoning LLMs via Constrained Policy Optimization","primary_cat":"cs.LG","submitted_at":"2026-04-16T10:39:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A Lagrangian-relaxation plus imitation-learning pipeline adaptively allocates test-time compute to LLMs, outperforming uniform baselines by up to 12.8% relative accuracy on MATH while staying within a fixed average budget.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Representations (ICLR), 2025. 10 [17] R. Liu, J. Gao, J. Zhao, K. Zhang, X. Li, B. Qi, W. Ouyang, and B. Zhou. Can 1B LLM surpass 405B LLM? Rethinking compute-optimal test-time scaling.arXiv preprint arXiv:2502.06703, 2025. [18] A. Agarwal, A. Sengupta, and T. Chakraborty. The art of scaling test-time compute for large language models.arXiv preprint arXiv:2512.02008, 2025. [19] Q. Zhang, F. Lyu, Z. Sun, L. Wang, W. Zhang, Z. Guo, Y . Wang, N. Muennighoff, I. King, X. Liu, and C. Ma. What, how, where, and how well? A survey on test-time scaling in large language models.arXiv preprint arXiv:2503.24235, 2025. [20] A. Jones. Scaling scaling laws with board games.arXiv preprint arXiv:2104.03113, 2021. [21] A. Graves. Adaptive computation time for recurrent neural networks."},{"citing_arxiv_id":"2604.14564","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MARS$^2$: Scaling Multi-Agent Tree Search via Reinforcement Learning for Code Generation","primary_cat":"cs.AI","submitted_at":"2026-04-16T02:52:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MARS² integrates multi-agent collaboration with tree-structured search in RL to boost code generation by increasing exploratory diversity and using path-level group advantages for credit assignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13552","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Training-Free Test-Time Contrastive Learning for Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-15T06:56:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TF-TTCL lets frozen LLMs adapt online by distilling textual rules from contrastive reasoning trajectories generated via multi-agent augmentation and applying them through retrieval-based steering.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11025","ref_index":41,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Test-time Scaling over Perception: Resolving the Grounding Paradox in Thinking with Images","primary_cat":"cs.CV","submitted_at":"2026-04-13T05:49:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TTSP resolves the Grounding Paradox by treating perception as a scalable test-time process that generates, filters, and iteratively refines multiple visual exploration traces, outperforming baselines on high-resolution and multimodal reasoning tasks.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"pressive progress on a wide range of vision-language tasks, in- cluding visual question answering, document understanding, and image-grounded dialogue [1, 3]. Building on these advances, re- cent research has moved beyond one-shot image understanding toward the paradigm ofThinking with Images, wherein models are equipped with visual tools such as zooming and cropping to inspect image regions during inference [41, 43]. This paradigm is appealing because it offers a potential path from coarse perception to targeted evidence acquisition: instead of relying exclusively on a fixed global ∗Corresponding author. representation of the image, the model can interact with the visual input and retrieve high-resolution information on demand. R∗[x\", y\", x#, y#] Needs to invoke toolOn R∗to get evidence"},{"citing_arxiv_id":"2604.10449","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AdverMCTS: Combating Pseudo-Correctness in Code Generation via Adversarial Monte Carlo Tree Search","primary_cat":"cs.SE","submitted_at":"2026-04-12T04:15:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AdverMCTS frames code generation as a minimax game where an attacker evolves tests to expose flaws in solver-generated code, yielding more robust outputs than static-test baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.06262","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"From Exposure to Internalization: Dual-Stream Calibration for In-context Clinical Reasoning","primary_cat":"q-bio.QM","submitted_at":"2026-04-07T01:59:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Dual-Stream Calibration uses entropy minimization and iterative meta-learning at test time to internalize clinical evidence and outperform standard in-context learning baselines on medical tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.08659","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CODA: Difficulty-Aware Compute Allocation for Adaptive Reasoning","primary_cat":"cs.CL","submitted_at":"2026-03-09T17:37:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CODA uses rollout-based difficulty signals to drive two gates that penalize verbosity on easy instances and promote deliberation on hard ones, cutting token use over 60% on simple tasks while maintaining accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.21484","ref_index":38,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ETS: Energy-Guided Test-Time Scaling for Training-Free RL Alignment","primary_cat":"cs.LG","submitted_at":"2026-01-29T10:06:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ETS performs training-free RL alignment for language models by energy-guided test-time scaling with Monte Carlo energy estimation and importance sampling acceleration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.14249","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Which Reasoning Trajectories Teach Students to Reason Better? A Simple Metric of Informative Alignment","primary_cat":"cs.CL","submitted_at":"2026-01-20T18:58:10+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.20206","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RAPO++: Cross-Stage Prompt Optimization for Text-to-Video Generation via Data Alignment and Test-Time Scaling","primary_cat":"cs.CV","submitted_at":"2025-10-23T04:45:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RAPO++ is a three-stage prompt optimization framework combining retrieval-augmented refinement, closed-loop test-time scaling, and LLM fine-tuning to enhance text-to-video generation quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.25758","ref_index":55,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking Sparks!: Emergent Attention Heads in Reasoning Models During Post Training","primary_cat":"cs.AI","submitted_at":"2025-09-30T04:23:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Post-training on reasoning tasks sparks the emergence of specialized attention heads that enable structured computation, with SFT adding stable heads while GRPO uses dynamic activation and pruning tied to reward signals, and controllable think models relying on compensatory heads instead of specific","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.00084","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning to Refine: Self-Refinement of Parallel Reasoning in LLMs","primary_cat":"cs.LG","submitted_at":"2025-08-27T06:51:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GSR jointly trains LLMs to generate candidate solutions and refine a superior final answer from them, achieving state-of-the-art performance on five mathematical benchmarks while transferring across model scales.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.10164","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pruning Long Chain-of-Thought of Large Reasoning Models via Small-Scale Preference Optimization","primary_cat":"cs.AI","submitted_at":"2025-08-13T20:00:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LCPO reduces average LRM output length by over 50% across benchmarks via targeted preference optimization while preserving reasoning performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2508.04204","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ReasoningGuard: Safeguarding Large Reasoning Models with Inference-time Safety Aha Moments","primary_cat":"cs.CL","submitted_at":"2025-08-06T08:35:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ReasoningGuard is an inference-time method that uses attention mechanisms to inject safety aha moments and scaling sampling to defend large reasoning models against jailbreak attacks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.21046","ref_index":203,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Survey of Self-Evolving Agents: What, When, How, and Where to Evolve on the Path to Artificial Super Intelligence","primary_cat":"cs.AI","submitted_at":"2025-07-28T17:59:05+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper delivers the first systematic review of self-evolving agents, structured around what components evolve, when adaptation occurs, and how it is implemented.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.14362","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DeepEyes: Incentivizing \"Thinking with Images\" via Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2025-05-20T13:48:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepEyes uses reinforcement learning to teach vision-language models active perception and image-based thinking, yielding gains on perception, reasoning, grounding, and hallucination benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.12741","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Language Model Networks: Supervision-Efficient Learning through Dense Communication","primary_cat":"cs.AI","submitted_at":"2025-05-19T05:56:06+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}