{"total":27,"items":[{"citing_arxiv_id":"2606.25561","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CrypFormBench: Benchmarking Formal Analysis Capability of Large Language Models for Cryptographic Schemes","primary_cat":"cs.CR","submitted_at":"2026-06-24T08:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CrypFormBench is a new benchmark jointly covering symbolic and computational security to evaluate LLMs on five formal analysis capabilities, with results showing top model Claude-3.5 scores 48.7/100 and most models struggling on generation, transformation, and correction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07157","ref_index":39,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Think Fast: Estimating No-CoT Task-Completion Time Horizons of Frontier AI Models","primary_cat":"cs.AI","submitted_at":"2026-06-05T11:17:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Frontier AI models' no-CoT 50% task-completion time horizons have doubled yearly over six years, reaching over 3 minutes for GPT-5.5 with projections to 25 minutes by 2030.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05464","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Step-by-Step Optimization-like Reasoning in LLMs over Expanding Search Spaces","primary_cat":"cs.AI","submitted_at":"2026-06-03T21:43:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces OPT* tasks and two training regimes (solver-guided online policy optimization with rank-based reward shaping and search-based offline RL) plus a theoretical link between search success and information extraction per budget unit, showing empirical gains in optimization-like reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20722","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AGPO: Adaptive Group Policy Optimization with Dual Statistical Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-20T05:20:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"AGPO adaptively sets trust-region size and exploration temperature from group reward dispersion, entropy, and KL drift, yielding higher scores than PPO and GRPO on nine math benchmarks under fixed token budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18073","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A-ProS: Towards Reliable Autonomous Programming Through Multi-Model Feedback","primary_cat":"cs.SE","submitted_at":"2026-05-18T08:55:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A-ProS uses a hybrid multi-model feedback framework with stateful refinement to improve success rates on competitive programming problems, achieving over 2x gains compared to baseline agent loops.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16142","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Property-Guided LLM Program Synthesis for Planning","primary_cat":"cs.AI","submitted_at":"2026-05-15T16:23:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Property-guided LLM program synthesis with counterexample feedback creates direct heuristics for PDDL planning domains that require far fewer generations and less evaluation cost than score-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11299","ref_index":65,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Primal Generation, Dual Judgment: Self-Training from Test-Time Scaling","primary_cat":"cs.LG","submitted_at":"2026-05-11T22:34:45+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DuST self-trains LLMs for code generation by ranking their own test-time samples via sandbox execution and applying GRPO, improving judgment by +6.2 NDCG and single-sample pass@1 by +3.1 on LiveCodeBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06111","ref_index":26,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Schedule-and-Calibrate: Utility-Guided Multi-Task Reinforcement Learning for Code LLMs","primary_cat":"cs.SE","submitted_at":"2026-05-07T12:24:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ASTOR improves a single code LLM across four tasks by 9.0-9.5% over the best specialist and 7.5-12.8% over prior multi-task RL baselines via utility-driven data scheduling and adaptive KL regularization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.02741","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AI-Generated Smells: An Analysis of Code and Architecture in LLM and Agent-Driven Development","primary_cat":"cs.SE","submitted_at":"2026-05-04T15:41:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"More capable LLMs and agents generate code with greater volume and architectural decay, following a Volume-Quality Inverse Law that neither functional correctness nor prompting mitigates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10158","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Tracing the Thought of a Grandmaster-level Chess-Playing Transformer","primary_cat":"cs.LG","submitted_at":"2026-04-11T11:11:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sparse replacement layers decompose the MLP and attention modules of a chess-playing transformer to reveal verifiable tactical reasoning pathways and parallel computation patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10126","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MR-Coupler: Automated Metamorphic Test Generation via Functional Coupling Analysis","primary_cat":"cs.SE","submitted_at":"2026-04-11T09:42:44+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MR-Coupler leverages functional coupling analysis and LLMs to generate valid metamorphic test cases for over 90% of tasks while detecting 44% of real bugs, outperforming baselines by 64.90% in validity and 36.56% in false-alarm reduction.","context_count":1,"top_context_role":"background","top_context_polarity":"support","context_text":"(1)readily-available knowledge: it relies solely on a pair of methods and their implementation, which is by construction available in the scenario of unit testing; (2)more tractable problem: this transforms the challenging problem of deriving MRs into code understanding and relation reasoning, which can be effectively handled by current state-of-the-art large language models (LLMs) [35, 53, 66, 70]. For instance, although it is challenging to come up with MRs for a target method encrypt, when paired with a coupled method decrypt, it becomes easier for LLMs to understand their functionalities separately, realize that they are inverse functions, and then formulate a relation 𝑥=𝑑𝑒𝑐𝑟𝑦𝑝𝑡(𝑒𝑛𝑐𝑟𝑦𝑝𝑡(𝑥)) ; (3)easier bug manifestation: certain bugs can be revealed more easily"},{"citing_arxiv_id":"2604.06231","ref_index":33,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Automating Database-Native Function Code Synthesis with LLMs","primary_cat":"cs.DB","submitted_at":"2026-04-02T02:56:04+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DBCooker automates synthesis of database native functions via LLM-guided characterization, coding plans, hybrid filling, and progressive validation, delivering 34.55% higher accuracy than baselines on SQLite, PostgreSQL, and DuckDB while generating functions absent from SQLite 3.50.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"blindly including everything incurs high inference costs, and effective synthesis still requires reliably identifying a small set of relevant, scattered units among a vast amount of irrelevant code.DBCooker'sFunction Characterizationexplicitly identifies these units, preventingLLMs from missing critical information that long-context reasoning over the entire codebase alone may overlook [33, 40]. (2) Deterministic Correctness Requirements vs. Probabilistic Generative Synthesis:Data- base systems are mission-critical, and their functions must satisfy strict, deterministic correctness guarantees. Even if futureLLMsbecome more reliable, their outputs are still based on probabilis- tic generation and cannot inherently guarantee exact database correctness [19, 48]."},{"citing_arxiv_id":"2603.11287","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Synthesis-in-the-Loop Evaluation of LLMs for RTL Generation: Quality, Reliability, and Failure Modes","primary_cat":"cs.AR","submitted_at":"2026-03-11T20:26:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLM evaluation for RTL generation identifies three performance tiers with frontier models reaching high synthesis quality and reveals systematic failure differences between proprietary and open models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09557","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SPEED-Bench: A Unified and Diverse Benchmark for Speculative Decoding","primary_cat":"cs.DC","submitted_at":"2026-02-10T16:19:56+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.14232","ref_index":11,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Scaling Test-Time Compute to Achieve IOI Gold Medal with Open-Weight Models","primary_cat":"cs.LG","submitted_at":"2025-10-16T02:19:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GenCluster scales test-time compute via large-scale generation, behavioral clustering, ranking, and round-robin submission to achieve IOI gold medal performance with the open-weight gpt-oss-120b model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.02497","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A PennyLane-Centric Dataset to Enhance LLM-based Quantum Code Generation using RAG","primary_cat":"cs.SE","submitted_at":"2025-03-04T11:04:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PennyLang dataset of 3,347 PennyLane samples boosts LLM code generation success via RAG from 8.7% to 41.7% for Qwen 7B and 78.8% to 84.8% for LLaMa 4.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.01456","ref_index":152,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Process Reinforcement through Implicit Rewards","primary_cat":"cs.LG","submitted_at":"2025-02-03T15:43:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRIME enables online process reward model updates in LLM RL using implicit rewards from rollouts and outcome labels, yielding 15.1% average gains on reasoning benchmarks and surpassing a stronger instruct model with 10% of the data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2408.15815","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MR-Adopt: Automatic Deduction of Input Transformation Function for Metamorphic Testing","primary_cat":"cs.SE","submitted_at":"2024-08-28T14:24:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MR-Adopt deduces input transformations from hard-coded MR test cases using LLMs, data-flow refinement, and output-relation selection to enable reuse with new source inputs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.17762","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Massive Activations in Large Language Models","primary_cat":"cs.CL","submitted_at":"2024-02-27T18:55:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Massive activations are constant large values in LLMs that function as indispensable bias terms and concentrate attention probabilities on specific tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.13228","ref_index":171,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Smaug: Fixing Failure Modes of Preference Optimisation with DPO-Positive","primary_cat":"cs.CL","submitted_at":"2024-02-20T18:42:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DPOP is a new loss function that prevents DPO from lowering preferred response likelihoods and outperforms standard DPO on diverse datasets, MT-Bench, and enables Smaug-72B to exceed 80% on the Open LLM Leaderboard.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.16291","ref_index":89,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Voyager: An Open-Ended Embodied Agent with Large Language Models","primary_cat":"cs.AI","submitted_at":"2023-05-25T17:46:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Voyager achieves superior lifelong learning in Minecraft by combining an automatic exploration curriculum, a library of executable skills, and iterative LLM prompting with environment feedback, yielding 3.3x more unique items and 15.3x faster milestone unlocks than prior methods while generalizing技能","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Code generation has been a longstanding challenge in NLP [41, 84, 85, 73, 37], with various works leveraging execution results to improve program 10 synthesis. Execution-guided approaches leverage intermediate execution outcomes to guide program search [86-88]. Another line of research utilizes majority voting to choose candidates based on their execution performance [89, 90]. Additionally, LEVER [91] trains a verifier to distinguish and reject incorrect programs based on execution results. CLAIRIFY [92], on the other hand, generates code for planning chemistry experiments and makes use of a rule-based verifier to iteratively provide error feedback to LLMs. VOYAGER distinguishes itself from these works by integrating environment"},{"citing_arxiv_id":"2305.10403","ref_index":286,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PaLM 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2023-05-17T17:46:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"PaLM 2 reports state-of-the-art results on language, reasoning, and multilingual tasks with improved efficiency over PaLM.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.07922","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CodeT5+: Open Code Large Language Models for Code Understanding and Generation","primary_cat":"cs.CL","submitted_at":"2023-05-13T14:23:07+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CodeT5+ is a flexible encoder-decoder LLM family for code pretrained with diverse objectives on multilingual corpora and initialized from existing LLMs, achieving state-of-the-art results on code generation, completion, math programming, and retrieval tasks including new SoTA on HumanEval with the 1","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2305.06161","ref_index":132,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"StarCoder: may the source be with you!","primary_cat":"cs.CL","submitted_at":"2023-05-09T08:16:42+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StarCoderBase matches or beats OpenAI's code-cushman-001 on multi-language code benchmarks; the Python-fine-tuned StarCoder reaches 40% pass@1 on HumanEval while retaining other-language performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.05100","ref_index":265,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"BLOOM: A 176B-Parameter Open-Access Multilingual Language Model","primary_cat":"cs.CL","submitted_at":"2022-11-09T18:48:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BLOOM is a 176B-parameter open-access multilingual language model trained on the ROOTS corpus that achieves competitive performance on benchmarks, with improved results after multitask prompted finetuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2211.01910","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Large Language Models Are Human-Level Prompt Engineers","primary_cat":"cs.LG","submitted_at":"2022-11-03T15:43:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"APE generates instruction candidates via LLM and selects the best by zero-shot performance of a second LLM, matching or beating human prompts on 19 of 24 NLP tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.05999","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"InCoder: A Generative Model for Code Infilling and Synthesis","primary_cat":"cs.SE","submitted_at":"2022-04-12T16:25:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"InCoder is the first generative model to directly perform zero-shot code infilling via bidirectional context from a masked-then-appended training scheme, matching left-to-right models on synthesis while improving on type inference, comment generation, and variable renaming.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}