{"total":30,"items":[{"citing_arxiv_id":"2605.21770","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Manifold-Guided Attention Steering","primary_cat":"cs.LG","submitted_at":"2026-05-20T22:06:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MAGS learns low-dimensional subspaces from correct versus incorrect reasoning traces and applies targeted projection corrections to attention heads when they deviate from the correctness manifold during inference.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14075","ref_index":58,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Rethinking Layer Relevance in Large Language Models Beyond Cosine Similarity","primary_cat":"cs.LG","submitted_at":"2026-05-13T19:51:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Cosine similarity poorly predicts performance degradation from layer removal in LLMs, making direct accuracy-drop ablation a more reliable relevance metric.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06651","ref_index":7,"ref_count":2,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AI co-mathematician: Accelerating mathematicians with agentic AI","primary_cat":"cs.AI","submitted_at":"2026-05-07T17:56:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"An interactive AI workbench for mathematicians achieves 48% on FrontierMath Tier 4 and helped solve open problems in early tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08221","ref_index":31,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"NoisyCoconut: Counterfactual Consensus via Latent Space Reasoning","primary_cat":"cs.LG","submitted_at":"2026-05-06T13:58:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Injecting noise into LLM latent trajectories creates diverse reasoning paths whose agreement acts as a confidence signal for selective abstention, cutting error rates from 40-70% to under 15% on math tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10079","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Why Supervised Fine-Tuning Fails to Learn: A Systematic Study of Incomplete Learning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-11T07:55:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Supervised fine-tuning of LLMs often fails to fully internalize all training instances due to five recurring causes including missing prerequisites and data conflicts, as diagnosed via a new framework across multiple models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.04062","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EdgeRazor: A Lightweight Framework for Large Language Models via Mixed-Precision Quantization-Aware Distillation","primary_cat":"cs.LG","submitted_at":"2026-04-10T15:49:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EdgeRazor uses structural mixed-precision quantization, layer-adaptive feature distillation, and entropy-aware KL divergence to achieve 1.88-bit LLMs that outperform prior 2-bit and 3-bit baselines with 4-10x lower training budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.07655","ref_index":88,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Guardian-as-an-Advisor: Advancing Next-Generation Guardian Models for Trustworthy LLMs","primary_cat":"cs.LG","submitted_at":"2026-04-08T23:47:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Guardian-as-an-Advisor prepends risk labels and explanations from a guardian model to queries, improving LLM safety compliance and reducing over-refusal while adding minimal compute overhead.","context_count":1,"top_context_role":"dataset","top_context_polarity":"use_dataset","context_text":"♂shield-altRobustnesscommonsense_qa[70] 500 ♂shield-altRobustnessmmlu[24] 1000 ♂shield-altRobustnessmnli[81] 1000 ♂shield-altRobustnessqnli[74] 500 ♂shield-altRobustnesssst2[67] 500 ♂shield-altRobustnesstrivia_qa[37] 1000 ♂shield-altRobustnesstruthful_qa[41] 200 ♂shield-altRobustnessultrachat[15] 3000 ♂skull-crossbonesToxicity FredZhang7-toxi-text-3M [88] 10000 ♂skull-crossbonesToxicity JBB-Behaviors[8] 100 ♂skull-crossbonesToxicity PKU-SafeRLHF-QA[71] 5827 ♂skull-crossbonesToxicity StrongReject[68] 313 ♂skull-crossbonesToxicity TrustLLM-misuse[27] 1174 ♂skull-crossbonesToxicity Wildjailbreak_vanilla[36] 20000 ♂skull-crossbonesToxicity harmful-dataset[3] 4948 ♂skull-crossbonesToxicity llm_attack_harmful_behaviors"},{"citing_arxiv_id":"2602.22911","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CeRA: Overcoming the Linear Ceiling of Low-Rank Adaptation via Capacity Expansion","primary_cat":"cs.LG","submitted_at":"2026-02-26T11:55:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CeRA overcomes LoRA's linear ceiling by injecting non-linear SiLU gating and dropout, outperforming high-rank LoRA on complex math reasoning with 1/8 the parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.04476","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Vision-aligned Latent Reasoning for Multi-modal Large Language Model","primary_cat":"cs.CV","submitted_at":"2026-02-04T12:04:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VaLR generates vision-aligned latent tokens before each reasoning step to preserve perceptual cues, improving VSI-Bench accuracy from 33.0% to 52.9%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.20856","ref_index":173,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"NVIDIA Nemotron 3: Efficient and Open Intelligence","primary_cat":"cs.CL","submitted_at":"2025-12-24T00:24:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"NVIDIA releases the Nemotron 3 model family with hybrid Mamba-Transformer architecture, LatentMoE, NVFP4 training, MTP layers, and multi-environment RL post-training for reasoning and agentic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.06965","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EDUMATH: Generating Standards-aligned Educational Math Word Problems","primary_cat":"cs.CL","submitted_at":"2025-10-08T12:53:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EDUMATH introduces the first teacher-annotated dataset for standards-aligned math word problem generation and demonstrates that it enables smaller open LLMs to match larger models while producing problems students prefer over human-written ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2506.05640","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FedShield-LLM: A Secure and Scalable Federated Fine-Tuned Large Language Model","primary_cat":"cs.CR","submitted_at":"2025-06-06T00:05:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"FedShield-LLM integrates pruning and FHE on LoRA parameters to support secure, scalable federated fine-tuning of LLMs such as Llama-2.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2504.05299","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SmolVLM: Redefining small and efficient multimodal models","primary_cat":"cs.AI","submitted_at":"2025-04-07T17:58:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SmolVLM-256M outperforms a 300-times larger model using under 1 GB GPU memory, while the 2.2B version matches state-of-the-art VLMs at half the memory cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.16549","ref_index":74,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MathFlow: Enhancing the Perceptual Flow of MLLMs for Visual Mathematical Problems","primary_cat":"cs.CV","submitted_at":"2025-03-19T11:46:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MathFlow decouples perception and inference stages in MLLMs for visual math, with a dedicated perception model delivering gains on the FlowVerse benchmark when paired with existing reasoners.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.16982","ref_index":55,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Muon is Scalable for LLM Training","primary_cat":"cs.LG","submitted_at":"2025-02-24T09:12:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Muon optimizer with weight decay and update scaling achieves ~2x efficiency over AdamW for large LLMs, shown via the Moonlight 3B/16B MoE model trained on 5.7T tokens.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.02871","ref_index":240,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Position: Multimodal Large Language Models Can Significantly Advance Scientific Reasoning","primary_cat":"cs.CL","submitted_at":"2025-02-05T04:05:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Position paper claims multimodal LLMs can significantly advance scientific reasoning and proposes a four-stage roadmap plus challenges and suggestions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.02737","ref_index":120,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model","primary_cat":"cs.CL","submitted_at":"2025-02-04T21:43:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SmolLM2 is a 1.7B-parameter language model that outperforms Qwen2.5-1.5B and Llama3.2-1B after overtraining on 11 trillion tokens using custom FineMath, Stack-Edu, and SmolTalk datasets in a multi-stage pipeline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.01456","ref_index":144,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Process Reinforcement through Implicit Rewards","primary_cat":"cs.LG","submitted_at":"2025-02-03T15:43:48+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRIME enables online process reward model updates in LLM RL using implicit rewards from rollouts and outcome labels, yielding 15.1% average gains on reasoning benchmarks and surpassing a stronger instruct model with 10% of the data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2501.19201","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Efficient Reasoning with Hidden Thinking","primary_cat":"cs.CL","submitted_at":"2025-01-31T15:10:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Heima compresses verbose CoT into hidden thinking tokens via information-theoretic analysis and an adaptive interpreter, claiming maintained or improved zero-shot accuracy on reasoning benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.18104","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Training and Evaluating Language Models with Template-based Data Generation","primary_cat":"cs.CL","submitted_at":"2024-11-27T07:32:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"TDG uses GPT-4 to generate meta-templates that synthesize over 7 million verifiable grade school math problems for training and aligning LLMs on reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.18629","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Step-DPO: Step-wise Preference Optimization for Long-chain Reasoning of LLMs","primary_cat":"cs.LG","submitted_at":"2024-06-26T17:43:06+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Step-DPO performs preference optimization on individual reasoning steps rather than complete answers, producing nearly 3% accuracy gains on MATH for 70B+ parameter models with 10K preference pairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.08464","ref_index":159,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing","primary_cat":"cs.CL","submitted_at":"2024-06-12T17:52:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Magpie synthesizes 300K high-quality alignment instructions from Llama-3-Instruct via auto-regressive prompting on partial templates, enabling fine-tuned models to match official instruct performance on AlpacaEval, ArenaHard, and WildBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2405.04434","ref_index":75,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","primary_cat":"cs.CL","submitted_at":"2024-05-07T15:56:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepSeek-V2 delivers top-tier open-source LLM performance using only 21B active parameters by compressing the KV cache 93.3% and cutting training costs 42.5% via MLA and DeepSeekMoE.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"def is_not_prime(n): result = False for i in range(2,int(math.sqrt(n)) + 1): if n % i == 0: result = True return result [DONE] You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests: assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] [BEGIN] import heapq as hq def heap_queue_largest(nums,n): largest_nums = hq.nlargest(n, nums) return largest_nums [DONE] You are an expert Python programmer, and here is your task: Write a function"},{"citing_arxiv_id":"2403.14624","ref_index":64,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MathVerse: Does Your Multi-modal LLM Truly See the Diagrams in Visual Math Problems?","primary_cat":"cs.CV","submitted_at":"2024-03-21T17:59:50+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MathVerse is a benchmark that tests multi-modal LLMs on visual math by providing each problem in six versions with progressively less diagram and text information to measure true visual understanding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.13116","ref_index":37,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"A Survey on Knowledge Distillation of Large Language Models","primary_cat":"cs.CL","submitted_at":"2024-02-20T16:17:37+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A comprehensive survey of knowledge distillation for LLMs structured around algorithms, skill enhancement, and vertical applications, highlighting data augmentation as a key enabler.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2402.03300","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","primary_cat":"cs.CL","submitted_at":"2024-02-05T18:55:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeepSeekMath 7B reaches 51.7% on MATH via continued pretraining on curated web math data and Group Relative Policy Optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.02954","ref_index":86,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"DeepSeek LLM: Scaling Open-Source Language Models with Longtermism","primary_cat":"cs.CL","submitted_at":"2024-01-05T18:59:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"DeepSeek LLM 67B exceeds LLaMA-2 70B on code, mathematics and reasoning benchmarks after pre-training on 2 trillion tokens and alignment via SFT and DPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2312.08935","ref_index":93,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Math-Shepherd: Verify and Reinforce LLMs Step-by-step without Human Annotations","primary_cat":"cs.AI","submitted_at":"2023-12-14T13:41:54+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Math-Shepherd is an automatically trained process reward model that scores solution steps to verify and reinforce LLMs, lifting Mistral-7B from 77.9% to 89.1% on GSM8K and 28.6% to 43.5% on MATH.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.10631","ref_index":201,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Llemma: An Open Language Model For Mathematics","primary_cat":"cs.CL","submitted_at":"2023-10-16T17:54:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Continued pretraining of Code Llama on Proof-Pile-2 yields Llemma, an open math-specialized LLM that beats known open base models on MATH and supports tool use plus formal proving out of the box.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2309.17452","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ToRA: A Tool-Integrated Reasoning Agent for Mathematical Problem Solving","primary_cat":"cs.CL","submitted_at":"2023-09-29T17:59:38+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ToRA trains language models on interactive tool-use trajectories with imitation learning and output shaping to integrate reasoning and external tools, yielding 13-19% gains on math datasets and new highs like 44.6% on MATH for a 7B model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}