{"total":19,"items":[{"citing_arxiv_id":"2606.31134","ref_index":28,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Beyond the Library: An Agentic Framework for Autoformalizing Research Mathematics","primary_cat":"cs.AI","submitted_at":"2026-06-30T05:05:03+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Agentic LLM framework autoformalizes 32 Putnam problems and main theorems plus proofs from five STOC papers into Lean 4, with two proofs using only kernel axioms.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04883","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Optimizing the Cost-Quality Tradeoff of Agentic Theorem Provers in Lean","primary_cat":"cs.CL","submitted_at":"2026-06-03T13:46:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"An agentic theorem prover in Lean uses a control plane to route actions based on cost and success estimates, achieving 28.9% lower average cost than a fixed-step baseline on a PutnamBench subset while preserving performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30914","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Automating Formal Verification with Reinforcement Learning and Recursive Inference","primary_cat":"cs.LG","submitted_at":"2026-05-29T06:59:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"RLVR training raises verified Dafny pass rates from 9.7% to 31.1% on a filtered benchmark while a Lean proof scaffold lifts success from 46.2% to 69.2% on a pilot set and solves 7 of 42 prior unsolved tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29955","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Formalizing Mathematics at Scale","primary_cat":"cs.AI","submitted_at":"2026-05-28T14:00:22+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A multi-agent framework called AutoformBot autoformalized 26 textbooks spanning analysis, algebra, topology, combinatorics and probability into a verified Lean 4 library of 45k declarations, demonstrating scalable formalization of graduate math.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23772","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Agentic Proving for Program Verification","primary_cat":"cs.AI","submitted_at":"2026-05-22T15:41:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Agentic Claude reaches 98.8% valid specs, 87.5% implementation certification, and 98.1% end-to-end success on CLEVER, revealing a mismatch between benchmark difficulty and current prover performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20244","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lean Refactor: Multi-Objective Controllable Proof Optimization via Agentic Strategy Search","primary_cat":"cs.LO","submitted_at":"2026-05-18T04:19:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Lean Refactor uses retrieval from a curated multi-objective strategy database to guide frozen LLMs in refactoring Lean proofs, reporting over 70% token compression on benchmarks and improved version transfer.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17283","ref_index":155,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OProver: A Unified Framework for Agentic Formal Theorem Proving","primary_cat":"cs.CL","submitted_at":"2026-05-17T06:39:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OProver-32B achieves top Pass@32 scores on MiniF2F, ProverBench, and PutnamBench by combining continued pretraining with iterative agentic proving, retrieval, SFT on repairs, and RL on unresolved cases using a 6.86M-proof dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17255","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CAM-Bench: A Benchmark for Computational and Applied Mathematics in Lean","primary_cat":"cs.AI","submitted_at":"2026-05-17T04:53:47+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CAM-Bench is a new Lean 4 theorem-proving benchmark of 1,000 problems in computational and applied mathematics, built from textbook exercises using a dependency-recovery pipeline to reconstruct local context.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11905","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Rethinking Supervision Granularity: Segment-Level Learning for LLM-Based Theorem Proving","primary_cat":"cs.AI","submitted_at":"2026-05-12T10:18:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Segment-level supervision extracts coherent proof segments to train policy models that achieve 61-66% success on miniF2F, outperforming step-level and whole-proof methods while also improving existing provers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16379","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"An Information-Theoretic Criterion for Efficient Data Synthesis","primary_cat":"cs.LG","submitted_at":"2026-05-11T01:27:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Synthetic data improves models only in information-open generation-training loops with external signals, and coarser signals like binary correctness enable better generalization by converging to the most information-efficient component.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08817","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"How You Begin is How You Reason: Driving Exploration in RLVR via Prefix-Tuned Priors","primary_cat":"cs.AI","submitted_at":"2026-05-09T09:10:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"IMAX trains soft prefixes with an InfoMax reward to drive diverse exploration in RLVR, yielding up to 11.60% gains in Pass@4 over standard RLVR across model scales.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Variational inference: A review for statisticians.Journal of the American Statistical Association, 112(518):859-877, 2017. [3] Hardy Chen, Haoqin Tu, Fali Wang, Hui Liu, Xianfeng Tang, Xinya Du, Yuyin Zhou, and Cihang Xie. Sft or rl? an early investigation into training r1-like reasoning large vision-language models.arXiv preprint arXiv:2504.11468, 2025. [4] Luoxin Chen, Jinming Gu, Liankai Huang, Wenhao Huang, Zhicheng Jiang, Allan Jie, Xiaoran Jin, Xing Jin, Chenggang Li, Kaijing Ma, et al. Seed-prover: Deep and broad reasoning for automated theorem proving.arXiv preprint arXiv:2507.23726, 2025. [5] Xi Chen, Yan Duan, Rein Houthooft, John Schulman, Ilya Sutskever, and Pieter Abbeel. Infogan: Interpretable representation learning by information maximizing generative adversarial nets."},{"citing_arxiv_id":"2605.06184","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Teaching LLMs Program Semantics via Symbolic Execution Traces","primary_cat":"cs.SE","submitted_at":"2026-05-07T13:01:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Training Qwen3-8B on symbolic execution traces from Soteria improves violation detection in C programs by over 17 points, transfers across five property types, and shows superadditive gains with chain-of-thought.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"A benchmark for vericoding: formally verified program synthesis.arXiv preprint arXiv:2509.22908, 2025. [8] Calcagno, C., Distefano, D., Dubreil, J., Gabi, D., Hooimeijer, P., Luca, M., O'Hearn, P., Papakonstantinou, I., Purbrick, J., and Rodriguez, D. Moving fast with software verification. In NASA Formal Methods Symposium, pp. 3-11. Springer, 2015. [9] Chen, L., Gu, J., Huang, L., Huang, W., Jiang, Z., Jie, A., Jin, X., Jin, X., Li, C., Ma, K., et al. Seed-prover: Deep and broad reasoning for automated theorem proving.arXiv preprint arXiv:2507.23726, 2025. [10] Chen, N., Li, Z., Bao, K., Lin, J., and Liu, D. Chain of execution supervision promotes general reasoning in large language models.arXiv preprint arXiv:2510."},{"citing_arxiv_id":"2604.24797","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Network Structure of Mathlib","primary_cat":"cs.LO","submitted_at":"2026-04-26T19:46:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Network analysis of Mathlib reveals 50.9% coupling between human taxonomies and logical dependencies, median 1.6% import usage by developers, and centrality driven by infrastructure rather than mathematical content.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.20209","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Scaling Self-Play with Self-Guidance","primary_cat":"cs.LG","submitted_at":"2026-04-22T05:50:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SGS adds self-guidance to LLM self-play for Lean4 theorem proving, surpassing RL baselines and enabling a 7B model to outperform a 671B model after 200 rounds.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"Result Buffer CPU Workers CPUs Slurm Job Result Buffer Task Buffer CPUs Slurm Job Result Buffer Task Buffer CPUs Slurm Job Result Buffer Task Buffer Done In flight Not Done (1) GPU Worker requests prompts when internal buffer < half full (2) Server sends prompts, marks them as \"in flight\" (3) Worker returns rollouts, server marks them as \"done\" (6) Server sends proofs, marks them as \"in flight\" Task Buffer Done In flight Not Done (5) CPU Worker requests proofs when internal buffer < half full (7) Worker returns verifications, server marks them as \"done\" (1) Worker reports it has died (due to error or preemption) (2) Server marks worker \"in flight\" tasks as \"not done\" (3) Server"},{"citing_arxiv_id":"2604.16347","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lean Atlas: An Integrated Proof Environment for Scalable Human-AI Collaborative Formalization","primary_cat":"cs.HC","submitted_at":"2026-03-16T14:19:33+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Lean Atlas visualizes Lean 4 dependency graphs and applies Lean Compass to reduce the nodes needing human semantic review by 27-99% across six evaluated projects.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.24273","ref_index":18,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Minimal Agent for Automated Theorem Proving","primary_cat":"cs.AI","submitted_at":"2026-02-27T18:43:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A minimal agentic system achieves competitive performance in automated theorem proving with a simpler design and lower cost than state-of-the-art methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.18857","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CORE: Concept-Oriented Reinforcement for Bridging the Definition-Application Gap in Mathematical Reasoning","primary_cat":"cs.AI","submitted_at":"2025-12-21T19:01:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CORE is a concept-oriented RL method that synthesizes quizzes, injects concept snippets into rollouts, and reinforces conceptual trajectories to close the gap between restating definitions and applying them in math problems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.12787","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Ax-Prover: A Deep Reasoning Agentic Framework for Theorem Proving in Mathematics and Quantum Physics","primary_cat":"cs.AI","submitted_at":"2025-10-14T17:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Ax-Prover is a tool-using multi-agent LLM system that matches state-of-the-art provers on public math benchmarks and outperforms them on new abstract-algebra and quantum-theory benchmarks while also assisting an expert with a cryptography proof.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.01346","ref_index":5,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Aristotle: IMO-level Automated Theorem Proving","primary_cat":"cs.AI","submitted_at":"2025-10-01T18:21:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Aristotle reaches gold-medal-equivalent performance on 2025 IMO problems via integrated Lean proof search, informal lemma formalization, and a dedicated geometry solver.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}