{"total":15,"items":[{"citing_arxiv_id":"2607.01223","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Theoria: Rewrite-Acceptability Verification over Informal Reasoning States","primary_cat":"cs.AI","submitted_at":"2026-07-01T17:56:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Theoria rewrites solutions into auditable typed state transitions with justifications, certifying 105 of 185 HLE problems at 91.4% precision and outperforming holistic judges on adversarial poisoned proofs by catching hidden premises.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.31748","ref_index":104,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Addressing Over-Refusal in LLMs with Competing Rewards","primary_cat":"cs.LG","submitted_at":"2026-06-30T14:38:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SEAR trains one LLM via adversarial process rewards to explore harmful reasoning paths but flip to safe outputs, reducing over-refusal while preserving safety.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28166","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Tandem Reinforcement Learning with Verifiable Rewards","primary_cat":"cs.AI","submitted_at":"2026-06-26T15:00:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"TRL extends tandem training to RLVR pipelines, matching GRPO solo reasoning on Qwen3-4B math tasks while improving handoff robustness, reducing distributional drift, and increasing CoT legibility for the junior.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.10684","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Divide and Cooperate: Role-Decomposed Multi-Agent LLM Training with Cross-Agent Learning Signals","primary_cat":"cs.LG","submitted_at":"2026-06-09T10:40:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DAC decomposes agentic search into cooperative searcher and generator agents with cross-agent signals (abstention reward and hard-positive augmentation), achieving strong QA benchmark performance via LoRA on a shared backbone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09711","ref_index":92,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Proxy Reward Internalization and Mechanistic Exploitation: A Learned Precursor to Reward Hacking and Its Generalization","primary_cat":"cs.AI","submitted_at":"2026-06-08T16:32:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Proxy RL produces a staged proxy-internalization capability that emerges before and predicts reward hacking in coding environments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00424","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Weak Critics Make Strong Learners: On-Policy Critique Distillation for Scalable Oversight","primary_cat":"cs.AI","submitted_at":"2026-05-29T23:21:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Weak models used as critics supplying non-misleading revision directions, distilled on-policy via OPCD, improve frozen and trained strong models on reasoning and alignment benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30290","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Self-Trained Verification for Training- and Test-Time Self-Improvement","primary_cat":"cs.LG","submitted_at":"2026-05-28T17:40:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Self-trained verification trains verifiers to imitate informed versions of themselves using reference solutions, improving test-time V-R loops and training-time self-improvement with reported gains of 2x on hard math and 14x on scientific reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28807","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Calibrating Conservatism for Scalable Oversight","primary_cat":"cs.AI","submitted_at":"2026-05-27T17:56:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CCO aggregates scoring functions into a calibrated penalty using conformal decision theory to enforce target violation rates for AI oversight on benchmarks like modified SWE-bench and MACHIAVELLI.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27914","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Does Capability Transfer to Subjective Behavior -- and Would Our Instruments Tell Us? A Self-Evolving, Trust-by-Construction Evaluation Paradigm","primary_cat":"cs.CL","submitted_at":"2026-05-27T03:41:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Self-evolving rubric with anti-gaming fitness reveals that objective capability scaling fails to transfer to subjective LLM behaviors, with advice-restraint as the universal lowest dimension that can regress.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22211","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CLORE: Content-Level Optimization for Reasoning Efficiency","primary_cat":"cs.AI","submitted_at":"2026-05-21T09:16:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CLORE augments correct on-policy rollouts by deleting repetitive and irrelevant segments then optimizes with auxiliary DPO to improve accuracy-efficiency trade-off on math benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20531","ref_index":14,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Pseudo-Formalization for Automatic Proof Verification","primary_cat":"cs.LO","submitted_at":"2026-05-19T22:08:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Pseudo-Formalization decomposes proofs into self-contained natural language modules for independent LLM-based Block Verification, outperforming LLM-as-judge baselines on olympiad and research math benchmarks while releasing ArxivMathGradingBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.10810","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Likelihood scoring for continuations of mathematical text: a self-supervised benchmark with tests for shortcut vulnerabilities","primary_cat":"cs.LG","submitted_at":"2026-05-11T16:32:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Presents a likelihood-based benchmark for equation-suffix prediction in technical papers with controls to detect shortcut vulnerabilities in model forecasts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13875","ref_index":207,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Common-agency Games for Multi-Objective Test-Time Alignment","primary_cat":"cs.GT","submitted_at":"2026-05-08T06:56:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CAGE uses common-agency games and an EPEC algorithm to compute equilibrium policies that balance multiple conflicting objectives for test-time LLM alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2503.11926","ref_index":76,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Monitoring Reasoning Models for Misbehavior and the Risks of Promoting Obfuscation","primary_cat":"cs.AI","submitted_at":"2025-03-14T23:50:34+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Chain-of-thought monitoring detects reward hacking in frontier reasoning models, but strong optimization against the monitor produces obfuscated misbehavior that remains hard to detect.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2412.21187","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs","primary_cat":"cs.CL","submitted_at":"2024-12-30T18:55:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"o1-like models overthink easy tasks; self-training reduces compute use without accuracy loss on GSM8K, MATH500, GPQA, and AIME.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}