{"total":338,"items":[{"citing_arxiv_id":"2606.17474","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AIPatient Arena: EHR-grounded evaluation of large language models in end-to-end clinical consultation workflows","primary_cat":"cs.CL","submitted_at":"2026-06-16T03:35:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AIPatient Arena is an EHR-grounded multi-turn evaluation framework for LLMs in clinical consultations that scores models on eight competence dimensions across 437+ patients, finding strengths in questioning and ethics but weaknesses in diagnostic reasoning and ambiguity handling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12594","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pythagoras-Prover: Advancing Efficient Formal Proving via Augmented Lean Formalisation","primary_cat":"cs.AI","submitted_at":"2026-06-10T18:43:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Pythagoras-Prover family achieves 93.0% on MiniF2F-Test with a 32B model and has its 4B version surpass a 671B prior model at pass@32, using ALF data augmentation and curriculum training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07779","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Do Vision-Language Models See Dwarf Galaxies the Way We Do?","primary_cat":"astro-ph.IM","submitted_at":"2026-06-05T18:45:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Zero-shot VLMs reproduce aggregate human annotations on dwarf galaxy detection but exhibit high per-example variability and unreliable self-reported confidence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06416","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Unsupervised Skill Discovery for Agentic Data Analysis","primary_cat":"cs.AI","submitted_at":"2026-06-04T17:20:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DataCOPE uses verifier-guided contrastive distillation from agent trajectories to discover skills, yielding average gains of 9.71% on report-style and 32.30% on reasoning-style data analysis tasks across four model settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06322","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DragOn: A Benchmark and Dataset for Drag-Based GUI Interactions","primary_cat":"cs.AI","submitted_at":"2026-06-04T15:57:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DragOn provides a new drag-grounding benchmark and training dataset for GUI agents, with evaluations suggesting potential improvements on computer-use tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05421","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ComplexityMT: Benchmarking the Interaction Between Text Complexity and Machine Translation","primary_cat":"cs.CL","submitted_at":"2026-06-03T20:38:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"ComplexityMT benchmark finds higher CEFR levels increase translation difficulty and MT systems often shift target CEFR levels versus source texts in most of six languages tested.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01476","ref_index":28,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniOPD: Logit-Free On-Policy Distillation via Speculative Verification","primary_cat":"cs.LG","submitted_at":"2026-05-31T22:31:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniOPD replaces token-level logit matching in on-policy distillation with Monte Carlo chunk-level semantic verification and a peak-entropy scheduler.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01247","ref_index":67,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Where to Look: Can Foundation Models Reach a Target Viewpoint Through Active Exploration?","primary_cat":"cs.CV","submitted_at":"2026-05-31T14:00:10+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Introduces the TVR active viewpoint-matching task and TVRBench indoor simulation benchmark, where foundation models start at low single-digit success rates but reach 51.4% after visual-action SFT and multi-turn GRPO post-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01139","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SkillRevise: Improving LLM-Authored Agent Skills via Trace-Conditioned Skill Revision","primary_cat":"cs.AI","submitted_at":"2026-05-31T10:19:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SkillRevise iteratively refines initial LLM-generated agent skills using execution traces to diagnose defects and apply repairs, raising success rates from 36.05% to 61.63% on SkillsBench across three benchmarks and five LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01091","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Research as Rubric for Reinforcement Learning","primary_cat":"cs.CL","submitted_at":"2026-05-31T08:25:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DR-rubric is a two-stage framework using iterative agentic search to generate atomic verifiable constraints for GRPO-based RL, achieving competitive performance on 6 benchmarks with 1K-3K examples via bootstrap or frontier-model rubrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00750","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"I-WebGenBench : Evaluating Interactivity in LLM-Generated Scientific Web Applications","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:34:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A Paper-to-Interactive-System Agent and I-WebGenBench benchmark with 19 papers enable converting scientific PDFs into executable interactive web systems, with PaperVoyager framework shown to improve quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00708","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MOSAIC: Modular Orchestration for Structured Agentic Intelligence and Composition","primary_cat":"cs.AI","submitted_at":"2026-05-30T12:31:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MOSAIC structures LLM-based model selection via memory-grounded blueprints and failure-aware RL, reporting gains in performance and traceability on financial time-series tasks over AutoML and agent baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00622","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MM-Snowball: Evaluating and Mitigating Hallucination Snowballing in Multimodal Multi-Turn Dialogue","primary_cat":"cs.CV","submitted_at":"2026-05-30T08:53:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MM-Snowball benchmark diagnoses hallucination snowballing in multi-turn MLLM dialogues; CAVR mitigates it via dual visual rectification at representation and logit levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00547","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning to Retrieve: Dual-Level Long-Term Memory for Text-to-SQL Agents","primary_cat":"cs.CL","submitted_at":"2026-05-30T05:44:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MERIT maintains episode-level and turn-level memories with RL-optimized retrieval and a process reward model, outperforming baselines on BIRD-Interact with positive transfer to Spider2-Snow.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31093","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cross-Modal Clinical Knowledge Integration for Mammography Report Generation","primary_cat":"cs.CV","submitted_at":"2026-05-29T10:04:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"MODERATE","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MammoRG integrates cross-modal prior clinical knowledge and BI-RADS terminology via two-stage training to generate mammography reports with higher clinical consistency than prior direct image-to-text methods.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31075","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Task-Focused Memorization for Multimodal Agents","primary_cat":"cs.CV","submitted_at":"2026-05-29T09:43:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TaskMem uses RL in two phases to learn a task-focused memorization policy for multimodal agents, yielding 5.3-7.0% VQA accuracy gains on reformulated streaming benchmarks from VideoMME, EgoLife, and EgoTempo.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30963","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AMix-2: Establishing Protein as a Native Modality in Large Language Models","primary_cat":"q-bio.BM","submitted_at":"2026-05-29T07:58:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AMix-2 unifies protein sequences and text in one LLM via shared tokens and block-wise diffusion modeling, introduces the ProteinArena benchmark, and reports competitive performance against task-specific protein models and frontier LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30961","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoGens: A Population-Based Heuristic Search Framework for Scientific Idea Generation","primary_cat":"cs.CL","submitted_at":"2026-05-29T07:56:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EvoGens uses rank-based mutation, semantic-aware crossover, and lightweight evaluation to evolve populations of LLM-generated scientific ideas, boosting novelty and diversity metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30883","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TRACE: Task-Aware Adaptive Self-Evolving Agentic Jailbreaking","primary_cat":"cs.CR","submitted_at":"2026-05-29T06:13:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TRACE is a task-aware adaptive self-evolving jailbreaking framework that achieves up to 100% bypass rates on LLM agents via subtask decomposition and scenario evolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30834","ref_index":62,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Hide-and-Seek in Trajectories: Discovering Failure Signals for VLA Runtime Monitoring","primary_cat":"cs.RO","submitted_at":"2026-05-29T04:40:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hide-and-Seek uses contrastive objectives on trajectories to localize failure signals in VLA models from trajectory-level supervision alone.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11232","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Every Act Has Its Price: Compressed Moral Composition in Frontier LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:36:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moral Trolley Arena shows frontier LLMs produce composite moral preferences that are compressed rather than additive functions of calibrated component act strengths across Moral Foundations Theory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30738","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MAVEN: Improving Generalization in Agentic Tool Calling","primary_cat":"cs.AI","submitted_at":"2026-05-29T02:02:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"MAVEN is a modular verification scaffold that lifts an open 120b model's tool-calling accuracy from 48% to 71% on MAVEN-Bench without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30693","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Triaging Threats to Specialized Guardrails","primary_cat":"cs.CR","submitted_at":"2026-05-29T00:36:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces GuardZoo benchmark and RouteGuard router-expert system showing monolithic guardrails suffer task interference while specialized routing improves threat detection and generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30637","ref_index":84,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EHRBench: An Automated and Reliable EHR-based Benchmark for Clinical Decision Making with LLMs","primary_cat":"cs.AI","submitted_at":"2026-05-28T22:38:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EHRBench uses an EHR-LLM-KB pipeline to automatically create 960,067 reliable QA items spanning diagnosis, treatment, and prognosis for large-scale LLM evaluation in clinical decision making.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30599","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AMNESIA: A Large Scale Medical Unlearning Benchmark Suite with Disease-Informed Analysis","primary_cat":"cs.LG","submitted_at":"2026-05-28T21:46:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"AMNESIA is a benchmark suite of 70,560 medical QA pairs that evaluates unlearning methods and shows that patient-level unlearning erodes disease-shared knowledge.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30569","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Any-ttach: Quick End-effector Swapping Enables Manipulation Dexterity with Simplicity","primary_cat":"cs.RO","submitted_at":"2026-05-28T21:00:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Any-ttach shows that rapid end-effector swapping combined with demonstration collection and task planning enables reliable multi-tool skills in long-horizon tasks such as sandwich making.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30557","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seeing Isn't Knowing: Do VLMs Know When Not to Answer Spatial Questions (and Why)?","primary_cat":"cs.CV","submitted_at":"2026-05-28T20:44:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Frontier VLMs overconfidently answer spatial questions under occlusion (~30% accuracy) and perspective ambiguity (<10% accuracy) instead of abstaining, and often fail to select helpful additional views.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30241","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CommunityFact: A Dynamic, Multilingual, Multi-domain Benchmark for Misinformation Detection in the Wild","primary_cat":"cs.CL","submitted_at":"2026-05-28T17:09:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CommunityFact provides a new dynamic benchmark showing web access improves LLM misinformation detection but source selection remains misaligned with human Community Notes raters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30219","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Should Models Change Their Minds? Contextual Belief Management in Large Language Models","primary_cat":"cs.AI","submitted_at":"2026-05-28T16:52:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces BeliefTrack benchmark diagnosing three CBM failures in LLMs and shows RL with belief-state rewards cuts failure rates by 70.9% while representation steering cuts them by 46.1%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30161","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Far Looks Up: Probing Spatial Representation in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-28T16:18:01+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VLMs exhibit consistent vertical-distance entanglement in embeddings from perspective bias in natural images, producing accuracy gaps that a new synthetic benchmark SpatialTunnel exposes as model-intrinsic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30105","ref_index":58,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoRepair: Enhancing Vulnerability Repair Agents Through Experience-Based Self-Evolution","primary_cat":"cs.SE","submitted_at":"2026-05-28T15:46:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"EvoRepair is the first experience-based self-evolving agent framework for automated vulnerability repair, reporting 90.46% overall success on PATCHEVAL and SEC-bench benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29928","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Label Over Logic? How Source Cues Bias Human Fallacy Judgments More Than LLMs","primary_cat":"cs.HC","submitted_at":"2026-05-28T13:41:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Humans exhibit greater source-label bias in logical fallacy judgments than LLMs, which maintain more consistent evaluations regardless of source cues.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00123","ref_index":47,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CardioLens: Revealing the Clinical Reality Gap of MLLMs via Multi-Sequence Cardiac MRI Evaluations","primary_cat":"cs.CV","submitted_at":"2026-05-28T11:03:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CardioLens is a leakage-resistant CMR testbed of 473k slices and 13k QA pairs showing current MLLMs exhibit a large clinical reality gap with category-collapse failures on real workflows.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29707","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Domino: Decoupling Causal Modeling from Autoregressive Drafting in Speculative Decoding","primary_cat":"cs.CL","submitted_at":"2026-05-28T10:07:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Domino decouples causal dependency modeling from autoregressive draft execution via a parallel backbone plus lightweight causal head and a base-anchored training curriculum, reporting up to 5.49x speedup.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29548","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Larger Models Learn More: Effects of Capacity, Interference, and Rare-Task Retention","primary_cat":"cs.LG","submitted_at":"2026-05-28T08:02:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Larger models succeed on rare and complex tasks by reducing gradient interference from common tasks, allowing rare-task features to accumulate, as shown via synthetic task mixtures and OLMo pretraining from 4M to 4B parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29522","ref_index":56,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepSurvey: Enhancing Analytical Depth and Citation Reliability in Automated Survey Generation","primary_cat":"cs.AI","submitted_at":"2026-05-28T07:40:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeepSurvey introduces an agentic system for automated survey generation that improves depth through full-text keynotes, cross-paper clustering, and code analysis, while boosting citation reliability via graph expansion, hybrid filtering, and evidence-constrained assignment, with reported gains over ","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29262","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Harmonizing Real-Time Constraints and Long-Horizon Reasoning: An Asynchronous Agentic Framework for Dynamic Scheduling","primary_cat":"cs.AI","submitted_at":"2026-05-28T02:26:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RACE-Sched is an asynchronous dual-stream agent framework combining low-latency symbolic heuristics with parallel LLM-based rule synthesis and sandbox validation for dynamic flexible job shop scheduling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28778","ref_index":69,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Can LLMs Use Linguistic Uncertainty Markers to Reliably Reflect Intrinsic Confidence?","primary_cat":"cs.CL","submitted_at":"2026-05-27T17:38:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs struggle to associate epistemic markers with stable internal confidence levels across distributions, even under model-centric interpretations, while maintaining somewhat consistent marker rankings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28465","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond One Path: Evaluating and Enhancing Divergent Thinking in Interactive LLM Agents","primary_cat":"cs.CL","submitted_at":"2026-05-27T13:33:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces MUTATE benchmark for path-level and action-level divergent thinking in LLM agents and ReDNA method that decouples divergent generation from convergent selection to improve performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28315","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HardMTBench: Stress-Testing Chinese-English Translation on Knowledge-Intensive Domains","primary_cat":"cs.CL","submitted_at":"2026-05-27T11:16:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HardMTBench is a difficulty-aware benchmark of 20,000 directional test items across 12 domains that widens GEMBA score ranges by a factor of two and reveals domain-specific weaknesses in 22 MT systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28305","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting Anthropomorphic Reflection Markers in Large Language Model Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-27T11:00:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Suppressing anthropomorphic reflection markers via prompt and token interventions preserves or improves LLM reasoning performance on four benchmarks while models continue marker-free verification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27918","ref_index":33,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Addressing Variable Heterogeneity in Distributed Multimodal Training with Entrain","primary_cat":"cs.DC","submitted_at":"2026-05-27T03:44:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Entrain reduces microbatch workload variability by up to 10.6x and improves multimodal LLM training throughput by 1.4x via static model parallelism and deferred hierarchical microbatch assignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27898","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Unified Framework for the Evaluation of LLM Agentic Capabilities","primary_cat":"cs.AI","submitted_at":"2026-05-27T03:20:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A unified framework standardizes LLM agent benchmarks and demonstrates through 400K rollouts that scaffold and environment choices materially alter outcomes, enabling separation of intrinsic capabilities from artifacts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27824","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revealing Algorithmic Deductive Circuits for Logical Reasoning","primary_cat":"cs.AI","submitted_at":"2026-05-27T01:30:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Causal mediation analysis on symbolic-aided CoT prompting localizes ~3% of attention heads to factual/rule retrieval for sub-tasks and higher layers to integration of graph-traversal strategies.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23559","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PathNavigate: A Training-Free Pathology Agent with Surprise-Guided Scan and Shared Slide Memory for Whole-Slide Image VQA","primary_cat":"cs.CV","submitted_at":"2026-05-22T12:25:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PathNavigate introduces a scan-search-readout routine with surprise-guided low-mag scanning and shared slide memory to improve training-free WSI-VQA accuracy and efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23420","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Naturalistic measure of social norms alignment","primary_cat":"cs.CL","submitted_at":"2026-05-22T09:29:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Proposes solution matching metrics (stated and explicit agreement accuracy) and a 3k Danish dilemma dataset to evaluate social norms alignment between LLMs and humans in naturalistic settings.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23258","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Simple Plug-in for Improving Eviction-Based KV Cache Compression","primary_cat":"cs.LG","submitted_at":"2026-05-22T06:00:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"VECTOR augments eviction-based KV cache compression with three-way token routing that combines importance scoring and offline regression-based reconstructability estimation to improve quality at high compression ratios.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23176","ref_index":89,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIVESPATIAL: A Benchmark for Spatiotemporal Intelligence in VLMs for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-22T02:52:06+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22907","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VideoOdyssey: A Benchmark for Ultra-Long-Context and Omni-Modal Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-21T18:00:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VideoOdyssey is a new benchmark featuring ultra-long videos (avg. 109 min) across 11 domains with multi-level continuous certificates (avg. 16 min for visual, 12.8 min for audio-visual) to diagnose MLLM limitations in continuous reasoning and omni-modal perception.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22902","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Transcoders Trace Visual Grounding and Hallucinations in Vision-Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-21T17:34:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Transcoders decompose MLP layers in Gemma 3-4B-IT to trace visual grounding more effectively than SAEs and predict hallucinations from circuit graph features at AUC 0.68.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}