{"total":740,"items":[{"citing_arxiv_id":"2606.25561","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CrypFormBench: Benchmarking Formal Analysis Capability of Large Language Models for Cryptographic Schemes","primary_cat":"cs.CR","submitted_at":"2026-06-24T08:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CrypFormBench is a new benchmark jointly covering symbolic and computational security to evaluate LLMs on five formal analysis capabilities, with results showing top model Claude-3.5 scores 48.7/100 and most models struggling on generation, transformation, and correction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20173","ref_index":141,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Qiskit Code Migration with LLMs","primary_cat":"cs.SE","submitted_at":"2026-06-18T12:40:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A taxonomy-guided RAG system with LLMs reduces hallucinations and improves migration suggestions for Qiskit code compared to unconstrained retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.19025","ref_index":92,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FoMoE: Breaking the Full-Replica Barrier with a Federation of MoEs","primary_cat":"cs.LG","submitted_at":"2026-06-17T12:50:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FoMoE partitions expert layers across workers in MoE LLMs, skips non-resident experts, and reports up to 1.42x lower communication than baselines plus 1.4x throughput gains while maintaining stable routing.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11167","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Multi-Faceted Interactivity Alignment in Full-Duplex Speech Models","primary_cat":"cs.CL","submitted_at":"2026-06-09T17:46:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-axis RL alignment technique improves pause handling, turn-taking, backchanneling, and interruption response in full-duplex spoken dialogue models by optimizing axis-specific rewards derived from human audio segments.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08620","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPA: A SQL-Plan-Aware Reinforcement Learning Framework for Query Rewriting with LLMs","primary_cat":"cs.DB","submitted_at":"2026-06-07T13:12:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPA trains LLMs via plan-aware RL with adaptive reward shaping and self-improvement on slowdowns to produce faster query rewrites than rule-based or standard LLM methods on IID and OOD workloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01476","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniOPD: Logit-Free On-Policy Distillation via Speculative Verification","primary_cat":"cs.LG","submitted_at":"2026-05-31T22:31:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniOPD replaces token-level logit matching in on-policy distillation with Monte Carlo chunk-level semantic verification and a peak-entropy scheduler.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01414","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agent Skills Should Go Beyond Text: The Case for Visual Skills","primary_cat":"cs.CV","submitted_at":"2026-05-31T19:22:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper proposes that reusable agent skills should incorporate visual elements alongside text, introduces three forms of visual skills and an automatic conversion system, and reports better performance on GUI and visual-centric tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01326","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reducing Token Usage of State-in-Context Agents using Minification","primary_cat":"cs.SE","submitted_at":"2026-05-31T16:24:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"Code minification reduces average input token usage by 42% in state-in-context agents with a 12 percentage point drop in resolution rate on SWE-bench Verified.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01213","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TECCI: Tricky Edits of Collected and Curated Images","primary_cat":"cs.CV","submitted_at":"2026-05-31T13:03:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TECCI benchmark reveals that five leading text-guided image editing models all achieve under 22% overall success rate on challenging edits, with particular struggles on architecture, nature, reasoning, and creative tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01075","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"On the Generalization Gap in Self-Evolving Language Model Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-31T07:43:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Closed-loop self-evolution on LLMs improves reasoning on Knights and Knaves tasks but plateaus short of oracle-supervised levels, with multi-turn revision nearly matching it for large models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00931","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CV-Arena: An Open Benchmark for Instructional Computer Vision Problem Solving with Human-AI Collaborative Preferences","primary_cat":"cs.CV","submitted_at":"2026-05-30T23:37:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CV-Arena is a new 12K-pair benchmark for instruction-guided real-image editing with 16 task types, CogRetriever curation, and Active Elo mixed human-AI evaluation that finds gaps in 21 models and presents CV-Agent.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00765","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FALAT: Tracing Failures in LLM Agent Trajectories via Dependency-Guided Search","primary_cat":"cs.AI","submitted_at":"2026-05-30T15:11:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FALAT improves failure attribution in LLM agent trajectories via dependency-guided search, achieving 46.0% step-level accuracy on algorithm-generated and 29.1% on hand-crafted trajectories in the Who&When benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00750","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"I-WebGenBench : Evaluating Interactivity in LLM-Generated Scientific Web Applications","primary_cat":"cs.CL","submitted_at":"2026-05-30T14:34:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A Paper-to-Interactive-System Agent and I-WebGenBench benchmark with 19 papers enable converting scientific PDFs into executable interactive web systems, with PaperVoyager framework shown to improve quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00651","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MESA: Improving MoE Safety Alignment via Decentralized Expertise","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:54:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MESA decentralizes safety duties in MoE LLMs via expert capacity reallocation and dynamic routing refinement based on optimal transport theory, yielding robust defense on harmful benchmarks while preserving helpfulness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00622","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MM-Snowball: Evaluating and Mitigating Hallucination Snowballing in Multimodal Multi-Turn Dialogue","primary_cat":"cs.CV","submitted_at":"2026-05-30T08:53:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MM-Snowball benchmark diagnoses hallucination snowballing in multi-turn MLLM dialogues; CAVR mitigates it via dual visual rectification at representation and logit levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00564","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decomposed On-Policy Distillation for Vision-Language Reasoning: Steering Gradients for Visual Grounding","primary_cat":"cs.CV","submitted_at":"2026-05-30T06:34:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Decomposes VLM distillation loss into orthogonal language and visual components and introduces Visual Gradient Steering to prioritize visual grounding over standard monolithic optimization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00562","ref_index":42,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepLatent: Think with Images via Parallel Latent Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-30T06:33:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepLatent introduces a parallel latent visual reasoning framework with learnable 2D tokens and continuous RL, trained via distillation then RL, plus a new 180K dataset, claiming SOTA benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00507","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LaSR: Context-Aware Speech Recognition via Latent Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-30T03:44:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LaSR improves context-aware terminology recognition in speech LLMs by aligning latent CoT supervision on acoustic regions and introducing latent reasoning periods, shown on a new academic corpus to outperform standard fine-tuning without added latency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00241","ref_index":209,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"InfoAtlas: A Foundation Model for Zero-Shot Statistical Dependence Estimate","primary_cat":"cs.LG","submitted_at":"2026-05-29T18:16:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"InfoAtlas is a pretrained neural model for zero-shot mutual information estimation that matches state-of-the-art accuracy with 100x speedup and handles varying dimensions via a single model.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31598","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Linear Scaling Video VLMs for Long Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-29T17:59:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"StateKV is an inference-time technique that replaces quadratic self-attention prefill in video VLMs with a fixed-capacity importance-based recurrent state, keeping accuracy near full attention on long-video benchmarks without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31455","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIFT: Decoupled Rollouts and Importance-Weighted Fine-Tuning for Efficient Multi-Turn Optimization","primary_cat":"cs.LG","submitted_at":"2026-05-29T15:49:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DRIFT achieves multi-turn RL performance via offline importance-weighted SFT by leveraging the equivalence of KL-regularized RL to weighted supervised learning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31377","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DynaTree: Dynamic Agentic Retrieval Tree for Time-Sensitive News Retrieval","primary_cat":"cs.IR","submitted_at":"2026-05-29T14:45:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DynaTree separates offline agentic tree construction from online subtree selection to deliver better recall, ranking, and production survival rates than standard or prior agentic RAG for news retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31251","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ERGeoBench:A Comprehensive Benchmark for Embodied Reasoning and Geo-localization in Multimodal Large Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-29T12:49:17+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ERGeoBench is a new diagnostic benchmark evaluating MLLMs on four capabilities in three progressive embodied geo-localization settings, finding that models handle high-level semantics but struggle with fine-grained perception and metric localization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31075","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Task-Focused Memorization for Multimodal Agents","primary_cat":"cs.CV","submitted_at":"2026-05-29T09:43:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"TaskMem uses RL in two phases to learn a task-focused memorization policy for multimodal agents, yielding 5.3-7.0% VQA accuracy gains on reformulated streaming benchmarks from VideoMME, EgoLife, and EgoTempo.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30931","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MineExplorer: Evaluating Open-World Exploration of MLLM Agents in Minecraft","primary_cat":"cs.CL","submitted_at":"2026-05-29T07:21:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MineExplorer is a new benchmark for MLLM agents' open-world exploration in Minecraft, using task filtering, ReAct formulation, and multi-agent synthesis to create reliable multi-hop instances.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30824","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Planner-Centric Reinforcement Learning for Deep Research with Structure-Aware Reward","primary_cat":"cs.AI","submitted_at":"2026-05-29T04:18:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DecomposeR represents research plans as typed DAGs and uses two-stage planner-then-answerer RL to improve long-form research performance by 5.1-8.0 points over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11232","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Every Act Has Its Price: Compressed Moral Composition in Frontier LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:36:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moral Trolley Arena shows frontier LLMs produce composite moral preferences that are compressed rather than additive functions of calibrated component act strengths across Moral Foundations Theory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30713","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Diversity Matters: Revisiting Test-Time Compute in Vision-Language Models","primary_cat":"cs.LG","submitted_at":"2026-05-29T01:06:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Entropy-based test-time compute (ETTC) in VLM ensembles outperforms majority voting by prioritizing high-confidence predictions from stronger models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00140","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Geometric Erasure by Contrastive Velocity Matching in Rectified Flows","primary_cat":"cs.LG","submitted_at":"2026-05-29T00:02:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"GEM bridges trajectory-based unlearning and teacher-guided erasure to create a geometric guidance objective for targeted concept suppression in Rectified Flow models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30561","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VLM3: Vision Language Models Are Native 3D Learners","primary_cat":"cs.CV","submitted_at":"2026-05-28T20:48:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Standard VLMs achieve expert-level 3D performance on depth estimation, pose estimation, and object understanding via three simple techniques without architecture changes or regression losses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30557","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seeing Isn't Knowing: Do VLMs Know When Not to Answer Spatial Questions (and Why)?","primary_cat":"cs.CV","submitted_at":"2026-05-28T20:44:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Frontier VLMs overconfidently answer spatial questions under occlusion (~30% accuracy) and perspective ambiguity (<10% accuracy) instead of abstaining, and often fail to select helpful additional views.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30311","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Archon: A Unified Multimodal Model for Holistic Digital Human Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:53:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Archon unifies seven modalities via modality-specific tokenizers and an autoregressive backbone pretrained on 72 tasks, plus a 4x-efficient video reparameterization and stepwise 'Thinking in Modality' procedure, and reports superior or comparable results on digital-human tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30307","ref_index":105,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Grounded 3D-Aware Spatial Vision-Language Modeling","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:51:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GR3D is a VLM that combines explicit 2D, implicit 2D, and monocular 3D grounding mechanisms to improve performance on spatial understanding benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30256","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VideoFDB: Evaluating Full-Duplex Vision-Speech Capabilities in Conversational Agents","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:20:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"VideoFDB is a new benchmark and LM-as-judge framework for evaluating full-duplex audio-visual-to-audio-visual conversational agents on nonverbal dynamics from real video calls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30248","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GenClaw: Code-Driven Agentic Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:13:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GenClaw introduces a three-stage code-driven workflow for agentic image generation that inserts programmatic sketches between linguistic reasoning and pixel synthesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30241","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CommunityFact: A Dynamic, Multilingual, Multi-domain Benchmark for Misinformation Detection in the Wild","primary_cat":"cs.CL","submitted_at":"2026-05-28T17:09:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CommunityFact provides a new dynamic benchmark showing web access improves LLM misinformation detection but source selection remains misaligned with human Community Notes raters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30161","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Far Looks Up: Probing Spatial Representation in Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-28T16:18:01+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VLMs exhibit consistent vertical-distance entanglement in embeddings from perspective bias in natural images, producing accuracy gaps that a new synthetic benchmark SpatialTunnel exposes as model-intrinsic.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30027","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DocRetriever: A Plug-and-Play Framework for Multimodal Document Retrieval with Comprehensive Benchmark","primary_cat":"cs.CV","submitted_at":"2026-05-28T14:50:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"DocRetriever introduces a framework using layout-aware sparse embeddings for hybrid encoding without OCR and a generalizable reasoning-augmented reranker for few-shot settings, plus the MultiDocR benchmark for evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29963","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Honeyval: A Comprehensive Evaluation Framework for LLM-powered HTTP Honeypots","primary_cat":"cs.CR","submitted_at":"2026-05-28T14:02:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Honeyval evaluates LLM HTTP honeypots with AI attackers and shows they produce longer interactions, lower detection rates, and cost advantages over rule-based baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29928","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Label Over Logic? How Source Cues Bias Human Fallacy Judgments More Than LLMs","primary_cat":"cs.HC","submitted_at":"2026-05-28T13:41:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Humans exhibit greater source-label bias in logical fallacy judgments than LLMs, which maintain more consistent evaluations regardless of source cues.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29833","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniMatBench: A Human-Calibrated Multimodal Reasoning Benchmark Across 19 Materials Science Subfields","primary_cat":"cs.AI","submitted_at":"2026-05-28T12:12:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniMatBench is a new human-calibrated benchmark for multimodal materials-science reasoning that reveals the best evaluated MLLM scores only 0.372 overall.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29734","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HTAM: Hierarchical Transition-Attended Memory for Operator Optimization","primary_cat":"cs.CL","submitted_at":"2026-05-28T10:29:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"HTAM builds a Hierarchical Transition Graph to organize coarse global directions and detailed local strategies for guiding LLM-based CUDA kernel optimization, improving results on KernelBench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29727","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bastion: Budget-Aware Speculative Decoding with Tree-structured Block Diffusion Drafting","primary_cat":"cs.LG","submitted_at":"2026-05-28T10:21:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BASTION is a budget-aware speculative decoding framework with adaptive tree-structured block diffusion drafting that reports up to 6.61x speedup and 39% improvement over block-diffusion baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29555","ref_index":27,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Blind Guess to Informed Judgment: Teaching LLMs to Evaluate Materials by Building Knowledge-Augmented Preference Signals","primary_cat":"cs.CL","submitted_at":"2026-05-28T08:09:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MaterEval generates paired informed and blind evaluations as preference signals to improve small open-source LLMs on high-entropy alloy assessment, approaching closed-source performance without external retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29523","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"K-FinHallu: A Hallucination Detection Benchmark for Multi-Turn RAG in Korean Finance","primary_cat":"cs.LG","submitted_at":"2026-05-28T07:40:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"K-FinHallu is the first multi-turn Korean financial RAG hallucination benchmark; frontier LLMs struggle especially on justified abstention while an 8B fine-tuned model reaches competitive performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29250","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniRetrieval: Unified Retrieval across Heterogeneous Knowledge Sources","primary_cat":"cs.CL","submitted_at":"2026-05-28T02:10:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniRetrieval dispatches natural-language queries to native engines across text, relational and graph sources without collapsing them into one representation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28480","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Audio-Mind: An Auditable Agentic Framework for Audio Understanding","primary_cat":"eess.AS","submitted_at":"2026-05-27T13:39:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Audio-Mind introduces a conditional, auditable agentic framework for audio understanding that preserves frontend judgment and acquires bounded external evidence only when needed, reporting 80.4% on MMAR and 82.8% on MSU-Bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23897","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ETCHR: Editing To Clarify and Harness Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:58:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A decoupled question-conditioned image editor trained via supervised imitation then VLM-reward enhancement improves MLLM visual reasoning Pass@1 by 4.6-5.5 points across models and tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23668","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OnePred: Next-Query Prediction via Recursive Intent Memory in Multi-Turn Conversations","primary_cat":"cs.CL","submitted_at":"2026-05-22T14:16:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OnePred maintains a recursively updated intent memory and uses two-stage RL to predict next queries, cutting token use by up to 22x while outperforming baselines on a new NQP-Bench dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23518","ref_index":10,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VINS-120K: Ultra High-Resolution Image Editing with A Large-Scale Dataset","primary_cat":"cs.CV","submitted_at":"2026-05-22T11:33:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VINS-120K supplies the first large-scale set of instruction-image-edited-image triplets at ultra-high resolution together with an adaptation strategy that improves detail synthesis.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}