{"total":764,"items":[{"citing_arxiv_id":"2606.25561","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CrypFormBench: Benchmarking Formal Analysis Capability of Large Language Models for Cryptographic Schemes","primary_cat":"cs.CR","submitted_at":"2026-06-24T08:37:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CrypFormBench is a new benchmark jointly covering symbolic and computational security to evaluate LLMs on five formal analysis capabilities, with results showing top model Claude-3.5 scores 48.7/100 and most models struggling on generation, transformation, and correction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20728","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VTOS: Learning to Orchestrate Vision Tools by Co-Searching Solutions and Observers","primary_cat":"cs.CV","submitted_at":"2026-06-17T04:52:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"VTOS jointly searches solution and observer programs to adaptively orchestrate vision tools, outperforming static pipelines on dense object counting and zero-shot plant disease segmentation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.26130","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking Like a Scientist? A Structural Study of LLM-Generated Research Methods","primary_cat":"cs.CL","submitted_at":"2026-06-15T14:53:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLMs given only research questions from 1000 arXiv CS papers recommend a narrower set of methods than the original papers, with effective model-entity diversity dropping from 1232 to 59-96 and stronger agreement among LLMs than with papers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08620","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SPA: A SQL-Plan-Aware Reinforcement Learning Framework for Query Rewriting with LLMs","primary_cat":"cs.DB","submitted_at":"2026-06-07T13:12:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SPA trains LLMs via plan-aware RL with adaptive reward shaping and self-improvement on slowdowns to produce faster query rewrites than rule-based or standard LLM methods on IID and OOD workloads.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04340","ref_index":72,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Noisy memory encoding explains negative polarity illusions","primary_cat":"cs.CL","submitted_at":"2026-06-03T01:45:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Noisy memory encoding of determiners explains negative polarity illusions, with new acceptability experiments showing stronger illusions for similar determiner pairs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01481","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SafeGen-Bench: Benchmarking Safety in Image-Conditioned Text-to-Video Generation","primary_cat":"cs.CV","submitted_at":"2026-05-31T22:46:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SafeGen-Bench is a benchmark with 10 malicious categories that evaluates conditional T2V models on paired start frames and text prompts, finding unsafety scores up to 44.5 and 80% guardrail failure rate.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01394","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"UniD$^3$: A Knowledge Graph-Enhanced RAG Framework for Drug-Disease Discovery and Reasoning","primary_cat":"cs.CL","submitted_at":"2026-05-31T18:36:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"UniD³ applies KG-RAG with Llama 3.3-70B to build six knowledge graphs and generate large validated datasets for drug-disease matching, effectiveness assessment, and target analysis from biomedical literature.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01247","ref_index":50,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Where to Look: Can Foundation Models Reach a Target Viewpoint Through Active Exploration?","primary_cat":"cs.CV","submitted_at":"2026-05-31T14:00:10+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Introduces the TVR active viewpoint-matching task and TVRBench indoor simulation benchmark, where foundation models start at low single-digit success rates but reach 51.4% after visual-action SFT and multi-turn GRPO post-training.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01016","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PolySpeech-100: A Large-Scale Benchmark for Speech Understanding Across 100+ Languages and Dialects","primary_cat":"cs.CL","submitted_at":"2026-05-31T05:13:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PolySpeech-100 is a new benchmark for native-level speech comprehension across 110 linguistic variants that evaluates 22 models and reports E2E advantages on dialects, robustness gaps on low-resource languages, and degradation from Chain-of-Thought prompting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00592","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Through the PRISM: Principle-Aware, Interpretable, and Multi-Scale Evaluation of Visual Designs","primary_cat":"cs.CV","submitted_at":"2026-05-30T07:44:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PRISM benchmark perturbs Crello layouts into 110K samples isolating design principle violations, reveals limited sensitivity in several multimodal models, and proposes a multi-scale framework combining scorers, instruction-tuned VLMs, and prompt methods for interpretable design assessment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00570","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Revisiting Parameter-Based Knowledge Editing in Large Language Models: Theoretical Limits and Empirical Evidence","primary_cat":"cs.CL","submitted_at":"2026-05-30T06:44:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Parameter-based knowledge editing in LLMs induces reasoning collapse via dimensional collapse and is consistently outperformed by a retrieval baseline across varied edit counts, knowledge complexity, and evaluation metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00562","ref_index":115,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepLatent: Think with Images via Parallel Latent Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-30T06:33:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DeepLatent introduces a parallel latent visual reasoning framework with learnable 2D tokens and continuous RL, trained via distillation then RL, plus a new 180K dataset, claiming SOTA benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00510","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Skill or Skip? Learning Selective Skill Invocation in Agentic Tasks via Dual-Granularity Preference Learning","primary_cat":"cs.CL","submitted_at":"2026-05-30T04:00:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SelSkill applies dual-granularity preference learning to selective skill-or-skip decisions, improving task success by 10.9 points and execution precision by 29.1 points on ALFWorld with Qwen3-8B.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31529","ref_index":54,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SVI-Bench: A Dynamic Microworld for Strategic Video Intelligence","primary_cat":"cs.CV","submitted_at":"2026-05-29T16:43:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SVI-Bench is a 35K-hour sports video benchmark with 9 tasks across four cognitive pillars that reveals multimodal models drop from ~73% on action QA to 5% on agentic evidence-gathering tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31272","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Algorithmic Recourse of In-Context Learning for Tabular Data","primary_cat":"cs.LG","submitted_at":"2026-05-29T13:04:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"The paper delivers the first theoretical analysis and practical zeroth-order framework for algorithmic recourse under in-context learning for tabular prediction.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30961","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoGens: A Population-Based Heuristic Search Framework for Scientific Idea Generation","primary_cat":"cs.CL","submitted_at":"2026-05-29T07:56:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"EvoGens uses rank-based mutation, semantic-aware crossover, and lightweight evaluation to evolve populations of LLM-generated scientific ideas, boosting novelty and diversity metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30884","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"GUI-C$^2$: Coarse-to-Fine GUI Grounding via Difficulty-Aware Reinforcement Learning","primary_cat":"cs.CV","submitted_at":"2026-05-29T06:17:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"GUI-C² pairs a difficulty-scoring data pipeline with an area-gated coarse-to-fine RL mechanism to improve GUI grounding accuracy and training stability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00152","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PrivacyPeek: Auditing What LLM-Based Agents Acquire, Not Just What They Say","primary_cat":"cs.CR","submitted_at":"2026-05-29T04:55:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"PrivacyPeek is a benchmark with 1,182 cases across 7 acquisition behaviors and 16 domains that evaluates acquisition-stage privacy leakage in LLM agents, finding it widespread with limited prompt mitigation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00148","ref_index":36,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"StemBind: When MLLMs Get Lost Between Rules and Instances in Abstract Visual Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-29T03:20:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"StemBind benchmark diagnoses MLLM failures in abstract visual reasoning by separating perception, rule induction, and answer selection on shared stems, finding a persistent rule-to-instance binding gap even when perception and rule are correct.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11232","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Every Act Has Its Price: Compressed Moral Composition in Frontier LLMs","primary_cat":"cs.CL","submitted_at":"2026-05-29T02:36:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moral Trolley Arena shows frontier LLMs produce composite moral preferences that are compressed rather than additive functions of calibrated component act strengths across Moral Foundations Theory.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30717","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Neuron-Level Interventions for Gendered and Gender-Neutral Generation in Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-29T01:21:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Proposes a neuron-level intervention method to locate and control gender-specific neurons across feminine, masculine, and neutral categories in LMs, achieving precise steering with less leakage than prior approaches.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30346","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"YoCausal: How Far is Video Generation from World Model? A Causality Perspective","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:59:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"YoCausal benchmark shows video diffusion models detect the arrow of time but lack genuine causal understanding relative to humans.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30256","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"VideoFDB: Evaluating Full-Duplex Vision-Speech Capabilities in Conversational Agents","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:20:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"VideoFDB is a new benchmark and LM-as-judge framework for evaluating full-duplex audio-visual-to-audio-visual conversational agents on nonverbal dynamics from real video calls.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30231","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond 3D VQAs: Injecting 3D Spatial Priors into Vision-Language Models for Enhanced Geometric Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:00:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GASP injects geometric priors into VLMs via a deep-supervised correspondence head trained on video point correspondences and depth consistency, raising internal matching accuracy and delivering gains on spatial benchmarks without any 3D VQA data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30031","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Audio Jailbreaks in Large Audio-Language Models: Taxonomy, Attack-Defense Analysis, and Cost-Aware Evaluation","primary_cat":"cs.SD","submitted_at":"2026-05-28T14:53:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Organizes audio jailbreaks into semantic/acoustic/signal/embedding categories, evaluates representative attacks and defenses on ten LALMs with success rate plus latency and benign refusal, and concludes that acoustic attacks are potent while defenses trade robustness for usability.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29847","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EvoRubric: Self-Evolving Rubric-Driven RL for Open-Ended Generation","primary_cat":"cs.CL","submitted_at":"2026-05-28T12:28:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EvoRubric is a single-policy RL method that co-evolves a reasoner and a rubric generator with multi-level verification to produce dynamic rewards for open-ended LLM alignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29833","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"OmniMatBench: A Human-Calibrated Multimodal Reasoning Benchmark Across 19 Materials Science Subfields","primary_cat":"cs.AI","submitted_at":"2026-05-28T12:12:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"OmniMatBench is a new human-calibrated benchmark for multimodal materials-science reasoning that reveals the best evaluated MLLM scores only 0.372 overall.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29667","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Beyond English and Evasion: A Human-Annotated Multi-Domain Benchmark for High-Stakes LLM Safety Evaluation in Chinese","primary_cat":"cs.CL","submitted_at":"2026-05-28T09:28:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces ChiSafe-PAS, a 1,897-prompt human-annotated Chinese adversarial benchmark for LLM safety with 3-class labels, 9-category obfuscation taxonomy, and domain coverage in self-harm, drugs, fraud, and satire.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29555","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Blind Guess to Informed Judgment: Teaching LLMs to Evaluate Materials by Building Knowledge-Augmented Preference Signals","primary_cat":"cs.CL","submitted_at":"2026-05-28T08:09:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MaterEval generates paired informed and blind evaluations as preference signals to improve small open-source LLMs on high-entropy alloy assessment, approaching closed-source performance without external retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29523","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"K-FinHallu: A Hallucination Detection Benchmark for Multi-Turn RAG in Korean Finance","primary_cat":"cs.LG","submitted_at":"2026-05-28T07:40:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"K-FinHallu is the first multi-turn Korean financial RAG hallucination benchmark; frontier LLMs struggle especially on justified abstention while an 8B fine-tuned model reaches competitive performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29463","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Honest Lying: Understanding Memory Confabulation in Reflexive Agents","primary_cat":"cs.LG","submitted_at":"2026-05-28T06:56:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Reflexive agents confabulate incorrect task interpretations in memory, detected via Reflection Repetition Rate metric, with a programmatic mitigation raising correct object mentions from 0% to 86% in frozen ALFWorld cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29447","ref_index":1,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Recovering Policy-Induced Errors: Benchmarking and Trajectory Synthesis for Robust GUI Agents","primary_cat":"cs.CV","submitted_at":"2026-05-28T06:40:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces GUI-RobustEval benchmark and RoTS synthesis framework to train GUI agents on error recovery, with RoTS-32B reaching 47.4% success on OSWorld.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29421","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Learning Design Skills as Memory Policies for Agentic Photonic Inverse Design","primary_cat":"cs.CL","submitted_at":"2026-05-28T06:14:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SkillPCF is a closed-loop agent framework with a physics-guided memory skill bank, reinforcement-learned skill selection, and simulator-grounded evolution that improves design quality and efficiency for photonic crystal fiber inverse design under limited simulation budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29390","ref_index":21,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Orthogonal Negative Guidance in Attention Feature Space for Text-to-Image Generation","primary_cat":"cs.CV","submitted_at":"2026-05-28T05:43:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Orthogonal Negative Guidance subtracts only the orthogonal component of negative-prompt attention features from positive ones in FLUX models to suppress concepts while preserving semantics and quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29262","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Harmonizing Real-Time Constraints and Long-Horizon Reasoning: An Asynchronous Agentic Framework for Dynamic Scheduling","primary_cat":"cs.AI","submitted_at":"2026-05-28T02:26:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RACE-Sched is an asynchronous dual-stream agent framework combining low-latency symbolic heuristics with parallel LLM-based rule synthesis and sandbox validation for dynamic flexible job shop scheduling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28480","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Audio-Mind: An Auditable Agentic Framework for Audio Understanding","primary_cat":"eess.AS","submitted_at":"2026-05-27T13:39:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Audio-Mind introduces a conditional, auditable agentic framework for audio understanding that preserves frontend judgment and acquires bounded external evidence only when needed, reporting 80.4% on MMAR and 82.8% on MSU-Bench.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28175","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mixture-of-Experts Knowledge Graph Retrieval-Augmented Generation for Multi-Agent LLM-based Recommendation","primary_cat":"cs.IR","submitted_at":"2026-05-27T08:56:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MixRAGRec is a multi-agent KG-RAG framework with an MoE retrieval agent for query-specific granularity, a knowledge alignment agent, and a contrastive recommendation agent trained jointly via MMAPO.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27918","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Addressing Variable Heterogeneity in Distributed Multimodal Training with Entrain","primary_cat":"cs.DC","submitted_at":"2026-05-27T03:44:27+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Entrain reduces microbatch workload variability by up to 10.6x and improves multimodal LLM training throughput by 1.4x via static model parallelism and deferred hierarchical microbatch assignment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23897","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ETCHR: Editing To Clarify and Harness Reasoning","primary_cat":"cs.CV","submitted_at":"2026-05-22T17:58:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A decoupled question-conditioned image editor trained via supervised imitation then VLM-reward enhancement improves MLLM visual reasoning Pass@1 by 4.6-5.5 points across models and tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23826","ref_index":9,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Decomposing Queries into Tool Calls for Long-Video Keyframe Retrieval","primary_cat":"cs.CV","submitted_at":"2026-05-22T16:29:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"ToolMerge decomposes queries into LLM-planned tool calls merged by boolean operators for long-video keyframe retrieval and introduces the M2M benchmark, showing competitive results with 5% gains on caption retrieval.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23723","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"MemAudit: Post-hoc Auditing of Poisoned Agent Memory via Causal Attribution and Structural Anomaly Detection","primary_cat":"cs.AI","submitted_at":"2026-05-22T15:03:13+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MemAudit combines counterfactual causal influence scores with memory consistency graphs to identify poisoned records in LLM agent memory, reducing MINJA attack success from 70% to 0% in QA and 83.3% to 0% in reasoning tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23655","ref_index":5,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CVSearch: Empowering Multimodal LLMs with Cognitive Visual Search for High-Resolution Image Perception","primary_cat":"cs.CV","submitted_at":"2026-05-22T14:07:44+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CVSearch proposes an Assess-then-Search workflow combining expert-assisted search with Semantic Guided Adaptive Patching and Dynamic Bottom-Up Search to improve efficiency and accuracy on high-resolution image tasks for MLLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23559","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PathNavigate: A Training-Free Pathology Agent with Surprise-Guided Scan and Shared Slide Memory for Whole-Slide Image VQA","primary_cat":"cs.CV","submitted_at":"2026-05-22T12:25:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PathNavigate introduces a scan-search-readout routine with surprise-guided low-mag scanning and shared slide memory to improve training-free WSI-VQA accuracy and efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23344","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CHASD: Language Increment-Calibrated Contrastive Decoding against Hallucination in LVLMs","primary_cat":"cs.CV","submitted_at":"2026-05-22T08:03:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CHASD is an inference-time framework that gates contrastive decoding via an uncertainty threshold and constructs negative branches through attention-guided perturbations of salient visual tokens to mitigate hallucinations in LVLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23176","ref_index":88,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DRIVESPATIAL: A Benchmark for Spatiotemporal Intelligence in VLMs for Autonomous Driving","primary_cat":"cs.CV","submitted_at":"2026-05-22T02:52:06+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22823","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Which Way Did It Move? Diagnosing and Overcoming Directional Motion Blindness in Video-LLMs","primary_cat":"cs.CV","submitted_at":"2026-05-21T17:59:56+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Video-LLMs exhibit directional motion blindness from a direction binding gap; DeltaDirect projector objective lifts synthetic accuracy to 85.4% and real accuracy by 21.9 points while preserving other video capabilities.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22819","ref_index":41,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Cambrian-P: Pose-Grounded Video Understanding","primary_cat":"cs.CV","submitted_at":"2026-05-21T17:59:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Cambrian-P adds per-frame camera pose tokens and a regression head to video MLLMs, delivering 4.5-6.5% gains on spatial benchmarks, generalization to other video QA tasks, and SOTA streaming pose estimation on ScanNet.","context_count":1,"top_context_role":"baseline","top_context_polarity":"baseline","context_text":"†indicates this Cambrian-Sis fine-tuned only on VSI-590K. Model LM Numerical Answer Multiple-Choice Answer Avg.Obj. Count Abs. Dist. Obj. Size Room SizeRel. Dist. Rel. Dir. Route Plan Appr. Order BaselinesChance Level (Random) - - - - - - 25.0 36.1 28.3 25.0Chance Level (Frequency) - 34.0 62.1 32.0 29.9 33.1 25.1 47.9 28.4 25.2 General-purpose ModelsGPT-4o [41] Unk. 34.0 46.2 5.3 43.8 38.2 37.0 41.3 31.5 28.5Gemini-2.5 Pro [25] Unk. 51.5 43.8 34.9 64.3 42.8 61.1 47.8 45.9 71.3Qwen2.5VL-7B [8] Qwen2.5-7B29.3 25.2 10.5 36.4 29.6 38.4 38.0 29.8 26.8InternVL-3 8B [129] Qwen2.5-7B42.1 68.1 39.0 48.4 33.6 48.3 36.4 27.3 35.4InternVL-3.5 8B [129] Qwen3-8B56.3 - - - - - - - -Qwen3-VL 8B [7] Qwen3-8B 56.6 - - - - - - - -"},{"citing_arxiv_id":"2605.22681","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Forecasting Scientific Progress with Artificial Intelligence","primary_cat":"cs.AI","submitted_at":"2026-05-21T16:23:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces the CUSP benchmark across 4760 events and finds frontier AI models can pick plausible directions but fail to predict whether or when scientific advances will occur, with performance varying by domain and insensitive to training cutoffs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22678","ref_index":75,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Swift Sampling: Selecting Temporal Surprises via Taylor Series","primary_cat":"cs.CV","submitted_at":"2026-05-21T16:20:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Swift Sampling is a training-free frame selection method that uses Taylor expansions on video latent trajectories to pick temporally surprising frames, outperforming uniform sampling on long-video QA tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22654","ref_index":97,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seeing the Poem: Image-Semantic Detection of AI-Generated Modern Chinese Poetry with MLLMs","primary_cat":"cs.CL","submitted_at":"2026-05-21T15:57:04+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"An image-semantic guided method enhances MLLMs for detecting AI-generated modern Chinese poetry by combining poem text with visual representations of content, achieving 85.65% Macro-F1 with Gemini and outperforming text baselines and RoBERTa.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}