{"total":204,"items":[{"citing_arxiv_id":"2606.26563","ref_index":31,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"scBench-Long: Verifiable Benchmarking of Long-Horizon Single-Cell Biology","primary_cat":"q-bio.GN","submitted_at":"2026-06-25T03:21:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"scBench-Long is a benchmark with 21 evaluations where the strongest AI model-harness pair succeeds on 25.4% of long-horizon single-cell biology tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.22748","ref_index":154,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AI Fiction in the Wild","primary_cat":"cs.CL","submitted_at":"2026-06-22T01:29:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Analysis of 500k ChatGPT logs shows over one-third of conversations generate fiction, dominated by power users with repetitive and niche patterns.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11522","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Search Discipline for Long-Horizon Research Agents","primary_cat":"cs.AI","submitted_at":"2026-06-09T23:55:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Aggregate metrics in research agents can invert rankings when validity is disaggregated, demonstrated on an ecosystem model task, motivating an external audit protocol over agent self-decision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.09637","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic Persona Generation with Critique-Refinement: An Industrial Evaluation","primary_cat":"cs.SE","submitted_at":"2026-06-08T15:34:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PerGent, an agentic critique-refinement system for persona generation, reaches 96.9% expert approval in an industrial evaluation at Kinaxis and reproduces more pre-LLM expert content than single-shot baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07006","ref_index":51,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RASFT: Rollout-Adaptive Supervised Fine-Tuning for Reasoning","primary_cat":"cs.LG","submitted_at":"2026-06-05T07:52:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RASFT is an adaptive SFT method that strengthens or relaxes expert imitation per problem based on on-policy rollout solvability and adds clipped reference-policy ratio to limit drift, reporting better results than standard SFT and RL on math and code benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06315","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LLM Self-Recognition: Steering and Retrieving Activation Signatures","primary_cat":"cs.AI","submitted_at":"2026-06-04T15:54:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Steering LLM residual streams with random sparse vectors creates detectable self-recognition fingerprints that enable over 98% accurate attribution of generated text to specific models without degrading output quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01196","ref_index":52,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Low-Resource Safety Failures Are Action Failures, Not Representation Failures","primary_cat":"cs.CL","submitted_at":"2026-05-31T12:19:40+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Low-resource safety failures are action failures because the harmfulness representation transfers but the decision calibration does not; this is fixed by recalibrating a high-resource gate with 1-4 target-language examples.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01136","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Outliers to Errors: Auditing Pali-to-English LLM Translations with Multi-Reference Adjudication","primary_cat":"cs.CL","submitted_at":"2026-05-31T10:15:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A multi-reference audit framework for LLM translations of the Pali Canon uses embedding drift from a human reference centroid to triage candidates for LLM-judge adjudication, showing drift correlates with major error rates and model-specific differences in the high-drift tail.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.24894","ref_index":68,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"RWGBench: Evaluating Scholarly Positioning in Related Work Generation","primary_cat":"cs.DL","submitted_at":"2026-05-30T16:53:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RWGBench is a citation-centric benchmark for related work generation built from 40k CS papers and a 100-paper test set, with multi-dimensional metrics that better match human expert judgment than standard similarity scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31393","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Target-Side Paraphrase Augmentation for Sign Language Translation with Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-29T14:58:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM-generated target-side paraphrases improve BLEU-4 from 9.56 to 10.33 on PHOENIX14T for a pose-based Transformer in sign language translation, with limits observed on other datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.31080","ref_index":38,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Pilot Study on Curator-Guided Multilingual Art Description for Blind and Low-Vision Audiences with Small Vision-Language Models","primary_cat":"cs.MM","submitted_at":"2026-05-29T09:47:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pilot evaluation of language-specific versus multilingual LoRA adapters on Qwen2.5-VL-3B for curator-guided BLV art descriptions in three languages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30589","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ImmigrationQA: A Source-Grounded Dataset and Small-Model Adaptation for U.S. Immigration Law","primary_cat":"cs.CL","submitted_at":"2026-05-28T21:36:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A new source-grounded QA dataset for U.S. immigration law is built from official documents and used to fine-tune a 3B model, yielding a 27% mean score improvement over the base model on a held-out sample.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30448","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Bounded Behavioral Indistinguishability for Black-Box LLM Distillation","primary_cat":"cs.LG","submitted_at":"2026-05-28T18:19:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces (ε,q,t,A)-behavioral indistinguishability and shows via Qwen/Llama experiments that LoRA distillation boosts semantic similarity but leaves detectable behavioral differences under adversarial evaluation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30415","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Domain Adaptation and Reasoning Frameworks in Language Models: A Controlled Experiment with Historical Cosmology","primary_cat":"cs.CL","submitted_at":"2026-05-28T18:00:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Fine-tuning on historical cosmology data reshapes language model explanatory frameworks, leading to stance changes as a secondary effect from regime redistribution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29659","ref_index":44,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Opir: Efficient Multi-Task Safety Classification for Toxicity, Jailbreaks, Hate Speech, and Harmful Content","primary_cat":"cs.LG","submitted_at":"2026-05-28T09:21:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Opir introduces efficient multi-task encoder models trained on a 996-category safety taxonomy that match or exceed larger baselines on most safety benchmarks while using under 100M parameters for edge variants.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29522","ref_index":24,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DeepSurvey: Enhancing Analytical Depth and Citation Reliability in Automated Survey Generation","primary_cat":"cs.AI","submitted_at":"2026-05-28T07:40:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"UNKNOWN","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DeepSurvey introduces an agentic system for automated survey generation that improves depth through full-text keynotes, cross-paper clustering, and code analysis, while boosting citation reliability via graph expansion, hybrid filtering, and evidence-constrained assignment, with reported gains over ","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29343","ref_index":29,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Draft-OPD: On-Policy Distillation for Speculative Draft Models","primary_cat":"cs.CL","submitted_at":"2026-05-28T04:30:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Draft-OPD applies on-policy distillation via target-assisted generation and error replay to train speculative draft models, yielding over 5x lossless acceleration and gains over EAGLE-3 and DFlash.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.23497","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Asking For An Old Friend: Diagnosing and Mitigating Temporal Failure Modes in LLM-based Statutory Question Answering","primary_cat":"cs.CL","submitted_at":"2026-05-22T11:02:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs show severe staleness after training cutoffs and recency bias on historical German statutes; RAG with version filtering mitigates both better than web search.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.22089","ref_index":61,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"LVDrive: Latent Visual Representation Enhanced Vision-Language-Action Autonomous Driving Model","primary_cat":"cs.CV","submitted_at":"2026-05-21T07:31:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"LVDrive improves closed-loop driving on Bench2Drive by adding latent future scene prediction to VLA models via unified embedding space processing and two-stage trajectory decoding.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21622","ref_index":49,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"TO-Agents: A Multi-Agent AI Pipeline for Preference-Guided Topology Optimization","primary_cat":"cs.AI","submitted_at":"2026-05-20T18:32:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A multi-agent pipeline iteratively refines topology optimization outputs to match natural language preferences for branched structures, achieving 60% success rate across replicates in cantilever and phone-stand tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21347","ref_index":26,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Insights Generator: Systematic Corpus-Level Trace Diagnostics for LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-20T16:13:53+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20809","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Refining and Reusing Annotation Guidelines for LLM Annotation","primary_cat":"cs.CL","submitted_at":"2026-05-20T07:03:42+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"An iterative moderation framework refines and reuses annotation guidelines to improve LLM annotation accuracy on biomedical NER tasks across GPT, Gemini, and DeepSeek models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20506","ref_index":59,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reinforcing Human Behavior Simulation via Verbal Feedback","primary_cat":"cs.LG","submitted_at":"2026-05-19T21:23:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DITTO uses RL with verbal feedback to train LLMs for human behavior simulation, reporting 36% average gains over base models and outperforming GPT-5.4 on 6 of 10 SOUL benchmark tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20173","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Methodology for Selecting and Composing Runtime Architecture Patterns for Production LLM Agents","primary_cat":"cs.AI","submitted_at":"2026-05-19T17:54:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces the stochastic-deterministic boundary (SDB) as a load-bearing primitive for LLM agent runtimes and provides a five-step methodology plus catalog of six patterns adapted from distributed systems.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20312","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Pramana: A Protocol-Layer Treatment of Claim Verification in Autonomous Agent Networks","primary_cat":"cs.CR","submitted_at":"2026-05-19T17:00:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pramana defines a typed ClaimAttestation protocol with four variants and verify operations, specifies its lifecycle in TLA+, model-checks it with TLC, and provides a tested Python implementation for auditable agent claims.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20033","ref_index":20,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Nash Equilibrium Framework For Training-Free Multimodal Step Verification","primary_cat":"cs.CV","submitted_at":"2026-05-19T15:54:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A Nash equilibrium framework for training-free multimodal step verification that uses cross-modal agreement and disagreement signals for filtering and ranking reasoning steps.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19394","ref_index":40,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"EmbGen: Teaching with Reassembled Corpora","primary_cat":"cs.CL","submitted_at":"2026-05-19T05:40:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"EmbGen creates synthetic QA data by entity decomposition, embedding-based reassembly into clusters, and multi-level sampling with cluster-specific prompts, yielding up to 88.9% higher Binary Accuracy than baselines on heterogeneous datasets under fixed token budgets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19193","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Sequential Consensus for Multi-Agent LLM Debates: A Wald-SPRT compute governor with calibration-based failure detection","primary_cat":"cs.LG","submitted_at":"2026-05-18T23:43:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Adapts SPRT as a compute governor for multi-agent LLM debates using Beta-modeled consensus scores from an LLM judge, yielding 3.7x call reduction on GSM8K at -2pp accuracy versus fixed rounds.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19099","ref_index":57,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"DecisionBench: A Benchmark for Emergent Delegation in Long-Horizon Agentic Workflows","primary_cat":"cs.AI","submitted_at":"2026-05-18T20:37:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"DecisionBench supplies a fixed task suite, model pool, delegation interface, and multi-axis metrics to evaluate emergent delegation, showing similar quality across awareness conditions but 15-31 point headroom under perfect delegation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.18630","ref_index":83,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SCICONVBENCH: Benchmarking LLMs on Multi-Turn Clarification for Task Formulation in Computational Science","primary_cat":"cs.AI","submitted_at":"2026-05-18T16:34:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"SCICONVBENCH is a new benchmark evaluating LLMs on multi-turn disambiguation and inconsistency resolution for task formulation in computational science, with frontier models reaching only 52.7% success on fluid mechanics disambiguation cases.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17774","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Internalizing Tool Knowledge in Small Language Models via QLoRA Fine-Tuning","primary_cat":"cs.CL","submitted_at":"2026-05-18T02:48:46+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17382","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"QQJ: Quantifying Qualitative Judgment for Scalable and Human-Aligned Evaluation of Generative AI","primary_cat":"cs.AI","submitted_at":"2026-05-17T10:53:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"QQJ is an evaluation framework that anchors LLM judges in expert rubrics and calibrates them on small high-quality annotation sets to improve alignment with human judgment on generative tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17342","ref_index":107,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Transitivity Meets Cyclicity: Explicit Preference Decomposition for Dynamic Large Language Model Alignment","primary_cat":"cs.CL","submitted_at":"2026-05-17T09:27:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces HRC model for game-theoretic decomposition of preferences into orthogonal transitive and cyclic components, paired with DSPPO for dynamic Nash-seeking alignment, reporting gains over BT and GPM baselines on RewardBench and downstream LLM evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17214","ref_index":48,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"ChemVA: Advancing Large Language Models on Chemical Reaction Diagrams Understanding","primary_cat":"cs.AI","submitted_at":"2026-05-17T01:12:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ChemVA framework uses hybrid-granularity visual anchors and entity-name alignment to improve LLM performance on chemical reaction diagrams by ~20 points, reaching 92% structural accuracy on the new OCRD-Bench dataset.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17173","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Why Do Safety Guardrails Degrade Across Languages?","primary_cat":"cs.CL","submitted_at":"2026-05-16T22:08:54+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A latent variable IRT framework decouples four safety-driving factors across 61 model configurations and 10 languages using 1.9 million evaluations, revealing that safety is largely unidimensional and that high cross-lingual gaps cluster in physical harm prompts and lower-resource languages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17041","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Agentic AI Translate: An Agentic Translator Prototype for Translation as Communication Design","primary_cat":"cs.CL","submitted_at":"2026-05-16T15:21:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Describes a conceptual agentic prototype for AI translation that operationalizes skopos theory and GEMBA-MQM verification into a four-stage cycle with user dialogue and memory for coherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16712","ref_index":13,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Recall Isn't Enough: Bounding Commitments in Personalized Language Systems","primary_cat":"cs.AI","submitted_at":"2026-05-15T23:50:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CBEA with LCV bounds evidence sets and validates commitments before response generation, achieving zero failures in scoped tests at 0.49-0.60 availability versus near-zero for baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16551","ref_index":23,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"PQR: A Framework to Generate Diverse and Realistic User Queries that Elicit QA Agent Failures","primary_cat":"cs.CL","submitted_at":"2026-05-15T18:50:43+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15865","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"From Text to DSL: Evaluating Grammar-Based Model Generation Using Open LLMs","primary_cat":"cs.SE","submitted_at":"2026-05-15T11:33:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Compact open-source LLMs can produce syntactically valid, semantically complete, and inter-model consistent DSL models from text via few-shot prompting, with some 7B-12B models matching much larger ones in quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16462","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Asking Back: Interaction-Layer Antidistillation Watermarks","primary_cat":"cs.CR","submitted_at":"2026-05-15T08:28:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Interaction-layer antidistillation watermarks use system-prompt-induced behavioral markers like explicit follow-up questions that transfer to distilled student models at 45-89% relative fidelity and can be audited via black-box LLM-as-judge queries.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15482","ref_index":8,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"FINESSE-Bench: A Hierarchical Benchmark Suite for Financial Domain Knowledge and Technical Analysis in Large Language Models","primary_cat":"cs.CL","submitted_at":"2026-05-14T23:53:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"FINESSE-Bench is a new hierarchical benchmark suite combining certification-style exams, trading tasks, and a Russian olympiad set to evaluate LLMs on financial competencies at multiple difficulty levels.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15365","ref_index":80,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Greedy or not, here I come: Language production under vocabulary constraints in humans and resource-rational models","primary_cat":"cs.CL","submitted_at":"2026-05-14T19:45:02+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Humans produce language more like greedy local choices than globally optimal planning when vocabulary is tightly constrained, with skilled speakers showing more revision.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15343","ref_index":34,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Belief Engine: Configurable and Inspectable Stance Dynamics in Multi-Agent LLM Deliberation","primary_cat":"cs.AI","submitted_at":"2026-05-14T19:13:12+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Belief Engine is a configurable belief-update mechanism for multi-agent LLM systems that uses structured argument extraction and log-odds stance updates to make evidence-grounded deliberation inspectable and controllable.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15300","ref_index":30,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Deep Pre-Alignment for VLMs","primary_cat":"cs.CV","submitted_at":"2026-05-14T18:14:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Deep Pre-Alignment uses a small VLM perceiver instead of ViT to pre-align visual features with LLM text space, yielding 1.9-3.0 point gains on multimodal benchmarks and 32.9% less language forgetting.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14890","ref_index":22,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Tokenizer Fertility and Zero-Shot Performance of Foundation Models on Ukrainian Legal Text: A Comparative Study","primary_cat":"cs.CL","submitted_at":"2026-05-14T14:35:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Tokenizer fertility varies 1.6x across models on Ukrainian legal text, Qwen uses 60% more tokens than Llama-family models, zero-shot outperforms few-shot by up to 26 points, and pre-war classifiers lose 27.9 points on invasion-era decisions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14062","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Know When To Fold 'Em: Token-Efficient LLM Synthetic Data Generation via Multi-Stage In-Flight Rejection","primary_cat":"cs.AI","submitted_at":"2026-05-13T19:35:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MSIFR stops faulty LLM generations early via staged rule-based checks, reducing token consumption 11-78% with no accuracy loss.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14057","ref_index":40,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Dual Hierarchical Dialogue Policy Learning for Legal Inquisitive Conversational Agents","primary_cat":"cs.CL","submitted_at":"2026-05-13T19:29:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A dual hierarchical RL framework with two agents coordinates high-level dialogue strategy and low-level question generation to emulate judicial questioning and extract key information from Supreme Court arguments, outperforming baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16411","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Reducing Hallucination in Vision-Language Models via Stage-wise Preference Optimization under Distribution Shift","primary_cat":"cs.CV","submitted_at":"2026-05-13T15:37:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Stage-wise DPO constructs hallucination-focused preference pairs near failure boundaries to improve visual grounding in VLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16410","ref_index":16,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Test-Time Hinting for Black-Box Vision-Language Models","primary_cat":"cs.CV","submitted_at":"2026-05-13T14:35:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Test-Time Hinting trains a hint generator to prepend contextual guidance to VLM prompts, improving accuracy on natural-image VQA benchmarks with generalization to unseen tasks and models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.13139","ref_index":43,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"SWE-Cycle: Benchmarking Code Agents across the Complete Issue Resolution Cycle","primary_cat":"cs.SE","submitted_at":"2026-05-13T08:05:16+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SWE-Cycle benchmark shows sharp drops in code agent success rates from isolated tasks to full autonomous issue resolution, highlighting cross-phase dependency issues.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}