{"total":20,"items":[{"citing_arxiv_id":"2606.25462","ref_index":21,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Optimizing Abstractive Summarization With Fine-Tuned PEGASUS","primary_cat":"cs.CL","submitted_at":"2026-06-24T06:43:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Fine-tuned PEGASUS achieves state-of-the-art ROUGE scores on XL-Sum English corpus with 4.04% ROUGE-1, 15.25% ROUGE-2, and 3.39% ROUGE-L gains over mT5 baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.13111","ref_index":106,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"M\\\"OVE: A Holistic LLM Benchmark for the German Public Sector","primary_cat":"cs.CL","submitted_at":"2026-06-11T09:37:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MÖVE presents a new German-language benchmark evaluating 39 LLMs on performance and governance criteria using ten public-administration datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.12807","ref_index":50,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Detect, Remask, Repair: Diffusion Editing for Faithful Summarization of Evolving Contexts","primary_cat":"cs.CL","submitted_at":"2026-06-11T02:05:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Diffusion-based localized editing framework for faithful summarization of evolving contexts, introducing the StreamSum benchmark and showing tradeoffs in faithfulness, speed, and 
preservation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07951","ref_index":116,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"From `May' to `Is': Certainty Distortion in Language Model Rewriting","primary_cat":"cs.CL","submitted_at":"2026-06-06T02:53:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LMs systematically inflate expressed certainty during rewriting, affecting up to 75% of outputs with a 1.5-2x bias toward increasing rather than decreasing certainty, and the effect compounds over iterations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06315","ref_index":60,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"LLM Self-Recognition: Steering and Retrieving Activation Signatures","primary_cat":"cs.AI","submitted_at":"2026-06-04T15:54:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Steering LLM residual streams with random sparse vectors creates detectable self-recognition fingerprints that enable over 98% accurate attribution of generated text to specific models without degrading output quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05054","ref_index":144,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Boosting Self-Consistency with Ranking","primary_cat":"cs.CL","submitted_at":"2026-06-03T16:12:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"RISC reformulates self-consistency answer selection as a ranking task solved by a lightweight LambdaRank model with five hand-designed features, yielding better accuracy-efficiency 
trade-offs than majority voting on QA benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.04612","ref_index":30,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Hybrid Adversarial Defence for Natural Language Understanding Tasks","primary_cat":"cs.CL","submitted_at":"2026-06-03T08:49:15+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Hybrid entropy-uncertainty-geometric defence improves clean accuracy by up to 43% and adversarial robustness by up to 65% on NLU and security benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.03924","ref_index":1,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Knowledge Editing in Masked Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-02T17:14:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Locate-then-edit succeeds at the same early-to-mid MLP locations in masked diffusion models as in autoregressive models, but requires optimization over intermediate partial-mask states to handle multi-token targets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29336","ref_index":22,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Enhancing Factuality through Consensus and Consistency in Summarization Using Minimum Bayes Risk Decoding","primary_cat":"cs.CL","submitted_at":"2026-05-28T04:14:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ConSUM reranks candidate summaries using MBR consensus and source-consistency metrics to improve factuality over standard generation or reranking 
baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21993","ref_index":8,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ECPO: Evidence-Coupled Policy Optimization for Evidence-Certified Candidate Ranking","primary_cat":"cs.AI","submitted_at":"2026-05-21T04:42:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ECPO is a listwise policy optimization method that couples ranking utility with span-level evidence certificate validity and a deterministic verifier reward on MAVEN-ERE and RAMS datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.08590","ref_index":47,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Causal Stories from Sensor Traces: Auditing Epistemic Overreach in LLM-Generated Personal Sensing Explanations","primary_cat":"cs.HC","submitted_at":"2026-05-09T01:10:40+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LLMs routinely produce unsupported causal stories for personal sensing anomalies, and richer evidence or constrained prompts do not reliably eliminate this epistemic overreach.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"data into explanatory frames, rather than simply reading meaning directly from data [57]. This problem is amplified because sensed traces are partial, situated, and often only indirect proxies for lived experience [8, 15, 37]. At the same time, work on explanation shows that people often value explanations that are coherent, causal, and useful, even when those explanations simplify uncertainty or foreground selected causes [47]. 
This creates a particular risk for LLM-generated personal sensing explanations: the model may transform limited behavioral evidence into a fluent explanatory framing that feels personally meaningful, while obscuring the gap between what was observed and what can actually be inferred. We define this failure mode as epistemic overreach (EO): cases in which a generated explanation implies"},{"citing_arxiv_id":"2605.06006","ref_index":27,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"From Articles to Premises: Building PrimeFacts, an Extraction Methodology and Resource for Fact-Checking Evidence","primary_cat":"cs.CL","submitted_at":"2026-05-07T10:58:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"PrimeFacts extracts decontextualized premises from fact-check articles, raising evidence retrieval MRR by up to 30% and verdict prediction Macro-F1 by 10-20 points over baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19185","ref_index":22,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SCURank: Ranking Multiple Candidate Summaries with Summary Content Units for Enhanced Summarization","primary_cat":"cs.CL","submitted_at":"2026-04-21T07:51:05+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SCURank ranks multiple summary candidates with Summary Content Units to outperform ROUGE and LLM-based methods in summarization distillation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.07689","ref_index":26,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Stress Testing Factual Consistency Metrics for Long-Document 
Summarization","primary_cat":"cs.CL","submitted_at":"2025-11-10T23:24:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Short-form factual consistency metrics produce inconsistent scores on semantically equivalent long-document summaries and lose reliability on information-dense claims.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.05080","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Making Knowledge Accessible: Divergent Readability-Accuracy Strategies of Mistral and QWen in Biomedical Text Simplification","primary_cat":"cs.CL","submitted_at":"2025-11-07T08:53:39+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Mistral uses careful lexical simplification to raise readability while keeping BERTScore at 0.91 comparable to humans, whereas QWen improves readability but shows a disconnect with its 0.89 BERTScore in biomedical text simplification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.03568","ref_index":188,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Agent AI: Surveying the Horizons of Multimodal Interaction","primary_cat":"cs.AI","submitted_at":"2024-01-07T19:11:18+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"The paper defines Agent AI as interactive multimodal systems that perceive grounded data and generate embodied actions, arguing this approach can mitigate hallucinations in foundation models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.05232","ref_index":217,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A 
Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions","primary_cat":"cs.CL","submitted_at":"2023-11-09T09:25:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"The paper surveys hallucination in LLMs with an innovative taxonomy, factors, detection methods, benchmarks, mitigation strategies, and open research directions.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"A foundational principle for assessing the faithfulness of generated text is anchored on the idea that genuinely faithful content should inherently be entailed by its source content. In line with this, numerous studies [82, 208] have trained classifiers on NLI datasets to identify factual inaccuracies, especially in the context of abstract summarization. However, Mishra et al. [217] highlighted that the mismatch in input granularity between conventional NLI datasets and inconsistency detection datasets limits their applicability for effectively detecting inconsistencies. 
Building on this, more advanced studies have proposed methods such as fine-tuning on adversarial datasets [17], decomposing the entailment decisions at the dependency arc level [101], and segmenting documents into sentence units then"},{"citing_arxiv_id":"2309.05922","ref_index":32,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Survey of Hallucination in Large Foundation Models","primary_cat":"cs.AI","submitted_at":"2023-09-12T02:34:06+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A survey classifying hallucination phenomena specific to large foundation models, establishing evaluation criteria, examining mitigation strategies, and discussing future directions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2303.08896","ref_index":24,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models","primary_cat":"cs.CL","submitted_at":"2023-03-15T19:31:21+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SelfCheckGPT detects hallucinations by checking consistency across multiple sampled responses from black-box LLMs on WikiBio biography generation tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2201.11903","ref_index":40,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models","primary_cat":"cs.CL","submitted_at":"2022-01-28T02:33:07+00:00","verdict":"ACCEPT","verdict_confidence":"HIGH","novelty_score":9.0,"formal_verification":"none","one_line_summary":"Chain-of-thought prompting, by including intermediate reasoning steps in few-shot examples, elicits 
strong reasoning abilities in large language models on arithmetic, commonsense, and symbolic tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}