{"total":18,"items":[{"citing_arxiv_id":"2606.26698","ref_index":266,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Beyond Logical Forms: LLM-Extracted Patterns for Fallacy Classification","primary_cat":"cs.CL","submitted_at":"2026-06-25T07:30:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"LLM-extracted patterns merging logical structures and linguistic cues yield statistically significant gains in fallacy classification over zero-shot baselines with cross-dataset generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25462","ref_index":253,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Optimizing Abstractive Summarization With Fine-Tuned PEGASUS","primary_cat":"cs.CL","submitted_at":"2026-06-24T06:43:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Fine-tuned PEGASUS achieves state-of-the-art ROUGE scores on XL-Sum English corpus with 4.04% ROUGE-1, 15.25% ROUGE-2, and 3.39% ROUGE-L gains over mT5 baseline.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.25380","ref_index":50,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Survey of Toxicity Detection and Mitigation Strategies for Multilingual Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-24T04:24:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":1.0,"formal_verification":"none","one_line_summary":"A survey that catalogs threat models, detection approaches, and mitigation strategies for toxicity in multilingual LLMs while identifying challenges such as uneven language coverage and culturally variable harm definitions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.23204","ref_index":250,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Unmasking LAION-5B: Age, Gender, Race, and Emotion Biases in Large-Scale Image Datasets","primary_cat":"cs.CV","submitted_at":"2026-06-22T11:49:23+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Empirical audit of LAION-2B-en and LAION-2B-multi finds overrepresentation of young adults, White people, and males plus stereotypical emotion associations across two attribute classifiers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.07190","ref_index":256,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"From Correctness to Utility: Gain-Based Prefix Evaluation for LLM Reasoning","primary_cat":"cs.CL","submitted_at":"2026-06-05T11:56:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Prefix gain measured via student-model solve-rate improvement is used to train a Prefix Utility Model (PUM) that supplies stronger supervision than correctness-based process rewards for mathematical reasoning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06349","ref_index":85,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"\"Chi nas dal soch el sent de legn\" -- Auditing Text Corpora for Lombard","primary_cat":"cs.CL","submitted_at":"2026-06-04T16:20:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Manual audit shows web-scraped Lombard corpora are largely noisy and biased toward Western varieties over Eastern ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06088","ref_index":4,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"CHALIS: A Challenge Dataset for Language Identification in Difficult Scenarios","primary_cat":"cs.CL","submitted_at":"2026-06-04T12:26:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces CHALIS benchmark dataset testing language ID on mutually intelligible cousin language pairs and orthographically noisy inputs, with evaluation showing existing systems struggle substantially.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.01806","ref_index":257,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"ProbeScale: Probing Analysis to Optimize Neural Scaling Laws for Efficient Small Language Model Inference","primary_cat":"cs.CL","submitted_at":"2026-06-01T07:24:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ProbScale finds layer subsets in SLMs like RoBERTa-Large and T5-Base that cut parameters 5-10x while retaining 95-98% of original task performance by maximizing aggregated probe scores under a budget.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00285","ref_index":14,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Model-Based Quality Assessment for Massively Multilingual Parallel Data","primary_cat":"cs.CL","submitted_at":"2026-05-29T19:19:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Large-scale benchmarks of multilingual embeddings and QE models show no universal performer; direction-aware routing and calibration recommended for parallel data assessment.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19194","ref_index":239,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MMoA: An AI-Agent framework with recurrence for Memoried Mixure-of-Agent","primary_cat":"cs.CL","submitted_at":"2026-05-18T23:47:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"MMoA adds LSTM recurrence to Mixture-of-Agents routing, reaching 58.0% win rate on AlpacaEval 2.0 versus 59.8% for baseline MoA while cutting runtime by up to 4.6%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.18122","ref_index":278,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Decisive: Guiding User Decisions with Optimal Preference Elicitation from Unstructured Documents","primary_cat":"cs.CL","submitted_at":"2026-04-20T11:42:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Decisive combines document-grounded option scoring with adaptive Bayesian preference elicitation to achieve up to 20% higher decision accuracy than LLMs and existing frameworks across domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.10995","ref_index":104,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"A Human-Centric Framework for Data Attribution in Large Language Models","primary_cat":"cs.CY","submitted_at":"2026-02-11T16:20:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces a parameter-driven framework for data attribution in LLMs that enables negotiation among creators, users, and intermediaries to meet stakeholder goals within the data economy.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Scammy AI-Generated Books Are Flooding Amazon.Wired(jan 2024). https://www.wired.com/story/scammy-ai-generated- books-flooding-amazon/ [103] Pang Wei Koh and Percy Liang. 2017. Understanding black-box predictions via influence functions. InProceedings of the 34th International Conference on Machine Learning - Volume 70(Sydney, NSW, Australia)(ICML'17). JMLR.org, 1885-1894. [104] Julia Kreutzer, Isaac Caswell, Lisa Wang, Ahsan Wahab, Daan van Esch, Nasanbayar Ulzii-Orshikh, Allahsera Tapo, Nishant Subramani, Artem Sokolov, Claytone Sikasote, Monang Setyawan, Supheakmungkol Sarin, Sokhar Samb, Benoît Sagot, Clara Rivera, Annette Rios, Isabel Papadimitriou, Salomey Osei, Pedro Ortiz Suarez, Iroro Orife, Kelechi Ogueji, Andre Niyongabo Rubungo, Toan Q."},{"citing_arxiv_id":"2411.05527","ref_index":29,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"How Good is Your Wikipedia? Auditing Data Quality for Low-resource and Multilingual NLP","primary_cat":"cs.CL","submitted_at":"2024-11-08T12:35:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The study filters non-English Wikipedia, reveals quality problems, proposes a 4-level ranking, and shows filtered data matches or beats raw data in language modeling with largest gains for lower-quality editions.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2406.04952","ref_index":18,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Quantifying Geospatial in the Common Crawl Corpus","primary_cat":"cs.CL","submitted_at":"2024-06-07T14:16:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Analysis estimates 18.7% of Common Crawl documents contain geospatial information like coordinates and addresses, with little difference by language.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.18796","ref_index":285,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Replacing Judges with Juries: Evaluating LLM Generations with a Panel of Diverse Models","primary_cat":"cs.CL","submitted_at":"2024-04-29T15:33:23+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A panel of smaller diverse LLMs outperforms a single large model as an evaluator of generations, showing less intra-model bias and over 7x lower cost.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2401.15391","ref_index":284,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries","primary_cat":"cs.CL","submitted_at":"2024-01-27T11:41:48+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MultiHop-RAG is a new benchmark dataset demonstrating that existing retrieval-augmented generation systems perform poorly on multi-hop queries requiring retrieval and reasoning over multiple evidence pieces.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2312.06681","ref_index":291,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"Steering Llama 2 via Contrastive Activation Addition","primary_cat":"cs.CL","submitted_at":"2023-12-09T04:40:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Contrastive Activation Addition steers Llama 2 Chat by adding averaged residual-stream activation differences from contrastive example pairs to control targeted behaviors at inference time.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2204.06745","ref_index":50,"ref_count":1,"confidence":0.88,"is_internal_anchor":false,"paper_title":"GPT-NeoX-20B: An Open-Source Autoregressive Language Model","primary_cat":"cs.CL","submitted_at":"2022-04-14T04:00:27+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"GPT-NeoX-20B is a publicly released 20B parameter autoregressive language model trained on the Pile that shows strong gains in five-shot reasoning over similarly sized prior models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}