{"total":13,"items":[{"citing_arxiv_id":"2606.30237","ref_index":42,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Comparing Human and Automatic Recognition of Dutch Dysarthric Continuous Speech: A Case Study","primary_cat":"cs.CL","submitted_at":"2026-06-29T12:47:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Case study finds that fine-tuned ASR models outperform human listeners on Dutch dysarthric continuous speech from one speaker, lowering WER from over 70% to over 23%.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.00460","ref_index":71,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SALSA: Speech Aware LLM Adaptation via Learned Steering Activation Vectors","primary_cat":"cs.CL","submitted_at":"2026-05-30T00:54:53+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"SALSA adapts speech-aware LLMs via supervised layer-wise steering vectors, reporting up to 46.8% relative gains over zero-shot on out-of-domain speech benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.28253","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Building Community-Centred NLP Resources for Puno Quechua","primary_cat":"cs.CL","submitted_at":"2026-05-27T10:04:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"First dedicated ASR corpus of 66 hours and systematic benchmarks for Puno Quechua using participatory collection and open release of data and fine-tuned models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.26978","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"PashtoTTS-Bench: automated screening for low-resource non-Latin-script text-to-speech","primary_cat":"cs.CL","submitted_at":"2026-05-26T13:03:20+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Introduces INSV-A automated screening benchmark for Pashto TTS systems reporting WER, script fidelity, and LID results across five systems on FLEURS and Common Voice prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.19066","ref_index":78,"ref_count":3,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Annotation Scarcity Paradox in Low-Resource NLP Evaluation: A Decade of Acceleration and Emerging Constraints","primary_cat":"cs.CL","submitted_at":"2026-05-18T19:48:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"A narrative survey of low-resource NLP evaluation identifies the Annotation Scarcity Paradox as a structural mismatch between scalable models and scarce sociolinguistic evaluation capacity.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17846","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"UrduSpeech: A 156-Hour Urdu Speech Corpus with 12-Dimension Paralinguistic Annotations","primary_cat":"eess.AS","submitted_at":"2026-05-18T04:40:31+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"UrduSpeech is a 156-hour high-fidelity Urdu speech corpus with 12-dimension paralinguistic annotations, a 9-hour manually corrected benchmark, and open-source release to support speech technology for an under-resourced language.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.16896","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"JSPG: Dynamic Dictionary Filtering via Joint Semantic-Pinyin-Glyph Retrieval for Chinese Contextual ASR","primary_cat":"cs.CL","submitted_at":"2026-05-16T09:16:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"JSPG jointly combines semantic, pinyin, and glyph retrieval with an extended Smith-Waterman algorithm to dynamically filter keyword dictionaries and improve accuracy in Chinese contextual ASR.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.03590","ref_index":17,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"AfriVox-v2: A Domain-Verticalized Benchmark for In-the-Wild African Speech Recognition","primary_cat":"cs.CL","submitted_at":"2026-05-05T10:04:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AfriVox-v2 is a benchmark that evaluates modern speech models on in-the-wild African audio with domain-specific tests for sectors including government, finance, health, and agriculture.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.16287","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"NaijaS2ST: A Multi-Accent Benchmark for Speech-to-Speech Translation in Low-Resource Nigerian Languages","primary_cat":"cs.SD","submitted_at":"2026-04-17T17:49:46+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.10736","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"BlasBench: An Open Benchmark for Irish Speech Recognition","primary_cat":"cs.CL","submitted_at":"2026-04-12T17:17:54+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"BlasBench supplies an Irish-aware normalizer and scoring harness that enables reproducible ASR comparisons and exposes a 33-43 point generalization gap for fine-tuned models versus 7-10 points for massively multilingual ones.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09332","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Phonemes vs. Projectors: An Investigation of Speech-Language Interfaces for LLM-based ASR","primary_cat":"eess.AS","submitted_at":"2026-04-10T14:00:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Phoneme-based interfaces match or surpass projector-based ones for LLM ASR, especially in low-resource languages, and a BPE-phoneme hybrid offers additional improvements.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.04598","ref_index":6,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Benchmarking Multilingual Speech Models on Pashto: Zero-Shot ASR, Script Failure, and Cross-Domain Evaluation","primary_cat":"cs.CL","submitted_at":"2026-04-06T11:23:42+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Multilingual ASR models show 39.7-297% zero-shot WER on Pashto public data, Whisper models output correct script in under 0.8% of cases, and fine-tuned models degrade to 32.5-59% WER on out-of-domain sets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.00688","ref_index":57,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-01T09:45:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"OmniVoice introduces a diffusion language model-style non-autoregressive TTS system that directly maps text to multi-codebook acoustic tokens, scaling zero-shot synthesis to over 600 languages with SOTA results on multilingual benchmarks using 581k hours of open data.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"using word error rate (WER) or character error rate (CER) across different languages. For brevity and consistency, we refer to both WER and CER as WER for datasets using both metrics (Seed- TTS and MiniMax-Multilingual-24), while evaluating each language with its appropriate metric. Specifically, we use the Hubert-based ASR model [55] for LibriSpeech-PC test-clean, Paraformer- zh [56] for Chinese, the Omnilingual ASR model [57] for the FLEURS benchmark, and Whisper- large-v3 [58] for the remaining datasets. We also adopt UTMOS [59] to assess objective speech naturalness. These objective metrics are supplemented with subjective evaluations, including comparative mean opinion score (CMOS,[−3,3]) and similarity mean opinion score (SMOS,[0,5]), which measure human opinions on relative speech quality and absolute speaker similarity to the prompt audio."}],"limit":50,"offset":0}