{"total":17,"items":[{"citing_arxiv_id":"2607.02296","ref_index":35,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Spatial Speech Perception Systems: A Survey of Sound Source Localization, Directional Enhancement, and Speech Recognition","primary_cat":"eess.AS","submitted_at":"2026-07-02T15:13:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"A survey of spatial speech perception systems covering sound source localization, directional enhancement, and automatic speech recognition methods and their integration.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.28857","ref_index":29,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"wav2VOT: Automatic estimation of voice onset time, closure duration, and burst realisation with wav2vec2","primary_cat":"cs.SD","submitted_at":"2026-06-27T10:47:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"wav2VOT shows wav2vec2 can estimate voice onset time and related stop consonant features with accuracy comparable to existing tools on unseen data and higher accuracy after fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.21411","ref_index":43,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"CoughPhase-CLR: Designing an acoustics-informed foundation model for coughing sound classification","primary_cat":"cs.SD","submitted_at":"2026-06-19T13:26:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CoughPhase-CLR uses cough physiological phases to build contrastive positive pairs, outperforming random cropping on downstream tasks including COVID-19 detection and COPD classification.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.16009","ref_index":133,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Bridging the Usability Gap: Lessons from Interpreting Studies for Machine Interpreting Design","primary_cat":"cs.CL","submitted_at":"2026-06-14T20:41:59+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Machine interpreting should shift from fidelity metrics to three design priorities—agency, grounding, and experience—drawn from interpreting studies to close the usability gap with human-mediated communication.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11180","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Lip Forcing: Few-Step Autoregressive Diffusion for Real-time Lip Synchronization","primary_cat":"cs.CV","submitted_at":"2026-06-09T17:56:36+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"Lip Forcing distills a 14B bidirectional video diffusion teacher into autoregressive students that achieve real-time lip synchronization at 31 FPS using two denoising steps without CFG.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.06357","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"F3-Tokenizer: Taming Audio Autoencoder Latents for Understanding and Generation","primary_cat":"cs.SD","submitted_at":"2026-06-04T16:25:07+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"F3-Tokenizer adapts audio autoencoder latents with noise-regularized bottleneck (channel normalization and stochastic perturbation) and a representation encoder (RQ-MTP plus frozen-LLM supervision) to support both high-dimensional understanding representations and normalized continuous generation ta","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.29531","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Audio Deepfake Detection with Half-Truth Localisation Using Cross-Attentive Feature Fusion","primary_cat":"cs.SD","submitted_at":"2026-05-28T07:47:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"CAFNet performs joint ternary classification and temporal boundary regression for half-truth audio deepfakes via cross-attentive fusion of MFCC, LFCC, and Chroma-STFT features, reporting 92.71% accuracy and 0.075s MAE on MLADDC T2+T3.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.14855","ref_index":46,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Exploitation of Hidden Context in Dynamic Movement Forecasting: A Neural Network Journey from Recurrent to Graph Neural Networks and General Purpose Transformers","primary_cat":"cs.LG","submitted_at":"2026-05-14T14:02:45+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":2.0,"formal_verification":"none","one_line_summary":"Empirical comparison of LSTM, GNN, and Transformer architectures for NBA trajectory forecasting finds hybrid LSTM with contextual information yields lowest FDE of 1.51m over horizons up to 2s.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23385","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Domain-Adapted Fine-Tuning of ECG Foundation Models for Multi-Label Structural Heart Disease Screening","primary_cat":"cs.LG","submitted_at":"2026-04-25T17:27:42+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Domain-adapted ECG foundation models with self-supervised pretraining and selective fine-tuning reach macro-AUROC 0.8509 for multi-label structural heart disease detection on the EchoNext benchmark.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19797","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Enhancing ASR Performance in the Medical Domain for Dravidian Languages","primary_cat":"eess.AS","submitted_at":"2026-04-10T09:41:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A hybrid confidence-aware ASR training framework with learnable weights reduces Telugu medical WER from 24.3% to 15.8% and Kannada from 31.7% to 25.4%, outperforming standard fine-tuning.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.02941","ref_index":38,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"MMTalker: Multiresolution 3D Talking Head Synthesis with Multimodal Feature Fusion","primary_cat":"cs.CV","submitted_at":"2026-04-03T10:17:39+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.11298","ref_index":2,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Voxtral Realtime","primary_cat":"cs.AI","submitted_at":"2026-02-11T19:17:10+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Voxtral Realtime is an end-to-end trained streaming ASR model that achieves Whisper-level transcription quality at 480ms delay after scaling pretraining across 13 languages.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.16632","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Step-Audio 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.00037","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Moshi: a speech-text foundation model for real-time dialogue","primary_cat":"eess.AS","submitted_at":"2024-09-17T17:55:39+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Moshi is the first real-time full-duplex spoken large language model that casts dialogue as speech-to-speech generation using parallel audio streams and an inner monologue of time-aligned text tokens.","context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"with Helium and pretrained on audio. When not using RQ-Transformer, we predict the 8 levels of tokens with independent classification heads, following Copet et al. (2023). Note that perplexities are only comparable between models with a given delay, as the classification task is easier with more delay for higher tokens. Acoustic Delay RQ-Transformer Perplexity [0, 1, 2, 3, 4, 5, 6, 7] 42 .2 [0, 1, 2, 3, 4, 5, 6, 7] ✓ 40.3 [0, 2, 2, 2, 2, 2, 2, 2] 135 .4 [0, 2, 2, 2, 2, 2, 2, 2] ✓ 36.8 Table 6: Ablation study on delay patterns, weight of the semantic token and Inner Monologue. All models are initialized with Helium, pretrained on audio and use the RQ-Transformer. We vary the weight of the semantic token while keeping the weight"},{"citing_arxiv_id":"2208.03299","ref_index":119,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Atlas: Few-shot Learning with Retrieval Augmented Language Models","primary_cat":"cs.CL","submitted_at":"2022-08-05T17:39:22+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Atlas reaches over 42% accuracy on Natural Questions with only 64 examples, outperforming a 540B-parameter model by 3% with 50x fewer parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2112.09118","ref_index":91,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Unsupervised Dense Information Retrieval with Contrastive Learning","primary_cat":"cs.IR","submitted_at":"2021-12-16T18:57:37+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Contrastive learning trains unsupervised dense retrievers that beat BM25 on most BEIR datasets and support cross-lingual retrieval across scripts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2107.03374","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Evaluating Large Language Models Trained on Code","primary_cat":"cs.LG","submitted_at":"2021-07-07T17:41:24+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Codex achieves 28.8% pass@1 on HumanEval, rising to 70.2% with 100 samples per problem via repeated sampling.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}