{"total":51,"items":[{"citing_arxiv_id":"2606.00635","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"How Neural Losses Shape VAE Latents","primary_cat":"cs.LG","submitted_at":"2026-05-30T09:20:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Neural reconstruction losses in VAEs reduce latent information content and produce more isotropic latent geometries with even uncertainty distribution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.30614","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Audio Pirates: Black-box Audio Watermark Removal via Diffusion Priors","primary_cat":"cs.CR","submitted_at":"2026-05-28T22:07:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"DiffErase removes black-box audio watermarks via diffusion priors by adding intermediate noise and regenerating with a pretrained model, preserving quality across audio domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20946","ref_index":26,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Thinking-while-speaking: A Controlled, Interleaved Reasoning Method for Real-Time Speech Generation","primary_cat":"cs.CL","submitted_at":"2026-05-20T09:32:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"InterRS enables real-time speech generation with interleaved reasoning via a controlled data pipeline, interleaved SFT, and RL using TA-Balance and Linguistic Quality rewards, yielding 13% gains on math and logic benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.21538","ref_index":7,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Academic Text-to-Music Grand Challenge: Datasets, Baselines, and Evaluation Methods","primary_cat":"cs.SD","submitted_at":"2026-05-20T07:18:24+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20519","ref_index":16,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Codec-Robust Attacks on Audio LLMs","primary_cat":"cs.SD","submitted_at":"2026-05-19T21:39:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"CodecAttack perturbs audio in codec latent space with multi-bitrate EoT to achieve 85.5% average ASR on Opus-compressed Audio LLMs versus under 26% for waveform baselines, with transfer to MP3 and AAC.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.17085","ref_index":32,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Taming Audio VAEs via Target-KL Regularization","primary_cat":"cs.SD","submitted_at":"2026-05-16T17:01:47+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"The paper introduces target-KL regularization to train audio VAEs at specific bitrates, enabling rate-distortion curves and comparison to discrete audio codecs for improved text-to-sound generation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15831","ref_index":2,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Modeling Music as a Time-Frequency Image: A 2D Tokenizer for Music Generation","primary_cat":"cs.SD","submitted_at":"2026-05-15T10:35:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"BandTok tokenizes Mel-spectrograms as independent time-frequency band tokens from a single codebook and pairs it with 2D RoPE in an autoregressive model to improve music generation over residual multi-codebook tokenizers.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.15418","ref_index":25,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A General Differentiable Ray-Wave Framework for Hybrid Refractive-Diffractive System Modeling and Optimization","primary_cat":"physics.optics","submitted_at":"2026-05-14T21:04:00+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A plug-and-play differentiable model bridging ray and wave optics for hybrid systems that enables end-to-end optimization of planar and conformal diffractive elements.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"; Gu, M. B.; Kivshar, Y. S.; Altug, H. Angle- multiplexed all-dielectric metasurfaces for broadband molecular fingerprint retrieval. Science Advances2019,5, eaaw2871. (24) Sell, D.; Yang, J.; Doshay, S.; Yang, R.; Fan, J. A. Large-Angle, Multifunctional Meta- gratings Based on Freeform Multimode Geometries.Nano Letters2017,17, 3752-3757, PMID: 28459583. (25) Jiang, J.; Fan, J. A. Multiobjective and categorical global optimization of photonic structures based on ResNet generative neural networks.Nanophotonics2021,10, 361- 369. (26) Chen, M.; Jiang, J.; Fan, J. A. Design Space Reparameterization Enforces Hard Geo- metric Constraints in Inverse-Designed Nanophotonic Devices.ACS Photonics2020, 7, 3141-3151."},{"citing_arxiv_id":"2605.13404","ref_index":8,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Seconds-Aligned PCA-DAC Latent Diffusion for Symbolic-to-Audio Drum Rendering","primary_cat":"cs.SD","submitted_at":"2026-05-13T11:59:41+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Sec2Drum-DAC renders drum audio from symbolic inputs via diffusion on PCA-reduced DAC latents, improving spectral and transient metrics over regression baselines on 1733 held-out windows.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"The present work adopts this latent-denoising perspective in the setting of explicitly conditioned drum rendering rather than unconstrained image synthesis. Codec-based audio generation.Recent audio generation systems increasingly rely on learned low-rate intermediate representations. SoundStream, EnCodec, and DAC establish neural-codec representations with residual vector quantization and high-fidelity decoding [8, 20, 33]. AudioLM and MusicLM showed that neural-codec representations can support broad audio and music generation [2, 4]. AudioLDM 2 demonstrates diffusion over learned audio representations for broad audio- generation tasks [21], while Stable Audio and long-form music diffusion show that timing-aware latent diffusion can scale to long-duration, high-resolution synthesis [10, 11]."},{"citing_arxiv_id":"2605.13248","ref_index":17,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Compact Latent Manifold Translation: A Parameter-Efficient Foundation Model for Cross-Modal and Cross-Frequency Physiological Signal Synthesis","primary_cat":"eess.SP","submitted_at":"2026-05-13T09:31:36+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A compact 0.09B model using hierarchical discrete tokenization and prompted latent translation outperforms larger baselines in cross-modal PPG-to-ECG synthesis and cross-frequency super-resolution.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.11098","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"AffectCodec: Emotion-Preserving Neural Speech Codec for Expressive Speech Modeling","primary_cat":"cs.SD","submitted_at":"2026-05-11T18:04:33+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"AffectCodec is an emotion-guided neural speech codec that preserves emotional cues during quantization while maintaining semantic fidelity and prosodic naturalness.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"k=1 operating at multiple temporal and spectral resolutions is em- ployed, including multi-scale STFT, multi-period, and multi-scale waveform discriminators. The gen- erator is optimized to produce reconstructions that are indistinguishable from real speech. Its adver- sarial loss is defined using the hinge formulation as LG adv = 1 K KX k=1 max(0,1−D k(ˆx)),(12) where ˆx denotes the reconstructed waveform. The discriminators are trained to distinguish real speech x from generated samples ˆx. Their objective is given by LD adv = 1 K KX k=1 h max(0,1−D k(x)) + max(0,1 +D k(ˆx)) i . (13) This adversarial formulation encourages high- fidelity waveform reconstruction while stabilizing training across multiple discriminative views."},{"citing_arxiv_id":"2605.10281","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Drum Synthesis from Expressive Drum Grids via Neural Audio Codecs","primary_cat":"cs.SD","submitted_at":"2026-05-11T09:40:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A Transformer predicts tokens from neural audio codecs (EnCodec, DAC, X-Codec) to convert expressive drum grids into audio, trained and evaluated on the E-GMD dataset using objective metrics.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06765","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"VITA-QinYu: Expressive Spoken Language Model for Role-Playing and Singing","primary_cat":"cs.CL","submitted_at":"2026-05-07T17:59:56+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"VITA-QinYu is the first expressive end-to-end spoken language model supporting role-playing and singing alongside conversation, trained on 15.8K hours of data and outperforming prior models on expressiveness and conversational benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.06582","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"PairAlign: A Framework for Sequence Tokenization via Self-Alignment with Applications to Audio Tokenization","primary_cat":"cs.LG","submitted_at":"2026-05-07T17:11:22+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Positive predictability alone is not sufficient, because a collapsed tokenizer could make many inputs pre- dictable by assigning generic sequences. PairAlign therefore combines Eq. 12 with in-batch likelihood con- trast. For a conditioning representationZi, the paired teacher sequence should score higher than mismatched teacher sequences from other examples: ¯s(ˆT + i |Zi)>¯s(ˆT + j |Zi), j̸=i,(13) where¯sdenotes length-normalized conditional log-likelihood. Thus, PairAlign is not only a conditional sequence-generation objective; it is a discriminative self-alignment objective over learned symbolic sequences. Relation to wav2tok.wav2tok provides the closest prior connection between retrieval-oriented speech tokenization and transduction-style sequence likelihood (Banerjee & Arora, 2022)."},{"citing_arxiv_id":"2605.03937","ref_index":3,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MiniMind-O Technical Report: An Open Small-Scale Speech-Native Omni Model","primary_cat":"cs.SD","submitted_at":"2026-05-05T16:27:33+00:00","verdict":"ACCEPT","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"MiniMind-O delivers a working 0.1B-scale open omni model with speech-native output, Thinker-Talker split, frozen encoders, and full release of code, checkpoints, and training data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.26465","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Diffusion Reconstruction towards Generalizable Audio Deepfake Detection","primary_cat":"cs.SD","submitted_at":"2026-04-29T09:21:26+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Diffusion reconstruction creates hard samples for audio deepfake detection training, and when paired with feature aggregation and RACL, it reduces average EER versus baselines.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.23077","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Adopting State-of-the-Art Pretrained Audio Representations for Music Recommender Systems","primary_cat":"cs.IR","submitted_at":"2026-04-25T00:09:58+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Pretrained audio models show large performance gaps between standard MIR tasks and music recommendation in both hot and cold-start settings.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"MIR that will be used to compute audio representations and briefly describe each of them. We describe the recommendation models we used, outline training details1, and conclude with results and discussion. 2 BACKGROUND To start our discussion, we turn our attention to different ways to incorporate audio in MRS that have been used throughout the years. To this end, we take recent reviews on the topic [19, 54, 69] covering papers from 2006 to 2022, select all papers that use audio content and study the way it is processed. This funnel produced a total of 45 papers; the full list can be found in the appendix in Table 14. It is important to note that our overview is only as comprehensive as the reviews we selected, and we can expect some relevant papers to be missing, but we believe it to be a reasonable"},{"citing_arxiv_id":"2604.19949","ref_index":132,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Indic-CodecFake meets SATYAM: Towards Detecting Neural Audio Codec Synthesized Speech Deepfakes in Indic Languages","primary_cat":"eess.AS","submitted_at":"2026-04-21T19:54:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"Introduces the Indic-CodecFake dataset for Indic codec deepfakes and SATYAM, a novel hyperbolic ALM that outperforms baselines through dual-stage semantic-prosodic fusion using Bhattacharya distance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.19330","ref_index":30,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Text-To-Speech with Chain-of-Details: modeling temporal dynamics in speech generation","primary_cat":"eess.AS","submitted_at":"2026-04-21T10:58:48+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Chain-of-Details (CoD) is a cascaded TTS method that explicitly models temporal coarse-to-fine dynamics with a shared decoder, achieving competitive performance using significantly fewer parameters.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17852","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"LLM-Codec: Neural Audio Codec Meets Language Model Objectives","primary_cat":"cs.SD","submitted_at":"2026-04-20T06:02:25+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"LLM-Codec augments audio codec training with multi-step token prediction and contrastive semantic alignment to improve both waveform reconstruction and autoregressive predictability for speech language models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.17642","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HCFD: A Benchmark for Audio Deepfake Detection in Healthcare","primary_cat":"eess.AS","submitted_at":"2026-04-19T22:26:28+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HCFD is a new pathology-aware benchmark and dataset for codec-fake audio detection in healthcare, with PHOENIX-Mamba achieving up to 97% accuracy by modeling fakes as modes in hyperbolic space.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.11096","ref_index":1,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Efficient Training for Cross-lingual Speech Language Models","primary_cat":"cs.CL","submitted_at":"2026-04-13T07:12:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CSLM achieves cross-modal and cross-lingual alignment in speech LLMs via continual pre-training on discrete tokens and speech-text interleaved instruction tuning, enabling scalability without massive speech datasets.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09371","ref_index":20,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Discrete Token Modeling for Multi-Stem Music Source Separation with Language Models","primary_cat":"eess.AS","submitted_at":"2026-04-10T14:40:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A Conformer-conditioned decoder-only language model generates discrete tokens via a neural audio codec to separate four music stems, reaching near state-of-the-art perceptual quality and top NISQA on vocals in MUSDB18-HQ tests.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.09054","ref_index":10,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"HAFM: Hierarchical Autoregressive Foundation Model for Music Accompaniment Generation","primary_cat":"cs.SD","submitted_at":"2026-04-10T07:27:55+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.01929","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Woosh: A Sound Effects Foundation Model","primary_cat":"cs.SD","submitted_at":"2026-04-02T11:49:00+00:00","verdict":"ACCEPT","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Woosh is a new publicly released foundation model optimized for high-quality sound effect generation from text or video, showing competitive or better results than open alternatives like Stable Audio Open.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Demo samples for the encoder/decoder, T2A, and V2A modules can be found at https://sonyresearch.github.io/Woosh. 2 Woosh-AE: Audio Encoder/Decoder 2.1 Architecture The Woosh-AE module is based on the VOCOS architecture [11], a GAN-based vocoder operating on the domain of the short-time Fourier transform (STFT) complex coefficients. Compared to other popular en- coder/decoders like Encodec [12] or DAC [13], the VOCOS vocoder does not use quantization and it per- forms one-step down/up sampling via STFT/iSTFT efficiently. The iSTFT avoids aliasing artifacts, typically linked to transposed convolutions used in upsampling [14] in alternative approaches. Note that Woosh-AE works on monaural audio only. The VOCOS architecture uses a cascade of ConvNeXt blocks with residual connections both in the encoder"},{"citing_arxiv_id":"2604.01120","ref_index":21,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Diff-VS: Efficient Audio-Aware Diffusion U-Net for Vocals Separation","primary_cat":"eess.AS","submitted_at":"2026-04-01T16:44:19+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Diff-VS is an efficient audio-aware diffusion U-Net for vocal separation that matches discriminative baselines on objective metrics while achieving state-of-the-art perceptual quality via proxy measures.","context_count":1,"top_context_role":"method","top_context_polarity":"use_method","context_text":"training and inference are largely model-agnostic. In other words, the EDM framework is robust to architectural modi- fications, allowing us to incorporate music-specific enhance- ments on top of the DDPM++ model [19]. We will discuss these enhancements in Section 3.2. Second, EDM optimizes the sampling process to achieve higher quality with fewer steps. Is uses Heun's second-order method [21] as ODE solver together with a custom inference noise schedule: σi<N = \u0012 σmax 1 ρ + i N−1 \u0010 σmin 1/ρ −σ max 1/ρ \u0011\u0013ρ (1) σmax is set to 80 andσ min is set to 0.002. Since trun- cation errors nearσ min are more impactful,ρis adjusted be- tween 5 to 10 to concentrate (i.e. shorten) steps close toσmin. These choices allow us to reduce the number of sampling"},{"citing_arxiv_id":"2603.25551","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Voxtral TTS","primary_cat":"cs.AI","submitted_at":"2026-03-26T15:23:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Voxtral TTS produces expressive multilingual speech from 3-second reference audio with a hybrid autoregressive-plus-flow-matching architecture and a new VQ-FSQ tokenizer, achieving 68.4% win rate over ElevenLabs in human evaluations.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.03190","ref_index":46,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Expectation and Acoustic Neural Network Representations Enhance Music Identification from Brain Activity","primary_cat":"cs.AI","submitted_at":"2026-03-03T17:47:09+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Separating acoustic and expectation ANN representations as teacher targets improves EEG music identification beyond baselines and seed ensembles.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2603.02364","ref_index":18,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"When Spoof Detectors Travel: Evaluation Across 66 Languages in the Low-Resource Language Spoofing Corpus","primary_cat":"cs.SD","submitted_at":"2026-03-02T20:11:06+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"LRLspoof corpus and threshold-transfer evaluation demonstrate that spoof detection performance varies markedly across languages, identifying language as an independent domain shift factor.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2601.15621","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Qwen3-TTS Technical Report","primary_cat":"cs.SD","submitted_at":"2026-01-22T03:51:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Qwen3-TTS delivers state-of-the-art multilingual TTS performance with 3-second voice cloning, description control, and ultra-low-latency streaming via dual tokenizers and a dual-track LM architecture trained on over 5 million hours of data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.07571","ref_index":11,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"A Simple Method to Enhance Pre-trained Language Models with Speech Tokens for Classification","primary_cat":"cs.CL","submitted_at":"2025-12-08T14:05:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Lasso-selected speech tokens enhance text LLMs for multimodal classification by reducing long audio sequences to task-relevant features via self-supervised adaptation.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2512.01537","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Two-Dimensional Quantization for Geometry-Aware Audio Coding","primary_cat":"cs.SD","submitted_at":"2025-12-01T11:06:38+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Q2D2 uses 2D geometric grid projections to quantize feature pairs in neural audio codecs, yielding implicit codebooks that improve efficiency and utilization over RVQ, VQ, and FSQ while maintaining reconstruction quality.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2511.21577","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"HarmonicAttack: An Adaptive Cross-Domain Audio Watermark Removal","primary_cat":"cs.SD","submitted_at":"2025-11-26T16:51:20+00:00","verdict":"CONDITIONAL","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A black-box audio watermark removal attack trained on limited samples that generalizes across datasets and watermark schemes with high attack success rates.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2510.19127","ref_index":3,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Steering Autoregressive Music Generation with Recursive Feature Machines","primary_cat":"cs.LG","submitted_at":"2025-10-21T23:23:14+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"MusicRFM discovers interpretable concept directions in music model hidden states using RFM probes and injects them at inference to steer generation toward desired musical properties without retraining.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2509.11717","ref_index":12,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"CodecSep: Prompt-Driven Universal Sound Separation on Neural Audio Codec Latents","primary_cat":"cs.SD","submitted_at":"2025-09-15T09:12:57+00:00","verdict":null,"verdict_confidence":null,"novelty_score":null,"formal_verification":null,"one_line_summary":null,"context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.16632","ref_index":15,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Step-Audio 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2505.00409","ref_index":94,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Perceptual implications of automatic anonymization in pathological speech","primary_cat":"eess.AS","submitted_at":"2025-05-01T09:03:03+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Listeners detect automatic anonymization in pathological speech at 91-93% accuracy with a 30-point perceived quality drop, yet clinical severity ratings stay nearly unchanged for dysarthria, dysglossia, and dysphonia.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.05139","ref_index":60,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound","primary_cat":"cs.SD","submitted_at":"2025-02-07T18:15:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Unified no-reference models assess audio aesthetics across speech, music, and sound via four perceptual axes and achieve performance comparable or superior to human mean opinion scores.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2502.04230","ref_index":19,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"XAttnMark: Learning Robust Audio Watermarking with Cross-Attention","primary_cat":"cs.SD","submitted_at":"2025-02-06T17:15:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"XAttnMark is a new neural audio watermarking method using partial parameter sharing, cross-attention for message retrieval, temporal conditioning, and a psychoacoustic TF masking loss that reports state-of-the-art detection and attribution robustness.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2411.19650","ref_index":19,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","primary_cat":"cs.RO","submitted_at":"2024-11-29T12:06:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CogACT is a new VLA model that uses a conditioned diffusion action transformer to achieve over 35% higher average success rates than OpenVLA in simulation and 55% in real-robot experiments while generalizing to new robots and objects.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"diction in simple ways, leading to several issues that hin- der task performance. For instance, works like [8, 30] di- rectly quantize the continuous spectrum of robot actions into discrete bins in accordance to the next token predic- tion scheme of VLMs. However, such a simple quantiza- tion, unlike sophisticated tokenizers such as those designed for images [65, 72] and audio [19, 73], poses difficulties in action learning and limits action precision. [32] intro- duces additional action heads, such as LSTMs, to transform VLM output into actions. The shift to a regression-based learning scheme, however, overlooks the probabilistic and multimodal1 nature of actions. In this paper, we propose a new VLA model architec- ture derived from VLM."},{"citing_arxiv_id":"2411.17690","ref_index":14,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Mechanisms of Multimodal Synchronization: Insights from Decoder-Based Video-Text-to-Speech Synthesis","primary_cat":"cs.MM","submitted_at":"2024-11-26T18:57:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Experiments with a video-text-to-speech transformer show co-temporal positional indexing enables synchronization without timestamps, text and video supply complementary signals, and modality ordering creates a trade-off between in-domain accuracy and cross-domain generalization.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2410.13720","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Movie Gen: A Cast of Media Foundation Models","primary_cat":"cs.CV","submitted_at":"2024-10-17T16:22:46+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A 30B-parameter transformer and related models generate high-quality videos and audio, claiming state-of-the-art results on text-to-video, video editing, personalization, and audio generation tasks.","context_count":1,"top_context_role":"other","top_context_polarity":"unclear","context_text":"second segment 0 5 10 15 20 25 Frame index 0.0 0.2 0.4 0.6 0.8 1.0Padded soft-mask first segment second segment Figure 29 Comparing uniform weighting (top) and triangle window based weighting (bottom) for multi-diffusion merging with an example oflwin = 20 and lctx = 5. The left column shows the unnormalized weights contributed from the first segment ([0, 15]) and the second ([10, 30]), and the right column shows the normalized weights. With triangle window, the transition in the overlapping region ([10, 15]) is much smoother. used in segment-level autoregressive generation is applied to the overlapping frames (nctx frames), while the soft-masking functions {m(j)}j are defined for all the frames each segment spans."},{"citing_arxiv_id":"2410.06885","ref_index":88,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching","primary_cat":"eess.AS","submitted_at":"2024-10-09T13:46:34+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"F5-TTS generates natural speech from text via flow matching on DiT with simple text padding, ConvNeXt refinement, and sway sampling, trained on 100K hours multilingual data.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2409.18512","ref_index":6,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Expressive Prompting: Improving Emotion Intensity and Speaker Consistency in Zero-Shot TTS","primary_cat":"cs.SD","submitted_at":"2024-09-27T07:46:52+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":3.0,"formal_verification":"none","one_line_summary":"A two-stage static-then-dynamic prompt selection strategy using prosodic features, LLM coherence scores, and similarity metrics improves emotion intensity and speaker consistency in zero-shot TTS.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2404.19737","ref_index":4,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Better & Faster Large Language Models via Multi-token Prediction","primary_cat":"cs.CL","submitted_at":"2024-04-30T17:33:57+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Multi-token prediction training yields higher sample efficiency, better benchmark scores on code generation, and up to 3x faster inference than standard next-token prediction for LLMs.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2403.11247","ref_index":35,"ref_count":1,"confidence":0.98,"is_internal_anchor":true,"paper_title":"Compact 3D Gaussian Splatting For Dense Visual SLAM","primary_cat":"cs.CV","submitted_at":"2024-03-17T15:41:35+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"A compact 3D Gaussian Splatting SLAM system reduces Gaussian count and parameter size via masking and a geometry codebook while preserving SOTA reconstruction quality and pose accuracy.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2311.07919","ref_index":9,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models","primary_cat":"eess.AS","submitted_at":"2023-11-14T05:34:50+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Qwen-Audio trains a unified model on diverse audio and tasks with hierarchical tags to enable strong zero-shot performance on audio understanding benchmarks and multi-turn audio chat.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2310.05737","ref_index":105,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation","primary_cat":"cs.CV","submitted_at":"2023-10-09T14:10:29+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"A new shared video-image tokenizer enables large language models to surpass diffusion models on standard visual generation benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2306.12925","ref_index":18,"ref_count":2,"confidence":0.98,"is_internal_anchor":true,"paper_title":"AudioPaLM: A Large Language Model That Can Speak and Listen","primary_cat":"cs.CL","submitted_at":"2023-06-22T14:37:54+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"AudioPaLM unifies PaLM-2 and AudioLM to outperform prior systems on speech translation while enabling zero-shot speech-to-text for many unseen language pairs and voice transfer from short prompts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2301.11325","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":true,"paper_title":"MusicLM: Generating Music From Text","primary_cat":"cs.SD","submitted_at":"2023-01-26T18:58:53+00:00","verdict":"CONDITIONAL","verdict_confidence":"MODERATE","novelty_score":8.0,"formal_verification":"none","one_line_summary":"MusicLM produces coherent multi-minute 24 kHz music from text prompts using hierarchical sequence-to-sequence modeling and outperforms prior systems in quality and text adherence.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}