{"total":16,"items":[{"citing_arxiv_id":"2606.30682","ref_index":25,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"ALM2Vec: Learning Audio Embeddings for Universal Audio Retrieval with Large Audio-Language Models","primary_cat":"cs.SD","submitted_at":"2026-06-27T03:56:57+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"ALM2Vec learns unified audio embeddings from large audio-language models for text-audio retrieval, instruction-aware retrieval, and other tasks across domains.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20101","ref_index":52,"ref_count":2,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Hybrid Diffusion Transformer for Instruction-Guided Audio Editing via Rectified Flow","primary_cat":"cs.SD","submitted_at":"2026-06-18T11:20:08+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Hybrid two-stage diffusion transformer architecture for instruction-guided audio editing via rectified flow that performs joint attention at low resolution then alternates joint and cross-attention at high resolution for improved performance and efficiency.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.20077","ref_index":16,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"The Hidden Evolution of Disguised Visual Context inside the VLM","primary_cat":"cs.CV","submitted_at":"2026-06-18T10:52:49+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"Visual tokens enter VLMs as raw signals and are reshaped differently by in-context versus layer-injection paradigms, each capturing distinct frequency characteristics that drive task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.11260","ref_index":7,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"RAIL: Rethinking Auditory Intelligence in Large Audio-Language Models with a CHC-Grounded Benchmark","primary_cat":"cs.SD","submitted_at":"2026-06-09T02:38:17+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Introduces RAIL, a CHC-grounded benchmark with five core auditory capabilities to assess LALMs beyond task-centric metrics, showing uneven model performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.08425","ref_index":34,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TinyGiantALM: A Compact Audio-Language Model for Intent-Aware Reasoning under Resource Constraints","primary_cat":"cs.SD","submitted_at":"2026-06-07T02:50:24+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"TinyGiantALM, a compact 1.5B audio-language model with instruction-aware refinement, achieves 46.4% zero-shot accuracy on MMAR and outperforms models up to 8x larger in mixed-modality tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.18273","ref_index":15,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Continuous Audio Thinking for Large Audio Language Models","primary_cat":"cs.CL","submitted_at":"2026-06-05T11:38:30+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"CoAT adds a continuous latent thinking space to LALMs via expert distillation to retain acoustic information, yielding gains on audio reasoning, understanding, music, emotion, and transcription benchmarks across three models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2606.05121","ref_index":13,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Audio Interaction Model","primary_cat":"cs.SD","submitted_at":"2026-06-03T17:26:11+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Audio-Interaction unifies offline and online audio tasks into one streaming model via the SoundFlow framework and a new 2.6M-item streaming corpus, enabling real-time instruction following and proactive responses.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.27741","ref_index":4,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Escape the Language Prior: Mitigating Late-Stage Modality Collapse in Audio Reasoning via Modality-Aware Policy Optimization","primary_cat":"cs.CL","submitted_at":"2026-05-26T22:34:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"MAPO is a dual-branch RL framework using modality relevance masks from cross-modal differential entropy and auxiliary attention losses to reduce late-stage modality collapse in audio reasoning models and improve benchmark results.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.20266","ref_index":129,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"A Survey of Large Audio Language Models: Generalization, Trustworthiness, and Outlook","primary_cat":"cs.SD","submitted_at":"2026-05-18T20:21:32+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":5.0,"formal_verification":"none","one_line_summary":"A survey of Large Audio Language Models that establishes a taxonomy of trustworthiness vulnerabilities and proposes a Defense-in-Depth roadmap for audio intelligence.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"Contin. -✗✓ ✓ Year 2025 MinMo [60] Jan 2025 Qwen2.5-7B-Instruct 7B Multi. Contin. -✓ ✓ ✓ FireRedASR [12] Jan 2025 Qwen2-7B-Instruct 7B Multi. Contin. -✗✓ ✓ Step-Audio [127] Feb 2025 Step-1 130B Multi. Discrete 3.3T tokens✗✓ ✓ Baichuan-Audio [128] Feb 2025 Baichuan-Audio-Base 7B EN, CN Discrete 887K Hrs audio + 100B tokens✗✓ ✓ Audio Flamingo 2 [129] Mar 2025 Qwen2.5-3B 3B EN Contin. 8M+ audio-caption pairs✗✓ ✓ Kimi-Audio [130] Apr 2025 Qwen2.5-7B 7B EN, CN Hybrid 13M+ Hrs audio✗✓ ✓ VITA-Audio [131] May 2025 Qwen2.5-7B-Instruct 7B EN, CN Discrete 200K Hrs audio✗✓ ✓ Step-Audio 2 [19] Jul 2025 - - Multi. Contin. 680B tokens and 8M Hrs audio✗✓ ✓ Audio Flamingo 3 [132] Jul 2025 Qwen2.5-7B 7B EN Contin."},{"citing_arxiv_id":"2606.11219","ref_index":74,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Afrispeech Semantics: Evaluating Audio Semantic Reasoning in Spoken Language Models Across Domains and Accents","primary_cat":"cs.CL","submitted_at":"2026-05-11T20:27:40+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"Audio language models are benchmarked on five semantic and paralinguistic reasoning tasks to reveal limitations in handling spoken audio evidence, accent variation, and domain shifts.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2605.07593","ref_index":44,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"TraceAV-Bench: Benchmarking Multi-Hop Trajectory Reasoning over Long Audio-Visual Videos","primary_cat":"cs.CV","submitted_at":"2026-05-08T11:06:43+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":8.0,"formal_verification":"none","one_line_summary":"TraceAV-Bench is the first benchmark for multi-hop trajectory reasoning over long audio-visual videos, showing top models reach only 51-68% accuracy with substantial room for improvement.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[42] Liang-Hsuan Tseng, Yi-Chang Chen, Kuan-Yi Lee, Da-Shan Shiu, and Hung-yi Lee. Taste: Text-aligned speech tokenization and embedding for spoken language modeling.arXiv preprint arXiv:2504.07053, 2025. [43] Ding Ding, Zeqian Ju, Yichong Leng, Songxiang Liu, Tong Liu, Zeyu Shang, Kai Shen, Wei Song, Xu Tan, Heyi Tang, et al. Kimi-audio technical report.arXiv preprint arXiv:2504.18425, 2025. [44] Sreyan Ghosh, Zhifeng Kong, Sonal Kumar, S Sakshi, Jaehyeon Kim, Wei Ping, Rafael Valle, Dinesh Manocha, and Bryan Catanzaro. Audio flamingo 2: An audio-language model with long-audio understanding and expert reasoning abilities.arXiv preprint arXiv:2503.03983, 2025. [45] Yunfei Chu, Jin Xu, Qian Yang, Haojie Wei, Xipin Wei, Zhifang Guo, Yichong Leng, Yuan-"},{"citing_arxiv_id":"2604.23717","ref_index":8,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"HeadRouter: Dynamic Head-Weight Routing for Task-Adaptive Audio Token Pruning in Large Audio Language Models","primary_cat":"cs.SD","submitted_at":"2026-04-26T14:00:31+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"HeadRouter prunes audio tokens more effectively by dynamically routing based on per-head importance for semantic versus acoustic tasks, exceeding baseline performance at 70% token retention on Qwen2.5-Omni models.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2604.13804","ref_index":14,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning","primary_cat":"cs.LG","submitted_at":"2026-04-15T12:39:03+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":7.0,"formal_verification":"none","one_line_summary":"RoleJudge is a multidimensional evaluation framework for speech-character alignment in audio LLMs, backed by the RoleChat dataset and multi-stage RL training with standard alignment to reduce reward issues.","context_count":1,"top_context_role":"background","top_context_polarity":"background","context_text":"[13] Qiming Feng, Qiujie Xie, Xiaolong Wang, Qingqiu Li, Yuejie Zhang, Rui Feng, Tao Zhang, and Shang Gao. 2025. EmoCharacter: Evaluating the Emotional Fidelity of Role-Playing Agents in Dialogues. InProceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers). 6218-6240. [14] Sreyan Ghosh, Zhifeng Kong, Sonal Kumar, S Sakshi, Jaehyeon Kim, Wei Ping, Rafael Valle, Dinesh Manocha, and Bryan Catanzaro. 2025. Audio Flamingo 2: An audio-language model with long-audio understanding and expert reasoning abilities.arXiv preprint arXiv:2503.03983(2025). [15] Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin"},{"citing_arxiv_id":"2604.13023","ref_index":12,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"SpotSound: Enhancing Large Audio-Language Models with Fine-Grained Temporal Grounding","primary_cat":"cs.SD","submitted_at":"2026-04-14T17:57:01+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"SpotSound adds a hallucination-suppressing objective and a needle-in-haystack benchmark to audio-language models, reaching state-of-the-art temporal grounding while keeping general task performance.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2602.07064","ref_index":62,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"OmniFysics: Towards Physical Intelligence Evolution via Omni-Modal Signal Processing and Network Optimization","primary_cat":"cs.CV","submitted_at":"2026-02-05T14:04:51+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":4.0,"formal_verification":"none","one_line_summary":"OmniFysics is an omni-modal network using a dynamic physical data engine and evolutive tuning to improve performance on multimodal benchmarks and physics-oriented tasks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null},{"citing_arxiv_id":"2507.16632","ref_index":24,"ref_count":1,"confidence":0.9,"is_internal_anchor":false,"paper_title":"Step-Audio 2 Technical Report","primary_cat":"cs.CL","submitted_at":"2025-07-22T14:23:55+00:00","verdict":"UNVERDICTED","verdict_confidence":"LOW","novelty_score":6.0,"formal_verification":"none","one_line_summary":"Step-Audio 2 integrates a latent audio encoder, reasoning-centric reinforcement learning, and discrete audio token generation into language modeling to deliver state-of-the-art performance on audio understanding and conversational benchmarks.","context_count":0,"top_context_role":null,"top_context_polarity":null,"context_text":null}],"limit":50,"offset":0}