{"work":{"id":"2e4e260a-a952-42ae-9dd6-2de2d3127881","openalex_id":null,"doi":null,"arxiv_id":"2410.17196","raw_key":null,"title":"VoiceBench: Benchmarking LLM-Based Voice Assistants","authors":null,"authors_text":"Yiming Chen, Xianghu Yue, Chen Zhang, Xiaoxue Gao, Robby T. Tan, Haizhou Li","year":2024,"venue":"cs.CL","abstract":"Building on the success of large language models (LLMs), recent advancements such as GPT-4o have enabled real-time speech interactions through LLM-based voice assistants, offering a significantly improved user experience compared to traditional text-based interactions. However, the absence of benchmarks designed to evaluate these speech interaction capabilities has hindered progress of LLM-based voice assistants development. Current evaluations focus primarily on automatic speech recognition (ASR) or general knowledge evaluation with clean speeches, neglecting the more intricate, real-world scenarios that involve diverse speaker characteristics, environmental and content factors. To address this, we introduce VoiceBench, the first benchmark designed to provide a multi-faceted evaluation of LLM-based voice assistants. VoiceBench also includes both real and synthetic spoken instructions that incorporate the above three key real-world variations. Extensive experiments reveal the limitations of current LLM-based voice assistant models and offer valuable insights for future research and development in this field.","external_url":"https://arxiv.org/abs/2410.17196","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-28T18:02:27.076519+00:00","pith_arxiv_id":"2410.17196","created_at":"2026-05-10T00:39:48.471464+00:00","updated_at":"2026-06-28T18:02:27.076519+00:00","title_quality_ok":true,"display_title":"VoiceBench: Benchmarking LLM-Based Voice Assistants","render_title":"VoiceBench: Benchmarking LLM-Based Voice Assistants"},"hub":{"state":{"work_id":"2e4e260a-a952-42ae-9dd6-2de2d3127881","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":25,"external_cited_by_count":null,"distinct_field_count":7,"first_pith_cited_at":"2025-03-26T04:17:55+00:00","last_pith_cited_at":"2026-05-31T05:13:32+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T14:08:58.249128+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":7},{"context_role":"dataset","n":3},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"background","n":6},{"context_polarity":"use_dataset","n":3},{"context_polarity":"unclear","n":1},{"context_polarity":"use_method","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}