{"work":{"id":"1d250ff4-6ca5-4eb5-b561-48106b630d8b","openalex_id":null,"doi":null,"arxiv_id":"2412.02612","raw_key":null,"title":"GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot","authors":null,"authors_text":"Aohan Zeng, Zhengxiao Du, Mingdao Liu, Kedong Wang, Shengmin Jiang, Lei Zhao","year":2024,"venue":"cs.CL","abstract":"We introduce GLM-4-Voice, an intelligent and human-like end-to-end spoken chatbot. It supports both Chinese and English, engages in real-time voice conversations, and varies vocal nuances such as emotion, intonation, speech rate, and dialect according to user instructions. GLM-4-Voice uses an ultra-low bitrate (175bps), single-codebook speech tokenizer with 12.5Hz frame rate derived from an automatic speech recognition (ASR) model by incorporating a vector-quantized bottleneck into the encoder. To efficiently transfer knowledge from text to speech modalities, we synthesize speech-text interleaved data from existing text pre-training corpora using a text-to-token model. We continue pre-training from the pre-trained text language model GLM-4-9B with a combination of unsupervised speech data, interleaved speech-text data, and supervised speech-text data, scaling up to 1 trillion tokens, achieving state-of-the-art performance in both speech language modeling and spoken question answering. We then fine-tune the pre-trained model with high-quality conversational speech data, achieving superior performance compared to existing baselines in both conversational ability and speech quality. The open models can be accessed through https://github.com/THUDM/GLM-4-Voice and https://huggingface.co/THUDM/glm-4-voice-9b.","external_url":"https://arxiv.org/abs/2412.02612","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-28T19:22:34.949168+00:00","pith_arxiv_id":"2412.02612","created_at":"2026-05-10T00:39:48.453882+00:00","updated_at":"2026-06-28T19:22:34.949168+00:00","title_quality_ok":true,"display_title":"GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot","render_title":"GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot"},"hub":{"state":{"work_id":"1d250ff4-6ca5-4eb5-b561-48106b630d8b","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":32,"external_cited_by_count":null,"distinct_field_count":6,"first_pith_cited_at":"2025-02-17T15:58:56+00:00","last_pith_cited_at":"2026-05-30T18:53:35+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T12:38:47.324654+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":9},{"context_role":"method","n":2},{"context_role":"baseline","n":1}],"polarity_counts":[{"context_polarity":"background","n":9},{"context_polarity":"use_method","n":2},{"context_polarity":"baseline","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}