{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:C2GQ777BR6S5ZGOGYLD3N4NGXE","short_pith_number":"pith:C2GQ777B","schema_version":"1.0","canonical_sha256":"168d0fffe18fa5dc99c6c2c7b6f1a6b90cdc30d51a9d3ba7487fbfdfe6f1131f","source":{"kind":"arxiv","id":"2412.02612","version":1},"attestation_state":"computed","paper":{"title":"GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot","license":"http://creativecommons.org/licenses/by/4.0/","headline":"GLM-4-Voice turns a text language model into an end-to-end spoken chatbot that reaches state-of-the-art results in speech language modeling and spoken question answering.","cross_cats":["cs.SD","eess.AS"],"primary_cat":"cs.CL","authors_text":"Aohan Zeng, Jie Tang, Kedong Wang, Lei Zhao, Mingdao Liu, Shengmin Jiang, Yuxiao Dong, Zhengxiao Du","submitted_at":"2024-12-03T17:41:24Z","abstract_excerpt":"We introduce GLM-4-Voice, an intelligent and human-like end-to-end spoken chatbot. It supports both Chinese and English, engages in real-time voice conversations, and varies vocal nuances such as emotion, intonation, speech rate, and dialect according to user instructions. GLM-4-Voice uses an ultra-low bitrate (175bps), single-codebook speech tokenizer with 12.5Hz frame rate derived from an automatic speech recognition (ASR) model by incorporating a vector-quantized bottleneck into the encoder. To efficiently transfer knowledge from text to speech modalities, we synthesize speech-text interlea"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2412.02612","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-12-03T17:41:24Z","cross_cats_sorted":["cs.SD","eess.AS"],"title_canon_sha256":"7541d60a6b93d682e37a502e975d122105084c3539b4b7ab4cd320904b751813","abstract_canon_sha256":"06a88367091efb37490e4ba14e5336a532d84d97a04058097c3a8edf47df8ebd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.178087Z","signature_b64":"jDPk4lHwX0CRHVif9+ia1gcVzdxonbHs99VdyeN001MOAesxJdpoIbkPEkmcms23tjg5hVvKFf2mw3jt2scyBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"168d0fffe18fa5dc99c6c2c7b6f1a6b90cdc30d51a9d3ba7487fbfdfe6f1131f","last_reissued_at":"2026-05-17T23:38:49.177452Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.177452Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot","license":"http://creativecommons.org/licenses/by/4.0/","headline":"GLM-4-Voice turns a text language model into an end-to-end spoken chatbot that reaches state-of-the-art results in speech language modeling and spoken question answering.","cross_cats":["cs.SD","eess.AS"],"primary_cat":"cs.CL","authors_text":"Aohan Zeng, Jie Tang, Kedong Wang, Lei Zhao, Mingdao Liu, Shengmin Jiang, Yuxiao Dong, Zhengxiao Du","submitted_at":"2024-12-03T17:41:24Z","abstract_excerpt":"We introduce GLM-4-Voice, an intelligent and human-like end-to-end spoken chatbot. It supports both Chinese and English, engages in real-time voice conversations, and varies vocal nuances such as emotion, intonation, speech rate, and dialect according to user instructions. GLM-4-Voice uses an ultra-low bitrate (175bps), single-codebook speech tokenizer with 12.5Hz frame rate derived from an automatic speech recognition (ASR) model by incorporating a vector-quantized bottleneck into the encoder. To efficiently transfer knowledge from text to speech modalities, we synthesize speech-text interlea"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"achieving state-of-the-art performance in both speech language modeling and spoken question answering. We then fine-tune the pre-trained model with high-quality conversational speech data, achieving superior performance compared to existing baselines in both conversational ability and speech quality.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The synthesized speech-text interleaved data and the ultra-low-bitrate tokenizer preserve sufficient information for nuanced vocal control and accurate spoken question answering without introducing systematic artifacts or information loss that would undermine the claimed gains.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GLM-4-Voice builds an end-to-end spoken chatbot by deriving a 175bps single-codebook tokenizer from ASR, synthesizing interleaved speech-text data, and continuing pre-training of GLM-4-9B on up to 1 trillion tokens before fine-tuning on conversational speech.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"GLM-4-Voice turns a text language model into an end-to-end spoken chatbot that reaches state-of-the-art results in speech language modeling and spoken question answering.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"14d4dc893cf5ff95587304523b4c8dbe3ace662cd1ec4076bfdcd9519283f291"},"source":{"id":"2412.02612","kind":"arxiv","version":1},"verdict":{"id":"9e2aa5b6-d391-4f5e-ac77-281600d81335","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T03:48:50.845924Z","strongest_claim":"achieving state-of-the-art performance in both speech language modeling and spoken question answering. We then fine-tune the pre-trained model with high-quality conversational speech data, achieving superior performance compared to existing baselines in both conversational ability and speech quality.","one_line_summary":"GLM-4-Voice builds an end-to-end spoken chatbot by deriving a 175bps single-codebook tokenizer from ASR, synthesizing interleaved speech-text data, and continuing pre-training of GLM-4-9B on up to 1 trillion tokens before fine-tuning on conversational speech.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The synthesized speech-text interleaved data and the ultra-low-bitrate tokenizer preserve sufficient information for nuanced vocal control and accurate spoken question answering without introducing systematic artifacts or information loss that would undermine the claimed gains.","pith_extraction_headline":"GLM-4-Voice turns a text language model into an end-to-end spoken chatbot that reaches state-of-the-art results in speech language modeling and spoken question answering."},"references":{"count":50,"sample":[{"doi":"","year":null,"title":"Funaudiollm: V oice understanding and generation foundation models for natural interaction between humans and llms","work_id":"7cd6d289-dca2-414f-99e0-809f37c065fa","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.48550/arxiv.2407.04051","year":null,"title":"Funaudiollm: V oice understanding and generation foundation models for natural interaction between humans and llms","work_id":"7cd6d289-dca2-414f-99e0-809f37c065fa","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Speecht5: Unified-modal encoder-decoder pre-training for spoken language processing","work_id":"d6d2d38d-a03a-44a0-acd7-84cdd7540abe","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Tyers, and Gregor Weber","work_id":"b2ecb06c-4d8f-461c-b4c1-e2df14d3b130","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2013,"title":"Semantic parsing on freebase from question-answer pairs","work_id":"279e4368-a78d-459d-81a5-fabb138741b9","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":50,"snapshot_sha256":"f8c5f310e103d0bc88b6e5d7391096a54a1b20f5381ac57434b4cd2cd038a600","internal_anchors":8},"formal_canon":{"evidence_count":2,"snapshot_sha256":"04db6433a248afd6506bba1ffeff15dcae198aa070a6cd9704519466a78e3adc"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2412.02612","created_at":"2026-05-17T23:38:49.177563+00:00"},{"alias_kind":"arxiv_version","alias_value":"2412.02612v1","created_at":"2026-05-17T23:38:49.177563+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2412.02612","created_at":"2026-05-17T23:38:49.177563+00:00"},{"alias_kind":"pith_short_12","alias_value":"C2GQ777BR6S5","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"C2GQ777BR6S5ZGOG","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"C2GQ777B","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":32,"internal_anchor_count":32,"sample":[{"citing_arxiv_id":"2504.08528","citing_title":"On The Landscape of Spoken Language Models: A Comprehensive Survey","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2603.17837","citing_title":"The Silent Thought: Modeling Internal Cognition in Full-Duplex Spoken Dialogue Models via Latent Reasoning","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20266","citing_title":"A Survey of Large Audio Language Models: Generalization, Trustworthiness, and Outlook","ref_index":124,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20755","citing_title":"DuplexSLA: A Full-Duplex Spoken Language Model with Synchronized Speech, Language, and Action","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21008","citing_title":"A Survey of Audio Reasoning in Multimodal Foundation Models","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2509.14804","citing_title":"Towards Building Speech Large Language Models for Multitask Understanding in Low-Resource Languages","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2502.11946","citing_title":"Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2509.23435","citing_title":"AudioRole: An Audio Dataset for Character Role-Playing in Large Language Models","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2509.22220","citing_title":"StableToken: A Noise-Robust Semantic Speech Tokenizer for Resilient SpeechLLMs","ref_index":84,"is_internal_anchor":true},{"citing_arxiv_id":"2510.09592","citing_title":"Mind-Paced Speaking: A Dual-Brain Approach to Real-Time Reasoning in Spoken Language Models","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2512.14234","citing_title":"ViBES: A Conversational Agent with Behaviorally-Intelligent 3D Virtual Body","ref_index":129,"is_internal_anchor":true},{"citing_arxiv_id":"2512.23578","citing_title":"Style Amnesia: Investigating Speaking Style Degradation and Mitigation in Multi-Turn Spoken Language Models","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2507.16632","citing_title":"Step-Audio 2 Technical Report","ref_index":78,"is_internal_anchor":true},{"citing_arxiv_id":"2603.17837","citing_title":"The Silent Thought: Modeling Internal Cognition in Full-Duplex Spoken Dialogue Models via Latent Reasoning","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22267","citing_title":"TiCo: Time-Controllable Spoken Dialogue Model","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"2604.01897","citing_title":"FastTurn: Unifying Acoustic and Streaming Semantic Cues for Low-Latency and Robust Turn Detection","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11098","citing_title":"AffectCodec: Emotion-Preserving Neural Speech Codec for Expressive Speech Modeling","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10199","citing_title":"How Should LLMs Listen While Speaking? A Study of User-Stream Routing in Full-Duplex Spoken Dialogue","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2503.01743","citing_title":"Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05927","citing_title":"Minimizing Modality Gap from the Input Side: Your Speech LLM Can Be a Prosody-Aware Text LLM","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2504.18425","citing_title":"Kimi-Audio Technical Report","ref_index":84,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21406","citing_title":"Full-Duplex Interaction in Spoken Dialogue Systems: A Comprehensive Study from the ICASSP 2026 HumDial Challenge","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00329","citing_title":"Fast Text-to-Audio Generation with One-Step Sampling via Energy-Scoring and Auxiliary Contextual Representation Distillation","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18489","citing_title":"Aligning Language Models for Lyric-to-Melody Generation with Rule-Based Musical Constraints","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2604.11594","citing_title":"HumDial-EIBench: A Human-Recorded Multi-Turn Emotional Intelligence Benchmark for Audio Language Models","ref_index":34,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE","json":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE.json","graph_json":"https://pith.science/api/pith-number/C2GQ777BR6S5ZGOGYLD3N4NGXE/graph.json","events_json":"https://pith.science/api/pith-number/C2GQ777BR6S5ZGOGYLD3N4NGXE/events.json","paper":"https://pith.science/paper/C2GQ777B"},"agent_actions":{"view_html":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE","download_json":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE.json","view_paper":"https://pith.science/paper/C2GQ777B","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2412.02612&json=true","fetch_graph":"https://pith.science/api/pith-number/C2GQ777BR6S5ZGOGYLD3N4NGXE/graph.json","fetch_events":"https://pith.science/api/pith-number/C2GQ777BR6S5ZGOGYLD3N4NGXE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE/action/storage_attestation","attest_author":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE/action/author_attestation","sign_citation":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE/action/citation_signature","submit_replication":"https://pith.science/pith/C2GQ777BR6S5ZGOGYLD3N4NGXE/action/replication_record"}},"created_at":"2026-05-17T23:38:49.177563+00:00","updated_at":"2026-05-17T23:38:49.177563+00:00"}