{"work":{"id":"9c2ba56b-5585-4f28-b751-703f31dca2d5","openalex_id":null,"doi":null,"arxiv_id":"2504.18425","raw_key":null,"title":"Kimi-Audio Technical Report","authors":null,"authors_text":"KimiTeam, Ding Ding, Zeqian Ju, Yichong Leng, Songxiang Liu, Tong Liu","year":2025,"venue":"eess.AS","abstract":"We present Kimi-Audio, an open-source audio foundation model that excels in audio understanding, generation, and conversation. We detail the practices in building Kimi-Audio, including model architecture, data curation, training recipe, inference deployment, and evaluation. Specifically, we leverage a 12.5Hz audio tokenizer, design a novel LLM-based architecture with continuous features as input and discrete tokens as output, and develop a chunk-wise streaming detokenizer based on flow matching. We curate a pre-training dataset that consists of more than 13 million hours of audio data covering a wide range of modalities including speech, sound, and music, and build a pipeline to construct high-quality and diverse post-training data. Initialized from a pre-trained LLM, Kimi-Audio is continual pre-trained on both audio and text data with several carefully designed tasks, and then fine-tuned to support a diverse of audio-related tasks. Extensive evaluation shows that Kimi-Audio achieves state-of-the-art performance on a range of audio benchmarks including speech recognition, audio understanding, audio question answering, and speech conversation. We release the codes, model checkpoints, as well as the evaluation toolkits in https://github.com/MoonshotAI/Kimi-Audio.","external_url":"https://arxiv.org/abs/2504.18425","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-21T19:15:30.958937+00:00","pith_arxiv_id":"2504.18425","created_at":"2026-05-10T00:39:48.484525+00:00","updated_at":"2026-05-21T19:15:30.958937+00:00","title_quality_ok":false,"display_title":"Kimi-Audio Technical Report","render_title":"Kimi-Audio Technical Report"},"hub":{"state":{"work_id":"9c2ba56b-5585-4f28-b751-703f31dca2d5","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":43,"external_cited_by_count":null,"distinct_field_count":7,"first_pith_cited_at":"2025-05-22T17:23:26+00:00","last_pith_cited_at":"2026-05-20T10:44:56+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-02T13:24:44.209995+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":10},{"context_role":"baseline","n":4}],"polarity_counts":[{"context_polarity":"background","n":10},{"context_polarity":"baseline","n":4}],"runs":{},"summary":{},"graph":{},"authors":[]}}