{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2020:J5I65L4WPTBH42L6YKTNHPGHTJ","short_pith_number":"pith:J5I65L4W","schema_version":"1.0","canonical_sha256":"4f51eeaf967cc27e697ec2a6d3bcc79a421a11ec618685ddd3eb47e9390ebbe1","source":{"kind":"arxiv","id":"2012.03411","version":2},"attestation_state":"computed","paper":{"title":"MLS: A Large-Scale Multilingual Dataset for Speech Research","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.SD"],"primary_cat":"eess.AS","authors_text":"Anuroop Sriram, Gabriel Synnaeve, Qiantong Xu, Ronan Collobert, Vineel Pratap","submitted_at":"2020-12-07T01:53:45Z","abstract_excerpt":"This paper introduces Multilingual LibriSpeech (MLS) dataset, a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages, including about 44.5K hours of English and a total of about 6K hours for other languages. Additionally, we provide Language Models (LM) and baseline Automatic Speech Recognition (ASR) models and for all the languages in our dataset. We believe such a large transcribed dataset will open new avenues in ASR and Text-To-Speech (TTS) research. The dataset will be made freely available for anyone"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2012.03411","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"eess.AS","submitted_at":"2020-12-07T01:53:45Z","cross_cats_sorted":["cs.CL","cs.SD"],"title_canon_sha256":"95be499156a7f3b74d8fb945acb50a5ccfdfd1902616855fd890b4675515e396","abstract_canon_sha256":"223f3ee245ddf88d2202bf455454dbcdda1730d0b95cd05f6377c9c985df3073"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T14:49:54.055857Z","signature_b64":"5rqaFaiMy+ROBx2MLYF3H4k9opCaGX28sjNhxpopiKVRfRjRXkhGp01WVfB25vmt4Ouydq1RdETrfd7Lomu9Dg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4f51eeaf967cc27e697ec2a6d3bcc79a421a11ec618685ddd3eb47e9390ebbe1","last_reissued_at":"2026-05-18T14:49:54.053033Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T14:49:54.053033Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MLS: A Large-Scale Multilingual Dataset for Speech Research","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.SD"],"primary_cat":"eess.AS","authors_text":"Anuroop Sriram, Gabriel Synnaeve, Qiantong Xu, Ronan Collobert, Vineel Pratap","submitted_at":"2020-12-07T01:53:45Z","abstract_excerpt":"This paper introduces Multilingual LibriSpeech (MLS) dataset, a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages, including about 44.5K hours of English and a total of about 6K hours for other languages. Additionally, we provide Language Models (LM) and baseline Automatic Speech Recognition (ASR) models and for all the languages in our dataset. We believe such a large transcribed dataset will open new avenues in ASR and Text-To-Speech (TTS) research. The dataset will be made freely available for anyone"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2012.03411","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2012.03411","created_at":"2026-05-18T14:49:54.053158+00:00"},{"alias_kind":"arxiv_version","alias_value":"2012.03411v2","created_at":"2026-05-18T14:49:54.053158+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2012.03411","created_at":"2026-05-18T14:49:54.053158+00:00"},{"alias_kind":"pith_short_12","alias_value":"J5I65L4WPTBH","created_at":"2026-05-18T14:49:54.053158+00:00"},{"alias_kind":"pith_short_16","alias_value":"J5I65L4WPTBH42L6","created_at":"2026-05-18T14:49:54.053158+00:00"},{"alias_kind":"pith_short_8","alias_value":"J5I65L4W","created_at":"2026-05-18T14:49:54.053158+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":20,"internal_anchor_count":20,"sample":[{"citing_arxiv_id":"2312.11805","citing_title":"Gemini: A Family of Highly Capable Multimodal Models","ref_index":76,"is_internal_anchor":true},{"citing_arxiv_id":"2502.12672","citing_title":"Speech-FT: Merging Pre-trained And Fine-Tuned Speech Representation Models For Cross-Task Generalization","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2512.01537","citing_title":"Two-Dimensional Quantization for Geometry-Aware Audio Coding","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2509.22220","citing_title":"StableToken: A Noise-Robust Semantic Speech Tokenizer for Resilient SpeechLLMs","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"2510.03093","citing_title":"Revisiting Direct Speech-to-Text Translation with Speech LLMs: Better Scaling than CoT Prompting?","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2306.12925","citing_title":"AudioPaLM: A Large Language Model That Can Speak and Listen","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2603.02641","citing_title":"Rethinking Training Targets, Architectures and Data Quality for Universal Speech Enhancement","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2507.08128","citing_title":"Audio Flamingo 3: Advancing Audio Intelligence with Fully Open Large Audio Language Models","ref_index":95,"is_internal_anchor":true},{"citing_arxiv_id":"2604.01897","citing_title":"FastTurn: Unifying Acoustic and Streaming Semantic Cues for Low-Latency and Robust Turn Detection","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08186","citing_title":"Rethinking Entropy Minimization in Test-Time Adaptation for Autoregressive Models","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2504.18425","citing_title":"Kimi-Audio Technical Report","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01638","citing_title":"Omni-Fake: Benchmarking Unified Multimodal Social Media Deepfake Detection","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18105","citing_title":"NIM4-ASR: Towards Efficient, Robust, and Customizable Real-Time LLM-Based ASR","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09472","citing_title":"Data Selection Effects on Self-Supervised Learning of Audio Representations for French Audiovisual Broadcasts","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2604.11283","citing_title":"Empowering Video Translation using Multimodal Large Language Models","ref_index":201,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08003","citing_title":"Rethinking Entropy Allocation in LLM-based ASR: Understanding the Dynamics between Speech Encoders and LLMs","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06765","citing_title":"VITA-QinYu: Expressive Spoken Language Model for Role-Playing and Singing","ref_index":122,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06694","citing_title":"AudioKV: KV Cache Eviction in Efficient Large Audio Language Models","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22817","citing_title":"In-Sync: Adaptation of Speech Aware Large Language Models for ASR with Word Level Timestamp Predictions","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19330","citing_title":"Text-To-Speech with Chain-of-Details: modeling temporal dynamics in speech generation","ref_index":45,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ","json":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ.json","graph_json":"https://pith.science/api/pith-number/J5I65L4WPTBH42L6YKTNHPGHTJ/graph.json","events_json":"https://pith.science/api/pith-number/J5I65L4WPTBH42L6YKTNHPGHTJ/events.json","paper":"https://pith.science/paper/J5I65L4W"},"agent_actions":{"view_html":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ","download_json":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ.json","view_paper":"https://pith.science/paper/J5I65L4W","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2012.03411&json=true","fetch_graph":"https://pith.science/api/pith-number/J5I65L4WPTBH42L6YKTNHPGHTJ/graph.json","fetch_events":"https://pith.science/api/pith-number/J5I65L4WPTBH42L6YKTNHPGHTJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ/action/storage_attestation","attest_author":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ/action/author_attestation","sign_citation":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ/action/citation_signature","submit_replication":"https://pith.science/pith/J5I65L4WPTBH42L6YKTNHPGHTJ/action/replication_record"}},"created_at":"2026-05-18T14:49:54.053158+00:00","updated_at":"2026-05-18T14:49:54.053158+00:00"}