{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:MKRYBDAFZRLUQLIPUO3THPG6QV","short_pith_number":"pith:MKRYBDAF","schema_version":"1.0","canonical_sha256":"62a3808c05cc57482d0fa3b733bcde857befe644d43de006128ea978376a6cdf","source":{"kind":"arxiv","id":"1904.02882","version":1},"attestation_state":"computed","paper":{"title":"LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["eess.AS"],"primary_cat":"cs.SD","authors_text":"Heiga Zen, Rob Clark, Ron J. Weiss, Viet Dang, Ye Jia, Yonghui Wu, Yu Zhang, Zhifeng Chen","submitted_at":"2019-04-05T06:05:00Z","abstract_excerpt":"This paper introduces a new speech corpus called \"LibriTTS\" designed for text-to-speech use. It is derived from the original audio and text materials of the LibriSpeech corpus, which has been used for training and evaluating automatic speech recognition systems. The new corpus inherits desired properties of the LibriSpeech corpus while addressing a number of issues which make LibriSpeech less than ideal for text-to-speech work. The released corpus consists of 585 hours of speech data at 24kHz sampling rate from 2,456 speakers and the corresponding texts. Experimental results show that neural e"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1904.02882","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2019-04-05T06:05:00Z","cross_cats_sorted":["eess.AS"],"title_canon_sha256":"b6ead702a36a885c89b36c4a5e85dafd5d45df95e972a4b4bae747434b9764b0","abstract_canon_sha256":"ec6ab07852dcc94bf3f0f2a1907b5a60e8f15b09a331cd6714c378f68e8b2841"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:49:19.356514Z","signature_b64":"140K/Cqt1dQpMWDGYS1Yd0OR2IOffpZhhhhSDJ97wrigYUp+RhXA1UvCcDBi/5Lhf14wGITrblxXjliA/P+gCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"62a3808c05cc57482d0fa3b733bcde857befe644d43de006128ea978376a6cdf","last_reissued_at":"2026-05-17T23:49:19.355924Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:49:19.355924Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["eess.AS"],"primary_cat":"cs.SD","authors_text":"Heiga Zen, Rob Clark, Ron J. Weiss, Viet Dang, Ye Jia, Yonghui Wu, Yu Zhang, Zhifeng Chen","submitted_at":"2019-04-05T06:05:00Z","abstract_excerpt":"This paper introduces a new speech corpus called \"LibriTTS\" designed for text-to-speech use. It is derived from the original audio and text materials of the LibriSpeech corpus, which has been used for training and evaluating automatic speech recognition systems. The new corpus inherits desired properties of the LibriSpeech corpus while addressing a number of issues which make LibriSpeech less than ideal for text-to-speech work. The released corpus consists of 585 hours of speech data at 24kHz sampling rate from 2,456 speakers and the corresponding texts. Experimental results show that neural e"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1904.02882","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1904.02882","created_at":"2026-05-17T23:49:19.356010+00:00"},{"alias_kind":"arxiv_version","alias_value":"1904.02882v1","created_at":"2026-05-17T23:49:19.356010+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1904.02882","created_at":"2026-05-17T23:49:19.356010+00:00"},{"alias_kind":"pith_short_12","alias_value":"MKRYBDAFZRLU","created_at":"2026-05-18T12:33:21.387695+00:00"},{"alias_kind":"pith_short_16","alias_value":"MKRYBDAFZRLUQLIP","created_at":"2026-05-18T12:33:21.387695+00:00"},{"alias_kind":"pith_short_8","alias_value":"MKRYBDAF","created_at":"2026-05-18T12:33:21.387695+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":10,"sample":[{"citing_arxiv_id":"1907.02784","citing_title":"A Methodology for Controlling the Emotional Expressiveness in Synthetic Speech -- a Deep Learning approach","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"1907.04927","citing_title":"Speech bandwidth extension with WaveNet","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2509.03526","citing_title":"Enhancing Speech Large Language Models through Reinforced Behavior Alignment","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2512.01537","citing_title":"Two-Dimensional Quantization for Geometry-Aware Audio Coding","ref_index":79,"is_internal_anchor":true},{"citing_arxiv_id":"2603.17837","citing_title":"The Silent Thought: Modeling Internal Cognition in Full-Duplex Spoken Dialogue Models via Latent Reasoning","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16681","citing_title":"A Survey of Advancing Audio Super-Resolution and Bandwidth Extension from Discriminative to Generative Models","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2012.03411","citing_title":"MLS: A Large-Scale Multilingual Dataset for Speech Research","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2509.22220","citing_title":"StableToken: A Noise-Robust Semantic Speech Tokenizer for Resilient SpeechLLMs","ref_index":83,"is_internal_anchor":true},{"citing_arxiv_id":"2410.06885","citing_title":"F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching","ref_index":153,"is_internal_anchor":true},{"citing_arxiv_id":"2603.17837","citing_title":"The Silent Thought: Modeling Internal Cognition in Full-Duplex Spoken Dialogue Models via Latent Reasoning","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11098","citing_title":"AffectCodec: Emotion-Preserving Neural Speech Codec for Expressive Speech Modeling","ref_index":3,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05927","citing_title":"Minimizing Modality Gap from the Input Side: Your Speech LLM Can Be a Prosody-Aware Text LLM","ref_index":57,"is_internal_anchor":false},{"citing_arxiv_id":"2504.18425","citing_title":"Kimi-Audio Technical Report","ref_index":83,"is_internal_anchor":false},{"citing_arxiv_id":"2605.04505","citing_title":"JASTIN: Aligning LLMs for Zero-Shot Audio and Speech Evaluation via Natural Language Instructions","ref_index":40,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06765","citing_title":"VITA-QinYu: Expressive Spoken Language Model for Role-Playing and Singing","ref_index":117,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05927","citing_title":"Minimizing Modality Gap from the Input Side: Your Speech LLM Can Be a Prosody-Aware Text LLM","ref_index":57,"is_internal_anchor":false},{"citing_arxiv_id":"2604.19330","citing_title":"Text-To-Speech with Chain-of-Details: modeling temporal dynamics in speech generation","ref_index":44,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV","json":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV.json","graph_json":"https://pith.science/api/pith-number/MKRYBDAFZRLUQLIPUO3THPG6QV/graph.json","events_json":"https://pith.science/api/pith-number/MKRYBDAFZRLUQLIPUO3THPG6QV/events.json","paper":"https://pith.science/paper/MKRYBDAF"},"agent_actions":{"view_html":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV","download_json":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV.json","view_paper":"https://pith.science/paper/MKRYBDAF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1904.02882&json=true","fetch_graph":"https://pith.science/api/pith-number/MKRYBDAFZRLUQLIPUO3THPG6QV/graph.json","fetch_events":"https://pith.science/api/pith-number/MKRYBDAFZRLUQLIPUO3THPG6QV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV/action/storage_attestation","attest_author":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV/action/author_attestation","sign_citation":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV/action/citation_signature","submit_replication":"https://pith.science/pith/MKRYBDAFZRLUQLIPUO3THPG6QV/action/replication_record"}},"created_at":"2026-05-17T23:49:19.356010+00:00","updated_at":"2026-05-17T23:49:19.356010+00:00"}