{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:KRV4RSGW5II2YYXPV5K4XPMI3P","short_pith_number":"pith:KRV4RSGW","schema_version":"1.0","canonical_sha256":"546bc8c8d6ea11ac62efaf55cbbd88dbd5ebecb0bab8aad066ae27f6cba74ccd","source":{"kind":"arxiv","id":"1703.10135","version":2},"attestation_state":"computed","paper":{"title":"Tacotron: Towards End-to-End Speech Synthesis","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","cs.SD"],"primary_cat":"cs.CL","authors_text":"Daisy Stanton, Navdeep Jaitly, Quoc Le, Rif A. Saurous, RJ Skerry-Ryan, Rob Clark, Ron J. Weiss, Samy Bengio, Yannis Agiomyrgiannakis, Ying Xiao, Yonghui Wu, Yuxuan Wang, Zhifeng Chen, Zongheng Yang","submitted_at":"2017-03-29T16:55:13Z","abstract_excerpt":"A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module. Building these components often requires extensive domain expertise and may contain brittle design choices. In this paper, we present Tacotron, an end-to-end generative text-to-speech model that synthesizes speech directly from characters. Given <text, audio> pairs, the model can be trained completely from scratch with random initialization. We present several key techniques to make the sequence-to-sequence framework perform well for this c"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1703.10135","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2017-03-29T16:55:13Z","cross_cats_sorted":["cs.LG","cs.SD"],"title_canon_sha256":"1e88c28ca4a58545fccea024907587e8d3ed14a7ed91a4c3206b946e21690875","abstract_canon_sha256":"f93c43ec9b45696b8367c4fbd15bbc4e2be42b0cc0ffb8b4eba22c932247d738"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:46:51.196935Z","signature_b64":"hd7nzDQTwri0GMzY+hT0s4Vu/21O+2xqWVrCj0RUNWKi8n29pqB44P3l/VbHw9aUFaxtThW9Ur+GNwX/THx/Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"546bc8c8d6ea11ac62efaf55cbbd88dbd5ebecb0bab8aad066ae27f6cba74ccd","last_reissued_at":"2026-05-18T00:46:51.196230Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:46:51.196230Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Tacotron: Towards End-to-End Speech Synthesis","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","cs.SD"],"primary_cat":"cs.CL","authors_text":"Daisy Stanton, Navdeep Jaitly, Quoc Le, Rif A. Saurous, RJ Skerry-Ryan, Rob Clark, Ron J. Weiss, Samy Bengio, Yannis Agiomyrgiannakis, Ying Xiao, Yonghui Wu, Yuxuan Wang, Zhifeng Chen, Zongheng Yang","submitted_at":"2017-03-29T16:55:13Z","abstract_excerpt":"A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module. Building these components often requires extensive domain expertise and may contain brittle design choices. In this paper, we present Tacotron, an end-to-end generative text-to-speech model that synthesizes speech directly from characters. Given <text, audio> pairs, the model can be trained completely from scratch with random initialization. We present several key techniques to make the sequence-to-sequence framework perform well for this c"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1703.10135","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1703.10135","created_at":"2026-05-18T00:46:51.196348+00:00"},{"alias_kind":"arxiv_version","alias_value":"1703.10135v2","created_at":"2026-05-18T00:46:51.196348+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1703.10135","created_at":"2026-05-18T00:46:51.196348+00:00"},{"alias_kind":"pith_short_12","alias_value":"KRV4RSGW5II2","created_at":"2026-05-18T12:31:28.150371+00:00"},{"alias_kind":"pith_short_16","alias_value":"KRV4RSGW5II2YYXP","created_at":"2026-05-18T12:31:28.150371+00:00"},{"alias_kind":"pith_short_8","alias_value":"KRV4RSGW","created_at":"2026-05-18T12:31:28.150371+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":14,"internal_anchor_count":11,"sample":[{"citing_arxiv_id":"1906.08977","citing_title":"Singing Voice Synthesis Using Deep Autoregressive Neural Networks for Acoustic Modeling","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"1906.11645","citing_title":"RUSLAN: Russian Spoken Language Corpus for Speech Synthesis","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"1907.01372","citing_title":"Improving Performance of End-to-End ASR on Numeric Sequences","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"1907.07769","citing_title":"Hierarchical Sequence to Sequence Voice Conversion with Limited Data","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"1907.08294","citing_title":"DNN-based Speaker Embedding Using Subjective Inter-speaker Similarity for Multi-speaker Modeling in Speech Synthesis","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2504.14776","citing_title":"Script2Screen: Supporting Dialogue Scriptwriting with Interactive Audiovisual Generation","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2505.16819","citing_title":"Character-Centered Dialogue Generation from Scene-Level Prompts","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2506.23552","citing_title":"JAM-Flow: Joint Audio-Motion Synthesis with Flow Matching","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17152","citing_title":"Multilingual and Multimodal LLMs in the Wild: Building for Low-Resource Languages","ref_index":238,"is_internal_anchor":true},{"citing_arxiv_id":"2512.10931","citing_title":"Asynchronous Reasoning: Training-Free Interactive Thinking LLMs","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2507.16632","citing_title":"Step-Audio 2 Technical Report","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06327","citing_title":"A Novel Automatic Framework for Speaker Drift Detection in Synthesized Speech","ref_index":6,"is_internal_anchor":false},{"citing_arxiv_id":"2605.00861","citing_title":"Voice Mapping of Text-to-Speech Systems: A Metric-Based Approach for Voice Quality Assessment","ref_index":48,"is_internal_anchor":false},{"citing_arxiv_id":"2604.19330","citing_title":"Text-To-Speech with Chain-of-Details: modeling temporal dynamics in speech generation","ref_index":10,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P","json":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P.json","graph_json":"https://pith.science/api/pith-number/KRV4RSGW5II2YYXPV5K4XPMI3P/graph.json","events_json":"https://pith.science/api/pith-number/KRV4RSGW5II2YYXPV5K4XPMI3P/events.json","paper":"https://pith.science/paper/KRV4RSGW"},"agent_actions":{"view_html":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P","download_json":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P.json","view_paper":"https://pith.science/paper/KRV4RSGW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1703.10135&json=true","fetch_graph":"https://pith.science/api/pith-number/KRV4RSGW5II2YYXPV5K4XPMI3P/graph.json","fetch_events":"https://pith.science/api/pith-number/KRV4RSGW5II2YYXPV5K4XPMI3P/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P/action/storage_attestation","attest_author":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P/action/author_attestation","sign_citation":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P/action/citation_signature","submit_replication":"https://pith.science/pith/KRV4RSGW5II2YYXPV5K4XPMI3P/action/replication_record"}},"created_at":"2026-05-18T00:46:51.196348+00:00","updated_at":"2026-05-18T00:46:51.196348+00:00"}