{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:C3XAY7QLTMYLXV35V3ARFB7POJ","short_pith_number":"pith:C3XAY7QL","schema_version":"1.0","canonical_sha256":"16ee0c7e0b9b30bbd77daec11287ef72523a7a9b2533c82bfbb9e2138f46d75f","source":{"kind":"arxiv","id":"2606.03455","version":1},"attestation_state":"computed","paper":{"title":"WavTTS: Towards High-Quality Zero-Shot TTS via Direct Raw Waveform Modeling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.SD"],"primary_cat":"eess.AS","authors_text":"Dongya Jia, Guanrou Yang, Kai Yu, Ruiqi Yan, Sanyuan Chen, Wenxi Chen, Xie Chen, Xiquan Li, Yue Wang, Yushen Chen, Yuzhe Liang, Zhikang Niu, Zhuo Chen, Ziyang Ma","submitted_at":"2026-06-02T10:33:20Z","abstract_excerpt":"Recently, diffusion models operating on VAE latents or mel-spectrograms have become the dominant paradigm for zero-shot TTS. Although these compressed representations improve generation efficiency, they inevitably suffer from information loss and non-end-to-end training. Theoretically, directly modeling raw waveforms circumvents these issues; however, this direction remains underexplored and is often deemed difficult due to the extremely long sequence length of audio signals. To overcome this, we propose WavTTS, the first raw waveform generative TTS model that substantially narrows the gap wit"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.03455","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"eess.AS","submitted_at":"2026-06-02T10:33:20Z","cross_cats_sorted":["cs.SD"],"title_canon_sha256":"180095b132196c8b71f9d6a8fd25f914165ea1b13ba0de75841414d74089253d","abstract_canon_sha256":"f0538302cf378b2fbabd93cb65b0d8110be072110a7f9dd632b8585577f72431"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-03T01:05:58.365237Z","signature_b64":"xevR1Qa/QdfhUI7usDc+duoDkUjRsDZ0mZaT0aAfDakL6Z4IyCLwor7mhq+L/yo9dw/Xm2S76S5EuLGprH1DBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"16ee0c7e0b9b30bbd77daec11287ef72523a7a9b2533c82bfbb9e2138f46d75f","last_reissued_at":"2026-06-03T01:05:58.364877Z","signature_status":"signed_v1","first_computed_at":"2026-06-03T01:05:58.364877Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"WavTTS: Towards High-Quality Zero-Shot TTS via Direct Raw Waveform Modeling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.SD"],"primary_cat":"eess.AS","authors_text":"Dongya Jia, Guanrou Yang, Kai Yu, Ruiqi Yan, Sanyuan Chen, Wenxi Chen, Xie Chen, Xiquan Li, Yue Wang, Yushen Chen, Yuzhe Liang, Zhikang Niu, Zhuo Chen, Ziyang Ma","submitted_at":"2026-06-02T10:33:20Z","abstract_excerpt":"Recently, diffusion models operating on VAE latents or mel-spectrograms have become the dominant paradigm for zero-shot TTS. Although these compressed representations improve generation efficiency, they inevitably suffer from information loss and non-end-to-end training. Theoretically, directly modeling raw waveforms circumvents these issues; however, this direction remains underexplored and is often deemed difficult due to the extremely long sequence length of audio signals. To overcome this, we propose WavTTS, the first raw waveform generative TTS model that substantially narrows the gap wit"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.03455","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.03455/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.03455","created_at":"2026-06-03T01:05:58.364933+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.03455v1","created_at":"2026-06-03T01:05:58.364933+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.03455","created_at":"2026-06-03T01:05:58.364933+00:00"},{"alias_kind":"pith_short_12","alias_value":"C3XAY7QLTMYL","created_at":"2026-06-03T01:05:58.364933+00:00"},{"alias_kind":"pith_short_16","alias_value":"C3XAY7QLTMYLXV35","created_at":"2026-06-03T01:05:58.364933+00:00"},{"alias_kind":"pith_short_8","alias_value":"C3XAY7QL","created_at":"2026-06-03T01:05:58.364933+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ","json":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ.json","graph_json":"https://pith.science/api/pith-number/C3XAY7QLTMYLXV35V3ARFB7POJ/graph.json","events_json":"https://pith.science/api/pith-number/C3XAY7QLTMYLXV35V3ARFB7POJ/events.json","paper":"https://pith.science/paper/C3XAY7QL"},"agent_actions":{"view_html":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ","download_json":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ.json","view_paper":"https://pith.science/paper/C3XAY7QL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.03455&json=true","fetch_graph":"https://pith.science/api/pith-number/C3XAY7QLTMYLXV35V3ARFB7POJ/graph.json","fetch_events":"https://pith.science/api/pith-number/C3XAY7QLTMYLXV35V3ARFB7POJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ/action/storage_attestation","attest_author":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ/action/author_attestation","sign_citation":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ/action/citation_signature","submit_replication":"https://pith.science/pith/C3XAY7QLTMYLXV35V3ARFB7POJ/action/replication_record"}},"created_at":"2026-06-03T01:05:58.364933+00:00","updated_at":"2026-06-03T01:05:58.364933+00:00"}