{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ISEWEFLJIMSJFMU5YTKX45NBXN","short_pith_number":"pith:ISEWEFLJ","schema_version":"1.0","canonical_sha256":"4489621569432492b29dc4d57e75a1bb4162297b668b3c1bfa21f1d3424aee75","source":{"kind":"arxiv","id":"2603.04219","version":2},"attestation_state":"computed","paper":{"title":"ZeSTA: Zero-Shot TTS Augmentation with Domain-Conditioned Training for Data-Efficient Personalized Speech Synthesis","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","eess.AS"],"primary_cat":"cs.SD","authors_text":"Hwayeon Kim, Hyeonyu Kim, Jinwoo Oh, Youngwon Choi","submitted_at":"2026-03-04T16:04:02Z","abstract_excerpt":"We investigate the use of zero-shot text-to-speech (ZS-TTS) as a data augmentation source for low-resource personalized speech synthesis. While synthetic augmentation can provide linguistically rich and phonetically diverse speech, naively mixing large amounts of synthetic speech with limited real recordings often leads to speaker similarity degradation during fine-tuning. To address this issue, we propose ZeSTA, a simple domain-conditioned training framework that distinguishes real and synthetic speech via a lightweight domain embedding, combined with real-data oversampling to stabilize adapt"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2603.04219","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2026-03-04T16:04:02Z","cross_cats_sorted":["cs.AI","eess.AS"],"title_canon_sha256":"1cb89d48b0569772f76b5ab9b75d68168c75fd788d38ff896a3e5e34a5547ad1","abstract_canon_sha256":"5262b33deb9a081058c72a374ff2c7fe8ea7f285202c47238446764d9e5f1952"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:12:52.076573Z","signature_b64":"kHDN+JaYJ20oBUH06MHv6VuIiyavCgjflr4MuyWxuPeUZhgedTfr7guE7YIpyUdIxKYxstCjCtz486a/AmVWDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4489621569432492b29dc4d57e75a1bb4162297b668b3c1bfa21f1d3424aee75","last_reissued_at":"2026-06-19T16:12:52.076093Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:12:52.076093Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"ZeSTA: Zero-Shot TTS Augmentation with Domain-Conditioned Training for Data-Efficient Personalized Speech Synthesis","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","eess.AS"],"primary_cat":"cs.SD","authors_text":"Hwayeon Kim, Hyeonyu Kim, Jinwoo Oh, Youngwon Choi","submitted_at":"2026-03-04T16:04:02Z","abstract_excerpt":"We investigate the use of zero-shot text-to-speech (ZS-TTS) as a data augmentation source for low-resource personalized speech synthesis. While synthetic augmentation can provide linguistically rich and phonetically diverse speech, naively mixing large amounts of synthetic speech with limited real recordings often leads to speaker similarity degradation during fine-tuning. To address this issue, we propose ZeSTA, a simple domain-conditioned training framework that distinguishes real and synthetic speech via a lightweight domain embedding, combined with real-data oversampling to stabilize adapt"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.04219","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.04219/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2603.04219","created_at":"2026-06-19T16:12:52.076157+00:00"},{"alias_kind":"arxiv_version","alias_value":"2603.04219v2","created_at":"2026-06-19T16:12:52.076157+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.04219","created_at":"2026-06-19T16:12:52.076157+00:00"},{"alias_kind":"pith_short_12","alias_value":"ISEWEFLJIMSJ","created_at":"2026-06-19T16:12:52.076157+00:00"},{"alias_kind":"pith_short_16","alias_value":"ISEWEFLJIMSJFMU5","created_at":"2026-06-19T16:12:52.076157+00:00"},{"alias_kind":"pith_short_8","alias_value":"ISEWEFLJ","created_at":"2026-06-19T16:12:52.076157+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN","json":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN.json","graph_json":"https://pith.science/api/pith-number/ISEWEFLJIMSJFMU5YTKX45NBXN/graph.json","events_json":"https://pith.science/api/pith-number/ISEWEFLJIMSJFMU5YTKX45NBXN/events.json","paper":"https://pith.science/paper/ISEWEFLJ"},"agent_actions":{"view_html":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN","download_json":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN.json","view_paper":"https://pith.science/paper/ISEWEFLJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2603.04219&json=true","fetch_graph":"https://pith.science/api/pith-number/ISEWEFLJIMSJFMU5YTKX45NBXN/graph.json","fetch_events":"https://pith.science/api/pith-number/ISEWEFLJIMSJFMU5YTKX45NBXN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN/action/storage_attestation","attest_author":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN/action/author_attestation","sign_citation":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN/action/citation_signature","submit_replication":"https://pith.science/pith/ISEWEFLJIMSJFMU5YTKX45NBXN/action/replication_record"}},"created_at":"2026-06-19T16:12:52.076157+00:00","updated_at":"2026-06-19T16:12:52.076157+00:00"}