{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:AMGX6GZDTJSPKWPJDMBKO663G3","short_pith_number":"pith:AMGX6GZD","schema_version":"1.0","canonical_sha256":"030d7f1b239a64f559e91b02a77bdb36d124a223e06c8bfc72337d21cef8d2e2","source":{"kind":"arxiv","id":"2606.19209","version":2},"attestation_state":"computed","paper":{"title":"FineCombo-TTS: Collaborative and Precise Controllable Speech Synthesis Using Text Descriptions and Reference Speech","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.SD","authors_text":"Peiji Yang, Shuoyi Zhou, Yicheng Zhong, Yifan Hu, Yixuan Zhou, Zhisheng Wang, Zhiyong Wu","submitted_at":"2026-06-17T15:45:43Z","abstract_excerpt":"Controllable text-to-speech (TTS) has become a key research focus. However, methods based on either reference speech or text descriptions lack flexibility and precise control, and recent joint approaches remain loosely coupled, with speech modeling timbre and text controlling global style. We propose FineCombo-TTS, a unified framework for speech synthesis grounded in reference speech and guided by text descriptions, enabling flexible and precise control over acoustic attributes. Instead of explicit attribute disentanglement, we learn a unified acoustic representation and introduce a Conditiona"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.19209","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.SD","submitted_at":"2026-06-17T15:45:43Z","cross_cats_sorted":[],"title_canon_sha256":"beef8bb012740831eb22cf54aae04e6deebc772e6de8cb57a90e96243b18bf6b","abstract_canon_sha256":"c5a6a6868dacf90c14af7c1761b67930ab53673db3d73e9296d949b3791cf46f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:12:58.563432Z","signature_b64":"qfWioCzkK8DYvgRxGe5Be3KpssWAKcN+ljc8+SrMBPKpoA6YCDZqdsvslgMDE3my3nkndGSyN0xLKxItFlbFAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"030d7f1b239a64f559e91b02a77bdb36d124a223e06c8bfc72337d21cef8d2e2","last_reissued_at":"2026-06-19T16:12:58.563038Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:12:58.563038Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FineCombo-TTS: Collaborative and Precise Controllable Speech Synthesis Using Text Descriptions and Reference Speech","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.SD","authors_text":"Peiji Yang, Shuoyi Zhou, Yicheng Zhong, Yifan Hu, Yixuan Zhou, Zhisheng Wang, Zhiyong Wu","submitted_at":"2026-06-17T15:45:43Z","abstract_excerpt":"Controllable text-to-speech (TTS) has become a key research focus. However, methods based on either reference speech or text descriptions lack flexibility and precise control, and recent joint approaches remain loosely coupled, with speech modeling timbre and text controlling global style. We propose FineCombo-TTS, a unified framework for speech synthesis grounded in reference speech and guided by text descriptions, enabling flexible and precise control over acoustic attributes. Instead of explicit attribute disentanglement, we learn a unified acoustic representation and introduce a Conditiona"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.19209","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.19209/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.19209","created_at":"2026-06-19T16:12:58.563108+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.19209v2","created_at":"2026-06-19T16:12:58.563108+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.19209","created_at":"2026-06-19T16:12:58.563108+00:00"},{"alias_kind":"pith_short_12","alias_value":"AMGX6GZDTJSP","created_at":"2026-06-19T16:12:58.563108+00:00"},{"alias_kind":"pith_short_16","alias_value":"AMGX6GZDTJSPKWPJ","created_at":"2026-06-19T16:12:58.563108+00:00"},{"alias_kind":"pith_short_8","alias_value":"AMGX6GZD","created_at":"2026-06-19T16:12:58.563108+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3","json":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3.json","graph_json":"https://pith.science/api/pith-number/AMGX6GZDTJSPKWPJDMBKO663G3/graph.json","events_json":"https://pith.science/api/pith-number/AMGX6GZDTJSPKWPJDMBKO663G3/events.json","paper":"https://pith.science/paper/AMGX6GZD"},"agent_actions":{"view_html":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3","download_json":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3.json","view_paper":"https://pith.science/paper/AMGX6GZD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.19209&json=true","fetch_graph":"https://pith.science/api/pith-number/AMGX6GZDTJSPKWPJDMBKO663G3/graph.json","fetch_events":"https://pith.science/api/pith-number/AMGX6GZDTJSPKWPJDMBKO663G3/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3/action/storage_attestation","attest_author":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3/action/author_attestation","sign_citation":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3/action/citation_signature","submit_replication":"https://pith.science/pith/AMGX6GZDTJSPKWPJDMBKO663G3/action/replication_record"}},"created_at":"2026-06-19T16:12:58.563108+00:00","updated_at":"2026-06-19T16:12:58.563108+00:00"}