{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:HCXY62YPHCZQYYPT3MZO5JI6SM","short_pith_number":"pith:HCXY62YP","canonical_record":{"source":{"id":"2407.05407","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2024-07-07T15:16:19Z","cross_cats_sorted":["cs.AI","eess.AS"],"title_canon_sha256":"c94cc54b5b0e4e4aacad550fe4d4f44213a7a0bed116877c57b077199304a3b1","abstract_canon_sha256":"c81bb0c9503acba49fc092e8e606bd94db76a2fc79ae9ff5f31ad8ba58b9d6ec"},"schema_version":"1.0"},"canonical_sha256":"38af8f6b0f38b30c61f3db32eea51e9318757ac25b5f5667b854c69832896505","source":{"kind":"arxiv","id":"2407.05407","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2407.05407","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2407.05407v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.05407","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"HCXY62YPHCZQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"HCXY62YPHCZQYYPT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"HCXY62YP","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:HCXY62YPHCZQYYPT3MZO5JI6SM","target":"record","payload":{"canonical_record":{"source":{"id":"2407.05407","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2024-07-07T15:16:19Z","cross_cats_sorted":["cs.AI","eess.AS"],"title_canon_sha256":"c94cc54b5b0e4e4aacad550fe4d4f44213a7a0bed116877c57b077199304a3b1","abstract_canon_sha256":"c81bb0c9503acba49fc092e8e606bd94db76a2fc79ae9ff5f31ad8ba58b9d6ec"},"schema_version":"1.0"},"canonical_sha256":"38af8f6b0f38b30c61f3db32eea51e9318757ac25b5f5667b854c69832896505","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.817690Z","signature_b64":"kHNmVjvinEXOGUi9nklGKH+M09Yrk1UwGcVBDXBw1U1BuX1+2tVaOxh8BBFdkz3fqWalsC4jooFNEm0ArdXYCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"38af8f6b0f38b30c61f3db32eea51e9318757ac25b5f5667b854c69832896505","last_reissued_at":"2026-05-17T23:38:50.817252Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.817252Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2407.05407","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"TUpVdwY4PzdjnUDstUWdMKUKgINpxqRQS5dgKyb4OHZdLu+4Goz1W8iU0C825DR/BUDht1ftplM6BYHCIr76AA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T17:00:07.036086Z"},"content_sha256":"312821a2951c49807a8facbe518d1033f9561c9713eab614c2e0d7979942a156","schema_version":"1.0","event_id":"sha256:312821a2951c49807a8facbe518d1033f9561c9713eab614c2e0d7979942a156"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:HCXY62YPHCZQYYPT3MZO5JI6SM","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Supervised semantic tokens from a multilingual ASR model enable more consistent and similar zero-shot voice cloning than unsupervised tokens in CosyVoice.","cross_cats":["cs.AI","eess.AS"],"primary_cat":"cs.SD","authors_text":"Hangrui Hu, Heng Lu, Kai Hu, Qian Chen, Shiliang Zhang, Siqi Zheng, Yexin Yang, Yue Gu, Zhifu Gao, Zhihao Du, Zhijie Yan, Ziyang Ma","submitted_at":"2024-07-07T15:16:19Z","abstract_excerpt":"Recent years have witnessed a trend that large language model (LLM) based text-to-speech (TTS) emerges into the mainstream due to their high naturalness and zero-shot capacity. In this paradigm, speech signals are discretized into token sequences, which are modeled by an LLM with text as prompts and reconstructed by a token-based vocoder to waveforms. Obviously, speech tokens play a critical role in LLM-based TTS models. Current speech tokens are learned in an unsupervised manner, which lacks explicit semantic information and alignment to the text. In this paper, we propose to represent speech"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"supervised semantic tokens significantly outperform existing unsupervised tokens in terms of content consistency and speaker similarity for zero-shot voice cloning","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"Inserting vector quantization into the multilingual ASR encoder produces tokens that retain sufficient semantic, acoustic, and prosodic information for high-quality reconstruction by the conditional flow matching model without major loss.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Supervised semantic tokens from ASR enable CosyVoice to outperform unsupervised tokens in zero-shot multilingual TTS via LLM text-to-token and flow-matching token-to-speech synthesis.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Supervised semantic tokens from a multilingual ASR model enable more consistent and similar zero-shot voice cloning than unsupervised tokens in CosyVoice.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"be57b4bfd5676f70ba50f6d5b9bc37cc2c707b76ac0679f8b8e33c33a123f5f3"},"source":{"id":"2407.05407","kind":"arxiv","version":2},"verdict":{"id":"07249c92-9077-441a-a1a9-2e2784331e63","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T16:56:19.725726Z","strongest_claim":"supervised semantic tokens significantly outperform existing unsupervised tokens in terms of content consistency and speaker similarity for zero-shot voice cloning","one_line_summary":"Supervised semantic tokens from ASR enable CosyVoice to outperform unsupervised tokens in zero-shot multilingual TTS via LLM text-to-token and flow-matching token-to-speech synthesis.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"Inserting vector quantization into the multilingual ASR encoder produces tokens that retain sufficient semantic, acoustic, and prosodic information for high-quality reconstruction by the conditional flow matching model without major loss.","pith_extraction_headline":"Supervised semantic tokens from a multilingual ASR model enable more consistent and similar zero-shot voice cloning than unsupervised tokens in CosyVoice."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"fb6e657ac52cd7a443d2b7d0739a6b895e6e1a46f6b205257b59eb5dcad06585"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"07249c92-9077-441a-a1a9-2e2784331e63"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kUX2MfMzsvKb2STKcf1gmSZu2oui9yX9Cnos7nKaWjof35XbpMwRrucQWKXi4Rk2NN3kL3GYgt8GtNGSoxYPCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T17:00:07.036904Z"},"content_sha256":"03ae9787380776e9e9dc09feda701d808ff454e8ac4ca0d20db3b4fea9b4e691","schema_version":"1.0","event_id":"sha256:03ae9787380776e9e9dc09feda701d808ff454e8ac4ca0d20db3b4fea9b4e691"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/HCXY62YPHCZQYYPT3MZO5JI6SM/bundle.json","state_url":"https://pith.science/pith/HCXY62YPHCZQYYPT3MZO5JI6SM/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/HCXY62YPHCZQYYPT3MZO5JI6SM/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T17:00:07Z","links":{"resolver":"https://pith.science/pith/HCXY62YPHCZQYYPT3MZO5JI6SM","bundle":"https://pith.science/pith/HCXY62YPHCZQYYPT3MZO5JI6SM/bundle.json","state":"https://pith.science/pith/HCXY62YPHCZQYYPT3MZO5JI6SM/state.json","well_known_bundle":"https://pith.science/.well-known/pith/HCXY62YPHCZQYYPT3MZO5JI6SM/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:HCXY62YPHCZQYYPT3MZO5JI6SM","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"c81bb0c9503acba49fc092e8e606bd94db76a2fc79ae9ff5f31ad8ba58b9d6ec","cross_cats_sorted":["cs.AI","eess.AS"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2024-07-07T15:16:19Z","title_canon_sha256":"c94cc54b5b0e4e4aacad550fe4d4f44213a7a0bed116877c57b077199304a3b1"},"schema_version":"1.0","source":{"id":"2407.05407","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2407.05407","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2407.05407v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2407.05407","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"HCXY62YPHCZQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"HCXY62YPHCZQYYPT","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"HCXY62YP","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:03ae9787380776e9e9dc09feda701d808ff454e8ac4ca0d20db3b4fea9b4e691","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"supervised semantic tokens significantly outperform existing unsupervised tokens in terms of content consistency and speaker similarity for zero-shot voice cloning"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"Inserting vector quantization into the multilingual ASR encoder produces tokens that retain sufficient semantic, acoustic, and prosodic information for high-quality reconstruction by the conditional flow matching model without major loss."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Supervised semantic tokens from ASR enable CosyVoice to outperform unsupervised tokens in zero-shot multilingual TTS via LLM text-to-token and flow-matching token-to-speech synthesis."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Supervised semantic tokens from a multilingual ASR model enable more consistent and similar zero-shot voice cloning than unsupervised tokens in CosyVoice."}],"snapshot_sha256":"be57b4bfd5676f70ba50f6d5b9bc37cc2c707b76ac0679f8b8e33c33a123f5f3"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"fb6e657ac52cd7a443d2b7d0739a6b895e6e1a46f6b205257b59eb5dcad06585"},"paper":{"abstract_excerpt":"Recent years have witnessed a trend that large language model (LLM) based text-to-speech (TTS) emerges into the mainstream due to their high naturalness and zero-shot capacity. In this paradigm, speech signals are discretized into token sequences, which are modeled by an LLM with text as prompts and reconstructed by a token-based vocoder to waveforms. Obviously, speech tokens play a critical role in LLM-based TTS models. Current speech tokens are learned in an unsupervised manner, which lacks explicit semantic information and alignment to the text. In this paper, we propose to represent speech","authors_text":"Hangrui Hu, Heng Lu, Kai Hu, Qian Chen, Shiliang Zhang, Siqi Zheng, Yexin Yang, Yue Gu, Zhifu Gao, Zhihao Du, Zhijie Yan, Ziyang Ma","cross_cats":["cs.AI","eess.AS"],"headline":"Supervised semantic tokens from a multilingual ASR model enable more consistent and similar zero-shot voice cloning than unsupervised tokens in CosyVoice.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2024-07-07T15:16:19Z","title":"CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2407.05407","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T16:56:19.725726Z","id":"07249c92-9077-441a-a1a9-2e2784331e63","model_set":{"reader":"grok-4.3"},"one_line_summary":"Supervised semantic tokens from ASR enable CosyVoice to outperform unsupervised tokens in zero-shot multilingual TTS via LLM text-to-token and flow-matching token-to-speech synthesis.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Supervised semantic tokens from a multilingual ASR model enable more consistent and similar zero-shot voice cloning than unsupervised tokens in CosyVoice.","strongest_claim":"supervised semantic tokens significantly outperform existing unsupervised tokens in terms of content consistency and speaker similarity for zero-shot voice cloning","weakest_assumption":"Inserting vector quantization into the multilingual ASR encoder produces tokens that retain sufficient semantic, acoustic, and prosodic information for high-quality reconstruction by the conditional flow matching model without major loss."}},"verdict_id":"07249c92-9077-441a-a1a9-2e2784331e63"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:312821a2951c49807a8facbe518d1033f9561c9713eab614c2e0d7979942a156","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"c81bb0c9503acba49fc092e8e606bd94db76a2fc79ae9ff5f31ad8ba58b9d6ec","cross_cats_sorted":["cs.AI","eess.AS"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2024-07-07T15:16:19Z","title_canon_sha256":"c94cc54b5b0e4e4aacad550fe4d4f44213a7a0bed116877c57b077199304a3b1"},"schema_version":"1.0","source":{"id":"2407.05407","kind":"arxiv","version":2}},"canonical_sha256":"38af8f6b0f38b30c61f3db32eea51e9318757ac25b5f5667b854c69832896505","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"38af8f6b0f38b30c61f3db32eea51e9318757ac25b5f5667b854c69832896505","first_computed_at":"2026-05-17T23:38:50.817252Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.817252Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"kHNmVjvinEXOGUi9nklGKH+M09Yrk1UwGcVBDXBw1U1BuX1+2tVaOxh8BBFdkz3fqWalsC4jooFNEm0ArdXYCw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.817690Z","signed_message":"canonical_sha256_bytes"},"source_id":"2407.05407","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:312821a2951c49807a8facbe518d1033f9561c9713eab614c2e0d7979942a156","sha256:03ae9787380776e9e9dc09feda701d808ff454e8ac4ca0d20db3b4fea9b4e691"],"state_sha256":"63e2ebdc0c1ac3f6e763ce45956d48286f8292a060f854103ffb01494a0d4500"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"XfRS7Rk1lhOFv/Ka3Q6b9hoejFP2Re5OrnlKLafMBHF85XIF5NCVdZQVmcSm4pZb0ft/nZTjiwLLYd8VzatiAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T17:00:07.039944Z","bundle_sha256":"ddeabc44bd15e849abfbc42e3dbe0351725f9c7846852a564bbe21d5644a52f9"}}