{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:AHYFUV23QAWK4W7IQGYO476PJP","short_pith_number":"pith:AHYFUV23","schema_version":"1.0","canonical_sha256":"01f05a575b802cae5be881b0ee7fcf4bc5ee7671a2bebb2d107d824fdf252992","source":{"kind":"arxiv","id":"2606.06928","version":1},"attestation_state":"computed","paper":{"title":"VoxCPM2 Technical Report","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["eess.AS"],"primary_cat":"cs.SD","authors_text":"Bingsong Bai, Guoyang Zeng, Jiaheng Wu, Jiancheng Gui, Jiuyang Zhou, Mengyuan Deng, Qundong Shi, Renjie Yu, Runchuan Ye, Weiyue Sun, Xiang Li, Xin Liu, Xudong Shen, Yixuan Zhou, Zhisheng Zhang, Zhiyong Wu, Zhiyuan Liu, Ziyang Wang","submitted_at":"2026-06-05T05:43:15Z","abstract_excerpt":"We present VoxCPM2, a https://info.arxiv.org/help/prep#abstractsfully open-source multilingual and controllable speech generation foundation model that extends the hierarchical diffusion-autoregressive modeling paradigm of VoxCPM. VoxCPM2 advances the framework in three key dimensions: (i) capability, by unifying 30 languages, 9 Chinese dialects, natural-language voice design, style-controllable voice cloning, and high-fidelity continuation cloning within a single backbone; (ii) quality, through an asymmetric AudioVAE that encodes at 16 kHz and reconstructs at 48 kHz, enabling implicit super-r"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.06928","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2026-06-05T05:43:15Z","cross_cats_sorted":["eess.AS"],"title_canon_sha256":"b6f24d0ce8384f54936f282f62ae81b013faaf4ae25ceef600c4add6a79b8d5b","abstract_canon_sha256":"0271ae1498dd51e567c9a9a4819cd1064544bfb8ebb57163ba722572914b3b1e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-08T01:04:36.524135Z","signature_b64":"GVjD7Nvbzy5owtnvm6eRgW5LT7HZGwLR/KX42S0Iw2G2i1YgNUL5DKAFVBP9MbAhc+r+q2QRktYZtMEtxpfcAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"01f05a575b802cae5be881b0ee7fcf4bc5ee7671a2bebb2d107d824fdf252992","last_reissued_at":"2026-06-08T01:04:36.523267Z","signature_status":"signed_v1","first_computed_at":"2026-06-08T01:04:36.523267Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"VoxCPM2 Technical Report","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["eess.AS"],"primary_cat":"cs.SD","authors_text":"Bingsong Bai, Guoyang Zeng, Jiaheng Wu, Jiancheng Gui, Jiuyang Zhou, Mengyuan Deng, Qundong Shi, Renjie Yu, Runchuan Ye, Weiyue Sun, Xiang Li, Xin Liu, Xudong Shen, Yixuan Zhou, Zhisheng Zhang, Zhiyong Wu, Zhiyuan Liu, Ziyang Wang","submitted_at":"2026-06-05T05:43:15Z","abstract_excerpt":"We present VoxCPM2, a https://info.arxiv.org/help/prep#abstractsfully open-source multilingual and controllable speech generation foundation model that extends the hierarchical diffusion-autoregressive modeling paradigm of VoxCPM. VoxCPM2 advances the framework in three key dimensions: (i) capability, by unifying 30 languages, 9 Chinese dialects, natural-language voice design, style-controllable voice cloning, and high-fidelity continuation cloning within a single backbone; (ii) quality, through an asymmetric AudioVAE that encodes at 16 kHz and reconstructs at 48 kHz, enabling implicit super-r"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.06928","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.06928/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.06928","created_at":"2026-06-08T01:04:36.523431+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.06928v1","created_at":"2026-06-08T01:04:36.523431+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.06928","created_at":"2026-06-08T01:04:36.523431+00:00"},{"alias_kind":"pith_short_12","alias_value":"AHYFUV23QAWK","created_at":"2026-06-08T01:04:36.523431+00:00"},{"alias_kind":"pith_short_16","alias_value":"AHYFUV23QAWK4W7I","created_at":"2026-06-08T01:04:36.523431+00:00"},{"alias_kind":"pith_short_8","alias_value":"AHYFUV23","created_at":"2026-06-08T01:04:36.523431+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP","json":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP.json","graph_json":"https://pith.science/api/pith-number/AHYFUV23QAWK4W7IQGYO476PJP/graph.json","events_json":"https://pith.science/api/pith-number/AHYFUV23QAWK4W7IQGYO476PJP/events.json","paper":"https://pith.science/paper/AHYFUV23"},"agent_actions":{"view_html":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP","download_json":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP.json","view_paper":"https://pith.science/paper/AHYFUV23","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.06928&json=true","fetch_graph":"https://pith.science/api/pith-number/AHYFUV23QAWK4W7IQGYO476PJP/graph.json","fetch_events":"https://pith.science/api/pith-number/AHYFUV23QAWK4W7IQGYO476PJP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP/action/storage_attestation","attest_author":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP/action/author_attestation","sign_citation":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP/action/citation_signature","submit_replication":"https://pith.science/pith/AHYFUV23QAWK4W7IQGYO476PJP/action/replication_record"}},"created_at":"2026-06-08T01:04:36.523431+00:00","updated_at":"2026-06-08T01:04:36.523431+00:00"}