{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:F5776M4ZSPX4YTWUKU3MXKTEVO","short_pith_number":"pith:F5776M4Z","canonical_record":{"source":{"id":"2510.13293","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-10-15T08:37:16Z","cross_cats_sorted":[],"title_canon_sha256":"dc34796605ac654dc68f83bf355882049062e59deb2c319530eca3b5f83f4414","abstract_canon_sha256":"7dfad2d64f20900e50b2d03ff123953e260a7d036311d5f8ccbc3b2bae479cc4"},"schema_version":"1.0"},"canonical_sha256":"2f7fff339993efcc4ed45536cbaa64abb0a137e1c601b53c9fe86fd47b6a8e84","source":{"kind":"arxiv","id":"2510.13293","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.13293","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"arxiv_version","alias_value":"2510.13293v3","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.13293","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"pith_short_12","alias_value":"F5776M4ZSPX4","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"pith_short_16","alias_value":"F5776M4ZSPX4YTWU","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"pith_short_8","alias_value":"F5776M4Z","created_at":"2026-05-20T01:05:00Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:F5776M4ZSPX4YTWUKU3MXKTEVO","target":"record","payload":{"canonical_record":{"source":{"id":"2510.13293","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-10-15T08:37:16Z","cross_cats_sorted":[],"title_canon_sha256":"dc34796605ac654dc68f83bf355882049062e59deb2c319530eca3b5f83f4414","abstract_canon_sha256":"7dfad2d64f20900e50b2d03ff123953e260a7d036311d5f8ccbc3b2bae479cc4"},"schema_version":"1.0"},"canonical_sha256":"2f7fff339993efcc4ed45536cbaa64abb0a137e1c601b53c9fe86fd47b6a8e84","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:05:00.121390Z","signature_b64":"JGaXGBk/escI2rCVHXEW1R7F6ympeO5s6GjCcqiOyWm7iulSTXw0SRSKvDsphIKzEpgHhgjnlO1iuXPDwoVmAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2f7fff339993efcc4ed45536cbaa64abb0a137e1c601b53c9fe86fd47b6a8e84","last_reissued_at":"2026-05-20T01:05:00.120309Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:05:00.120309Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2510.13293","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"MrKXNXD7oIvq1pmY4+O3N7oYy5w7PIaoa7QVSMlgdOCgOyF3OI/niTg3jsHBalU+H/mgU4og3/KPkuwDMIO4Dg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T09:40:18.268012Z"},"content_sha256":"4591b4aa265cc9f4812e680e5a6e99c9c6558e3a2b4e6208a0f00eee72cce779","schema_version":"1.0","event_id":"sha256:4591b4aa265cc9f4812e680e5a6e99c9c6558e3a2b4e6208a0f00eee72cce779"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:F5776M4ZSPX4YTWUKU3MXKTEVO","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Cross-modal Consistency Guidance for Robust Emotion Control in Auto-Regressive TTS Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"An adaptive guidance scheme detects and compensates for mismatches between desired emotions and text meaning to enable better emotional control in auto-regressive text-to-speech models.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Bin Ma, Chongjia Ni, Chong Zhang, Eng Siong Chng, Yi-Wen Chao, Yizhou Peng, Yukun Ma","submitted_at":"2025-10-15T08:37:16Z","abstract_excerpt":"While Text-to-Speech (TTS) systems enable emotional control via natural-language instructions, expressiveness, naturalness, and speech quality degrade when the target emotion conflicts with the textual semantics. We propose a Cross-modal Consistency Guided Classifier-Free Guidance (CCG-CFG) method with dynamic scales based on the degree of inconsistency between the text emotion and the explicit speech emotion, replacing the dropout condition with the text emotion. We also distill the CCG-CFG guidance signal using a hard-sample mining strategy, improving the TTS model's emotional alignment capa"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our results demonstrate that the proposed adaptive CFG scheme improves the emotional expressiveness of the AR TTS model while maintaining audio quality and intelligibility.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That mismatch between the desired emotion style prompt and the semantic content of the text can be reliably detected and quantified by large language models or natural language inference models in a manner that permits effective, quality-preserving adaptation of CFG strength.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"An adaptive CFG method that tunes guidance based on LLM-detected mismatch between emotion prompts and text semantics improves emotional expressiveness in AR TTS while preserving audio quality and intelligibility.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"An adaptive guidance scheme detects and compensates for mismatches between desired emotions and text meaning to enable better emotional control in auto-regressive text-to-speech models.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7520e382ebf9426d182d37ced73e729f846662448ed59f43444fd3bda1c7323b"},"source":{"id":"2510.13293","kind":"arxiv","version":3},"verdict":{"id":"a81fe482-6237-4bdc-9343-2f5485128619","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-18T07:39:32.874441Z","strongest_claim":"Our results demonstrate that the proposed adaptive CFG scheme improves the emotional expressiveness of the AR TTS model while maintaining audio quality and intelligibility.","one_line_summary":"An adaptive CFG method that tunes guidance based on LLM-detected mismatch between emotion prompts and text semantics improves emotional expressiveness in AR TTS while preserving audio quality and intelligibility.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That mismatch between the desired emotion style prompt and the semantic content of the text can be reliably detected and quantified by large language models or natural language inference models in a manner that permits effective, quality-preserving adaptation of CFG strength.","pith_extraction_headline":"An adaptive guidance scheme detects and compensates for mismatches between desired emotions and text meaning to enable better emotional control in auto-regressive text-to-speech models."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.13293/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"7937815d178f3191c45063b2bc9b413e8267be2b718b9892d3484cd4bae5fb09"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"a81fe482-6237-4bdc-9343-2f5485128619"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"JlTXEPy2qdB/wzGVma4r3Rb9T69wHIcEHz5NnPdYDJQsQPe6HMXa0ZvDd4v9KaAnUJV2FsETXCrcmjwTRy7fDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T09:40:18.268887Z"},"content_sha256":"1da3a8f26d41d51912b262763b1841efaa9843cf6a48f56101239fa8a773739f","schema_version":"1.0","event_id":"sha256:1da3a8f26d41d51912b262763b1841efaa9843cf6a48f56101239fa8a773739f"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/F5776M4ZSPX4YTWUKU3MXKTEVO/bundle.json","state_url":"https://pith.science/pith/F5776M4ZSPX4YTWUKU3MXKTEVO/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/F5776M4ZSPX4YTWUKU3MXKTEVO/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T09:40:18Z","links":{"resolver":"https://pith.science/pith/F5776M4ZSPX4YTWUKU3MXKTEVO","bundle":"https://pith.science/pith/F5776M4ZSPX4YTWUKU3MXKTEVO/bundle.json","state":"https://pith.science/pith/F5776M4ZSPX4YTWUKU3MXKTEVO/state.json","well_known_bundle":"https://pith.science/.well-known/pith/F5776M4ZSPX4YTWUKU3MXKTEVO/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:F5776M4ZSPX4YTWUKU3MXKTEVO","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7dfad2d64f20900e50b2d03ff123953e260a7d036311d5f8ccbc3b2bae479cc4","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-10-15T08:37:16Z","title_canon_sha256":"dc34796605ac654dc68f83bf355882049062e59deb2c319530eca3b5f83f4414"},"schema_version":"1.0","source":{"id":"2510.13293","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.13293","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"arxiv_version","alias_value":"2510.13293v3","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.13293","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"pith_short_12","alias_value":"F5776M4ZSPX4","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"pith_short_16","alias_value":"F5776M4ZSPX4YTWU","created_at":"2026-05-20T01:05:00Z"},{"alias_kind":"pith_short_8","alias_value":"F5776M4Z","created_at":"2026-05-20T01:05:00Z"}],"graph_snapshots":[{"event_id":"sha256:1da3a8f26d41d51912b262763b1841efaa9843cf6a48f56101239fa8a773739f","target":"graph","created_at":"2026-05-20T01:05:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our results demonstrate that the proposed adaptive CFG scheme improves the emotional expressiveness of the AR TTS model while maintaining audio quality and intelligibility."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That mismatch between the desired emotion style prompt and the semantic content of the text can be reliably detected and quantified by large language models or natural language inference models in a manner that permits effective, quality-preserving adaptation of CFG strength."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"An adaptive CFG method that tunes guidance based on LLM-detected mismatch between emotion prompts and text semantics improves emotional expressiveness in AR TTS while preserving audio quality and intelligibility."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"An adaptive guidance scheme detects and compensates for mismatches between desired emotions and text meaning to enable better emotional control in auto-regressive text-to-speech models."}],"snapshot_sha256":"7520e382ebf9426d182d37ced73e729f846662448ed59f43444fd3bda1c7323b"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"7937815d178f3191c45063b2bc9b413e8267be2b718b9892d3484cd4bae5fb09"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2510.13293/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"While Text-to-Speech (TTS) systems enable emotional control via natural-language instructions, expressiveness, naturalness, and speech quality degrade when the target emotion conflicts with the textual semantics. We propose a Cross-modal Consistency Guided Classifier-Free Guidance (CCG-CFG) method with dynamic scales based on the degree of inconsistency between the text emotion and the explicit speech emotion, replacing the dropout condition with the text emotion. We also distill the CCG-CFG guidance signal using a hard-sample mining strategy, improving the TTS model's emotional alignment capa","authors_text":"Bin Ma, Chongjia Ni, Chong Zhang, Eng Siong Chng, Yi-Wen Chao, Yizhou Peng, Yukun Ma","cross_cats":[],"headline":"An adaptive guidance scheme detects and compensates for mismatches between desired emotions and text meaning to enable better emotional control in auto-regressive text-to-speech models.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-10-15T08:37:16Z","title":"Cross-modal Consistency Guidance for Robust Emotion Control in Auto-Regressive TTS Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.13293","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-18T07:39:32.874441Z","id":"a81fe482-6237-4bdc-9343-2f5485128619","model_set":{"reader":"grok-4.3"},"one_line_summary":"An adaptive CFG method that tunes guidance based on LLM-detected mismatch between emotion prompts and text semantics improves emotional expressiveness in AR TTS while preserving audio quality and intelligibility.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"An adaptive guidance scheme detects and compensates for mismatches between desired emotions and text meaning to enable better emotional control in auto-regressive text-to-speech models.","strongest_claim":"Our results demonstrate that the proposed adaptive CFG scheme improves the emotional expressiveness of the AR TTS model while maintaining audio quality and intelligibility.","weakest_assumption":"That mismatch between the desired emotion style prompt and the semantic content of the text can be reliably detected and quantified by large language models or natural language inference models in a manner that permits effective, quality-preserving adaptation of CFG strength."}},"verdict_id":"a81fe482-6237-4bdc-9343-2f5485128619"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:4591b4aa265cc9f4812e680e5a6e99c9c6558e3a2b4e6208a0f00eee72cce779","target":"record","created_at":"2026-05-20T01:05:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7dfad2d64f20900e50b2d03ff123953e260a7d036311d5f8ccbc3b2bae479cc4","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-10-15T08:37:16Z","title_canon_sha256":"dc34796605ac654dc68f83bf355882049062e59deb2c319530eca3b5f83f4414"},"schema_version":"1.0","source":{"id":"2510.13293","kind":"arxiv","version":3}},"canonical_sha256":"2f7fff339993efcc4ed45536cbaa64abb0a137e1c601b53c9fe86fd47b6a8e84","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"2f7fff339993efcc4ed45536cbaa64abb0a137e1c601b53c9fe86fd47b6a8e84","first_computed_at":"2026-05-20T01:05:00.120309Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:05:00.120309Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"JGaXGBk/escI2rCVHXEW1R7F6ympeO5s6GjCcqiOyWm7iulSTXw0SRSKvDsphIKzEpgHhgjnlO1iuXPDwoVmAw==","signature_status":"signed_v1","signed_at":"2026-05-20T01:05:00.121390Z","signed_message":"canonical_sha256_bytes"},"source_id":"2510.13293","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:4591b4aa265cc9f4812e680e5a6e99c9c6558e3a2b4e6208a0f00eee72cce779","sha256:1da3a8f26d41d51912b262763b1841efaa9843cf6a48f56101239fa8a773739f"],"state_sha256":"1c1de006ac477b800f5878ab33c904c4363e94fce56c535b5712a8fbbfea0032"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VgWW9ikXOlR9f4Tw5KQXkneQwA/zTCOz9NRknXS0TWGe4ASgQegZ3OA1Z6X1qkNkPUtUdwhOa+iiE3l/QuZVCA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T09:40:18.272797Z","bundle_sha256":"564fb33c5e90fc5825be389ff78673a865cd04b71fc37725abc82d78329a3744"}}