{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2021:NC2BGDBYBT62UIXN4DAOH4BOBN","short_pith_number":"pith:NC2BGDBY","schema_version":"1.0","canonical_sha256":"68b4130c380cfdaa22ede0c0e3f02e0b4885efa321987f907ad51718f7140a47","source":{"kind":"arxiv","id":"2111.08191","version":2},"attestation_state":"computed","paper":{"title":"CoCA-MDD: A Coupled Cross-Attention based Framework for Streaming Mispronunciation Detection and Diagnosis","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.SD","eess.AS"],"primary_cat":"cs.CL","authors_text":"Baohua Xu, Liqun Deng, Nianzu Zheng, Qun Liu, Wenyong Huang, Xiao Chen, Xin Jiang, Yasheng Wang, Yuanyuan Guo, Yu Ting Yeung","submitted_at":"2021-11-16T02:17:49Z","abstract_excerpt":"Mispronunciation detection and diagnosis (MDD) is a popular research focus in computer-aided pronunciation training (CAPT) systems. End-to-end (e2e) approaches are becoming dominant in MDD. However an e2e MDD model usually requires entire speech utterances as input context, which leads to significant time latency especially for long paragraphs. We propose a streaming e2e MDD model called CoCA-MDD. We utilize conv-transformer structure to encode input speech in a streaming manner. A coupled cross-attention (CoCA) mechanism is proposed to integrate frame-level acoustic features with encoded refe"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2111.08191","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-11-16T02:17:49Z","cross_cats_sorted":["cs.SD","eess.AS"],"title_canon_sha256":"473c21bcbc94100625a8e88156344cb23d4840fab3a89078512b48dfe3f31d56","abstract_canon_sha256":"fb42f61924145e5e63f7f786d21791bbb712e26432b6bea88e0ab4bf09970333"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T04:35:57.578921Z","signature_b64":"p85YSRMl4/FV8u45ojUWW7AktGcinLRc4rPeAncI0S9VNBQLakMqcn1vPQdzMMCta6hBCuXaVEo+T7+rXnFPDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"68b4130c380cfdaa22ede0c0e3f02e0b4885efa321987f907ad51718f7140a47","last_reissued_at":"2026-07-05T04:35:57.578422Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T04:35:57.578422Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"CoCA-MDD: A Coupled Cross-Attention based Framework for Streaming Mispronunciation Detection and Diagnosis","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.SD","eess.AS"],"primary_cat":"cs.CL","authors_text":"Baohua Xu, Liqun Deng, Nianzu Zheng, Qun Liu, Wenyong Huang, Xiao Chen, Xin Jiang, Yasheng Wang, Yuanyuan Guo, Yu Ting Yeung","submitted_at":"2021-11-16T02:17:49Z","abstract_excerpt":"Mispronunciation detection and diagnosis (MDD) is a popular research focus in computer-aided pronunciation training (CAPT) systems. End-to-end (e2e) approaches are becoming dominant in MDD. However an e2e MDD model usually requires entire speech utterances as input context, which leads to significant time latency especially for long paragraphs. We propose a streaming e2e MDD model called CoCA-MDD. We utilize conv-transformer structure to encode input speech in a streaming manner. A coupled cross-attention (CoCA) mechanism is proposed to integrate frame-level acoustic features with encoded refe"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2111.08191","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2111.08191/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2111.08191","created_at":"2026-07-05T04:35:57.578483+00:00"},{"alias_kind":"arxiv_version","alias_value":"2111.08191v2","created_at":"2026-07-05T04:35:57.578483+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2111.08191","created_at":"2026-07-05T04:35:57.578483+00:00"},{"alias_kind":"pith_short_12","alias_value":"NC2BGDBYBT62","created_at":"2026-07-05T04:35:57.578483+00:00"},{"alias_kind":"pith_short_16","alias_value":"NC2BGDBYBT62UIXN","created_at":"2026-07-05T04:35:57.578483+00:00"},{"alias_kind":"pith_short_8","alias_value":"NC2BGDBY","created_at":"2026-07-05T04:35:57.578483+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN","json":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN.json","graph_json":"https://pith.science/api/pith-number/NC2BGDBYBT62UIXN4DAOH4BOBN/graph.json","events_json":"https://pith.science/api/pith-number/NC2BGDBYBT62UIXN4DAOH4BOBN/events.json","paper":"https://pith.science/paper/NC2BGDBY"},"agent_actions":{"view_html":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN","download_json":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN.json","view_paper":"https://pith.science/paper/NC2BGDBY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2111.08191&json=true","fetch_graph":"https://pith.science/api/pith-number/NC2BGDBYBT62UIXN4DAOH4BOBN/graph.json","fetch_events":"https://pith.science/api/pith-number/NC2BGDBYBT62UIXN4DAOH4BOBN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN/action/storage_attestation","attest_author":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN/action/author_attestation","sign_citation":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN/action/citation_signature","submit_replication":"https://pith.science/pith/NC2BGDBYBT62UIXN4DAOH4BOBN/action/replication_record"}},"created_at":"2026-07-05T04:35:57.578483+00:00","updated_at":"2026-07-05T04:35:57.578483+00:00"}