{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:MZNTHHUTKUFADF6LUXTN7VYJEF","short_pith_number":"pith:MZNTHHUT","schema_version":"1.0","canonical_sha256":"665b339e93550a0197cba5e6dfd70921718e09a5f994d869b94d3bded39270f9","source":{"kind":"arxiv","id":"1703.10893","version":6},"attestation_state":"computed","paper":{"title":"Audio-Visual Speech Enhancement Using Multimodal Deep Convolutional Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.MM","stat.ML"],"primary_cat":"cs.SD","authors_text":"Hsin-Min Wang, Hsiu-Wen Chang, Jen-Cheng Hou, Syu-Siang Wang, Ying-Hui Lai, Yu Tsao","submitted_at":"2017-03-30T08:59:24Z","abstract_excerpt":"Speech enhancement (SE) aims to reduce noise in speech signals. Most SE techniques focus only on addressing audio information. In this work, inspired by multimodal learning, which utilizes data from different modalities, and the recent success of convolutional neural networks (CNNs) in SE, we propose an audio-visual deep CNNs (AVDCNN) SE model, which incorporates audio and visual streams into a unified network model. We also propose a multi-task learning framework for reconstructing audio and visual signals at the output layer. Precisely speaking, the proposed AVDCNN model is structured as an "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1703.10893","kind":"arxiv","version":6},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2017-03-30T08:59:24Z","cross_cats_sorted":["cs.MM","stat.ML"],"title_canon_sha256":"53f8038de7d61cf5a3a21ea7ec0fc1895acdd9b1a9fa716b59a293496887d962","abstract_canon_sha256":"6e5b65ae8c2b1b0cea5645124fd679174bedb4c51d13a7c69d564e9bf83cc750"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:25:11.801053Z","signature_b64":"Psr+RQ+2pnglpPjB8ZJmxS2z59zd1AJ9wN7+Y6ke+gtap/Blmwmy5mMAgBxiiCq37abmybCGEMxDQJjPZyxdAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"665b339e93550a0197cba5e6dfd70921718e09a5f994d869b94d3bded39270f9","last_reissued_at":"2026-05-18T00:25:11.800571Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:25:11.800571Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Audio-Visual Speech Enhancement Using Multimodal Deep Convolutional Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.MM","stat.ML"],"primary_cat":"cs.SD","authors_text":"Hsin-Min Wang, Hsiu-Wen Chang, Jen-Cheng Hou, Syu-Siang Wang, Ying-Hui Lai, Yu Tsao","submitted_at":"2017-03-30T08:59:24Z","abstract_excerpt":"Speech enhancement (SE) aims to reduce noise in speech signals. Most SE techniques focus only on addressing audio information. In this work, inspired by multimodal learning, which utilizes data from different modalities, and the recent success of convolutional neural networks (CNNs) in SE, we propose an audio-visual deep CNNs (AVDCNN) SE model, which incorporates audio and visual streams into a unified network model. We also propose a multi-task learning framework for reconstructing audio and visual signals at the output layer. Precisely speaking, the proposed AVDCNN model is structured as an "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1703.10893","kind":"arxiv","version":6},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1703.10893","created_at":"2026-05-18T00:25:11.800648+00:00"},{"alias_kind":"arxiv_version","alias_value":"1703.10893v6","created_at":"2026-05-18T00:25:11.800648+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1703.10893","created_at":"2026-05-18T00:25:11.800648+00:00"},{"alias_kind":"pith_short_12","alias_value":"MZNTHHUTKUFA","created_at":"2026-05-18T12:31:31.346846+00:00"},{"alias_kind":"pith_short_16","alias_value":"MZNTHHUTKUFADF6L","created_at":"2026-05-18T12:31:31.346846+00:00"},{"alias_kind":"pith_short_8","alias_value":"MZNTHHUT","created_at":"2026-05-18T12:31:31.346846+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF","json":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF.json","graph_json":"https://pith.science/api/pith-number/MZNTHHUTKUFADF6LUXTN7VYJEF/graph.json","events_json":"https://pith.science/api/pith-number/MZNTHHUTKUFADF6LUXTN7VYJEF/events.json","paper":"https://pith.science/paper/MZNTHHUT"},"agent_actions":{"view_html":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF","download_json":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF.json","view_paper":"https://pith.science/paper/MZNTHHUT","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1703.10893&json=true","fetch_graph":"https://pith.science/api/pith-number/MZNTHHUTKUFADF6LUXTN7VYJEF/graph.json","fetch_events":"https://pith.science/api/pith-number/MZNTHHUTKUFADF6LUXTN7VYJEF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF/action/storage_attestation","attest_author":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF/action/author_attestation","sign_citation":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF/action/citation_signature","submit_replication":"https://pith.science/pith/MZNTHHUTKUFADF6LUXTN7VYJEF/action/replication_record"}},"created_at":"2026-05-18T00:25:11.800648+00:00","updated_at":"2026-05-18T00:25:11.800648+00:00"}