{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:SHLL6LHOCKXELSCUIJBBDZIWBI","short_pith_number":"pith:SHLL6LHO","schema_version":"1.0","canonical_sha256":"91d6bf2cee12ae45c854424211e5160a3e787c6102281639cfed9cf788f09f81","source":{"kind":"arxiv","id":"1804.03619","version":2},"attestation_state":"computed","paper":{"title":"Looking to Listen at the Cocktail Party: A Speaker-Independent Audio-Visual Model for Speech Separation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","eess.AS"],"primary_cat":"cs.SD","authors_text":"Ariel Ephrat, Avinatan Hassidim, Inbar Mosseri, Kevin Wilson, Michael Rubinstein, Oran Lang, Tali Dekel, William T. Freeman","submitted_at":"2018-04-10T16:28:59Z","abstract_excerpt":"We present a joint audio-visual model for isolating a single speech signal from a mixture of sounds such as other speakers and background noise. Solving this task using only audio as input is extremely challenging and does not provide an association of the separated speech signals with speakers in the video. In this paper, we present a deep network-based model that incorporates both visual and auditory signals to solve this task. The visual features are used to \"focus\" the audio on desired speakers in a scene and to improve the speech separation quality. To train our joint audio-visual model, "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1804.03619","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SD","submitted_at":"2018-04-10T16:28:59Z","cross_cats_sorted":["cs.CV","eess.AS"],"title_canon_sha256":"7ad5ae484fc2da96d58d64cbcd8ea1290973fcd464f870a58522ad6a9b81bf16","abstract_canon_sha256":"f710b730666a32aedc31ef25f0168e1f15000aba681e417885fe94d733e806cf"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:08:27.867008Z","signature_b64":"6hpOEvf4xsiJ+P+AwQlb78lbI/siAmKii847Sbbtk4yafwOMAzI3W62M6Mpk9VHuwoxMqHVjoYEbW4vk6kovDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"91d6bf2cee12ae45c854424211e5160a3e787c6102281639cfed9cf788f09f81","last_reissued_at":"2026-05-18T00:08:27.866520Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:08:27.866520Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Looking to Listen at the Cocktail Party: A Speaker-Independent Audio-Visual Model for Speech Separation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","eess.AS"],"primary_cat":"cs.SD","authors_text":"Ariel Ephrat, Avinatan Hassidim, Inbar Mosseri, Kevin Wilson, Michael Rubinstein, Oran Lang, Tali Dekel, William T. Freeman","submitted_at":"2018-04-10T16:28:59Z","abstract_excerpt":"We present a joint audio-visual model for isolating a single speech signal from a mixture of sounds such as other speakers and background noise. Solving this task using only audio as input is extremely challenging and does not provide an association of the separated speech signals with speakers in the video. In this paper, we present a deep network-based model that incorporates both visual and auditory signals to solve this task. The visual features are used to \"focus\" the audio on desired speakers in a scene and to improve the speech separation quality. To train our joint audio-visual model, "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1804.03619","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1804.03619","created_at":"2026-05-18T00:08:27.866597+00:00"},{"alias_kind":"arxiv_version","alias_value":"1804.03619v2","created_at":"2026-05-18T00:08:27.866597+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1804.03619","created_at":"2026-05-18T00:08:27.866597+00:00"},{"alias_kind":"pith_short_12","alias_value":"SHLL6LHOCKXE","created_at":"2026-05-18T12:32:53.628368+00:00"},{"alias_kind":"pith_short_16","alias_value":"SHLL6LHOCKXELSCU","created_at":"2026-05-18T12:32:53.628368+00:00"},{"alias_kind":"pith_short_8","alias_value":"SHLL6LHO","created_at":"2026-05-18T12:32:53.628368+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2511.02830","citing_title":"Densemarks: Learning Canonical Embeddings for Human Heads Images via Point Tracks","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2512.04677","citing_title":"Live Avatar: Streaming Real-time Audio-Driven Avatar Generation with Infinite Length","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02948","citing_title":"AsymTalker: Identity-Consistent Long-Term Talking Head Generation via Asymmetric Distillation","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2605.02948","citing_title":"AsymTalker: Identity-Consistent Long-Term Talking Head Generation via Asymmetric Distillation","ref_index":12,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI","json":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI.json","graph_json":"https://pith.science/api/pith-number/SHLL6LHOCKXELSCUIJBBDZIWBI/graph.json","events_json":"https://pith.science/api/pith-number/SHLL6LHOCKXELSCUIJBBDZIWBI/events.json","paper":"https://pith.science/paper/SHLL6LHO"},"agent_actions":{"view_html":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI","download_json":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI.json","view_paper":"https://pith.science/paper/SHLL6LHO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1804.03619&json=true","fetch_graph":"https://pith.science/api/pith-number/SHLL6LHOCKXELSCUIJBBDZIWBI/graph.json","fetch_events":"https://pith.science/api/pith-number/SHLL6LHOCKXELSCUIJBBDZIWBI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI/action/storage_attestation","attest_author":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI/action/author_attestation","sign_citation":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI/action/citation_signature","submit_replication":"https://pith.science/pith/SHLL6LHOCKXELSCUIJBBDZIWBI/action/replication_record"}},"created_at":"2026-05-18T00:08:27.866597+00:00","updated_at":"2026-05-18T00:08:27.866597+00:00"}