{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2021:3KZKBTTWR2DS5MXM2SKC3USFCI","short_pith_number":"pith:3KZKBTTW","schema_version":"1.0","canonical_sha256":"dab2a0ce768e872eb2ecd4942dd2451201b413b6ec5a0e106e303757132b3eee","source":{"kind":"arxiv","id":"2107.12710","version":2},"attestation_state":"computed","paper":{"title":"End-to-End Spectro-Temporal Graph Attention Networks for Speaker Verification Anti-Spoofing and Speech Deepfake Detection","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.SD"],"primary_cat":"eess.AS","authors_text":"Hemlata Tak, Jee-weon Jung, Jose Patino, Madhu Kamble, Massimiliano Todisco, Nicholas Evans","submitted_at":"2021-07-27T10:11:41Z","abstract_excerpt":"Artefacts that serve to distinguish bona fide speech from spoofed or deepfake speech are known to reside in specific subbands and temporal segments. Various approaches can be used to capture and model such artefacts, however, none works well across a spectrum of diverse spoofing attacks. Reliable detection then often depends upon the fusion of multiple detection systems, each tuned to detect different forms of attack. In this paper we show that better performance can be achieved when the fusion is performed within the model itself and when the representation is learned automatically from raw w"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2107.12710","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"eess.AS","submitted_at":"2021-07-27T10:11:41Z","cross_cats_sorted":["cs.SD"],"title_canon_sha256":"f6f88887764e3aac97a31c13c843757fbc0889878b57895d72b494d0e009ebcd","abstract_canon_sha256":"8cd69df8fea18f7896efbcbb2fc2dc5b757f8d5fbfd9b094f325a8685bda7255"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T03:07:50.834266Z","signature_b64":"EFdARtubOvafwmEOQsPdoDGFFFz0YUNOEiLulXppFdrn1hRRZzcV2NW3dtA2vGk8X/j/a3CQhEAgFRzEqjpNCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dab2a0ce768e872eb2ecd4942dd2451201b413b6ec5a0e106e303757132b3eee","last_reissued_at":"2026-07-05T03:07:50.833834Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T03:07:50.833834Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"End-to-End Spectro-Temporal Graph Attention Networks for Speaker Verification Anti-Spoofing and Speech Deepfake Detection","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.SD"],"primary_cat":"eess.AS","authors_text":"Hemlata Tak, Jee-weon Jung, Jose Patino, Madhu Kamble, Massimiliano Todisco, Nicholas Evans","submitted_at":"2021-07-27T10:11:41Z","abstract_excerpt":"Artefacts that serve to distinguish bona fide speech from spoofed or deepfake speech are known to reside in specific subbands and temporal segments. Various approaches can be used to capture and model such artefacts, however, none works well across a spectrum of diverse spoofing attacks. Reliable detection then often depends upon the fusion of multiple detection systems, each tuned to detect different forms of attack. In this paper we show that better performance can be achieved when the fusion is performed within the model itself and when the representation is learned automatically from raw w"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2107.12710","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2107.12710/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2107.12710","created_at":"2026-07-05T03:07:50.833889+00:00"},{"alias_kind":"arxiv_version","alias_value":"2107.12710v2","created_at":"2026-07-05T03:07:50.833889+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2107.12710","created_at":"2026-07-05T03:07:50.833889+00:00"},{"alias_kind":"pith_short_12","alias_value":"3KZKBTTWR2DS","created_at":"2026-07-05T03:07:50.833889+00:00"},{"alias_kind":"pith_short_16","alias_value":"3KZKBTTWR2DS5MXM","created_at":"2026-07-05T03:07:50.833889+00:00"},{"alias_kind":"pith_short_8","alias_value":"3KZKBTTW","created_at":"2026-07-05T03:07:50.833889+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2606.30780","citing_title":"Detecting Audio Deepfakes on the Edge:Lightweight SSL-Based Detection in a Browser Plugin","ref_index":4,"is_internal_anchor":false},{"citing_arxiv_id":"2401.09512","citing_title":"MLAAD: The Multi-Language Audio Anti-Spoofing Dataset","ref_index":47,"is_internal_anchor":false},{"citing_arxiv_id":"2408.05366","citing_title":"The DeepSpeak Dataset","ref_index":50,"is_internal_anchor":false},{"citing_arxiv_id":"2605.17737","citing_title":"Profiling the Voice: Speaker-Specific Phoneme Fingerprinting for Speech Deepfake Detection","ref_index":17,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI","json":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI.json","graph_json":"https://pith.science/api/pith-number/3KZKBTTWR2DS5MXM2SKC3USFCI/graph.json","events_json":"https://pith.science/api/pith-number/3KZKBTTWR2DS5MXM2SKC3USFCI/events.json","paper":"https://pith.science/paper/3KZKBTTW"},"agent_actions":{"view_html":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI","download_json":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI.json","view_paper":"https://pith.science/paper/3KZKBTTW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2107.12710&json=true","fetch_graph":"https://pith.science/api/pith-number/3KZKBTTWR2DS5MXM2SKC3USFCI/graph.json","fetch_events":"https://pith.science/api/pith-number/3KZKBTTWR2DS5MXM2SKC3USFCI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI/action/storage_attestation","attest_author":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI/action/author_attestation","sign_citation":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI/action/citation_signature","submit_replication":"https://pith.science/pith/3KZKBTTWR2DS5MXM2SKC3USFCI/action/replication_record"}},"created_at":"2026-07-05T03:07:50.833889+00:00","updated_at":"2026-07-05T03:07:50.833889+00:00"}