{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:YLEOVT363VX77ZXKC3E53OCMEI","short_pith_number":"pith:YLEOVT36","schema_version":"1.0","canonical_sha256":"c2c8eacf7edd6fffe6ea16c9ddb84c2203e9bba4bbbe6d6f1b36808010ef6d6d","source":{"kind":"arxiv","id":"1809.00496","version":2},"attestation_state":"computed","paper":{"title":"LRS3-TED: a large-scale dataset for visual speech recognition","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andrew Zisserman, Joon Son Chung, Triantafyllos Afouras","submitted_at":"2018-09-03T08:38:34Z","abstract_excerpt":"This paper introduces a new multi-modal dataset for visual and audio-visual speech recognition. It includes face tracks from over 400 hours of TED and TEDx videos, along with the corresponding subtitles and word alignment boundaries. The new dataset is substantially larger in scale compared to other public datasets that are available for general research."},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1809.00496","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-09-03T08:38:34Z","cross_cats_sorted":[],"title_canon_sha256":"87d21d74047ef07e701cedaf6126c43a1cf618d5cdd5a49b3df0e1b08af3ced0","abstract_canon_sha256":"c6d4be0f05bf73037906af6e8f1a059eeabeb7d430925e197968aa7ecc8ae0f9"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:02:09.130003Z","signature_b64":"jBRsI1aJ2veGBwhrHOTCHPCi5nRXQi90t1Z9UPI6AGes+YU6XNGqiQcgNk508i7H9gW1wtMqbPZ3N3nSo21xCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c2c8eacf7edd6fffe6ea16c9ddb84c2203e9bba4bbbe6d6f1b36808010ef6d6d","last_reissued_at":"2026-05-18T00:02:09.129403Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:02:09.129403Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LRS3-TED: a large-scale dataset for visual speech recognition","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andrew Zisserman, Joon Son Chung, Triantafyllos Afouras","submitted_at":"2018-09-03T08:38:34Z","abstract_excerpt":"This paper introduces a new multi-modal dataset for visual and audio-visual speech recognition. It includes face tracks from over 400 hours of TED and TEDx videos, along with the corresponding subtitles and word alignment boundaries. The new dataset is substantially larger in scale compared to other public datasets that are available for general research."},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1809.00496","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1809.00496","created_at":"2026-05-18T00:02:09.129500+00:00"},{"alias_kind":"arxiv_version","alias_value":"1809.00496v2","created_at":"2026-05-18T00:02:09.129500+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1809.00496","created_at":"2026-05-18T00:02:09.129500+00:00"},{"alias_kind":"pith_short_12","alias_value":"YLEOVT363VX7","created_at":"2026-05-18T12:33:04.347982+00:00"},{"alias_kind":"pith_short_16","alias_value":"YLEOVT363VX77ZXK","created_at":"2026-05-18T12:33:04.347982+00:00"},{"alias_kind":"pith_short_8","alias_value":"YLEOVT36","created_at":"2026-05-18T12:33:04.347982+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":12,"internal_anchor_count":5,"sample":[{"citing_arxiv_id":"1907.04975","citing_title":"My lips are concealed: Audio-visual speech enhancement through obstructions","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2411.17690","citing_title":"Mechanisms of Multimodal Synchronization: Insights from Decoder-Based Video-Text-to-Speech Synthesis","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16918","citing_title":"HighSync: High-Quality Lip Synchronization via Latent Diffusion Models","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2509.16023","citing_title":"Interpreting the Role of Visemes in Audio-Visual Speech Recognition","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2411.15633","citing_title":"Orthogonal Subspace Decomposition for Generalizable AI-Generated Image Detection","ref_index":175,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27866","citing_title":"LRS-VoxMM: A benchmark for in-the-wild audio-visual speech recognition","ref_index":10,"is_internal_anchor":false},{"citing_arxiv_id":"2604.27436","citing_title":"BUT System Description for CHiME-9 MCoRec Challenge","ref_index":18,"is_internal_anchor":false},{"citing_arxiv_id":"2605.01673","citing_title":"Delayed Commitment for Representation Readiness in Stage-wise Audio-Visual Learning","ref_index":59,"is_internal_anchor":false},{"citing_arxiv_id":"2604.12292","citing_title":"CoSyncDiT: Cognitive Synchronous Diffusion Transformer for Movie Dubbing","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2604.08359","citing_title":"Tracking Listener Attention: Gaze-Guided Audio-Visual Speech Enhancement Framework","ref_index":30,"is_internal_anchor":false},{"citing_arxiv_id":"2604.04348","citing_title":"OmniSonic: Towards Universal and Holistic Audio Generation from Video and Text","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2604.15923","citing_title":"Hierarchical Codec Diffusion for Video-to-Speech Generation","ref_index":1,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI","json":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI.json","graph_json":"https://pith.science/api/pith-number/YLEOVT363VX77ZXKC3E53OCMEI/graph.json","events_json":"https://pith.science/api/pith-number/YLEOVT363VX77ZXKC3E53OCMEI/events.json","paper":"https://pith.science/paper/YLEOVT36"},"agent_actions":{"view_html":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI","download_json":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI.json","view_paper":"https://pith.science/paper/YLEOVT36","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1809.00496&json=true","fetch_graph":"https://pith.science/api/pith-number/YLEOVT363VX77ZXKC3E53OCMEI/graph.json","fetch_events":"https://pith.science/api/pith-number/YLEOVT363VX77ZXKC3E53OCMEI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI/action/storage_attestation","attest_author":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI/action/author_attestation","sign_citation":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI/action/citation_signature","submit_replication":"https://pith.science/pith/YLEOVT363VX77ZXKC3E53OCMEI/action/replication_record"}},"created_at":"2026-05-18T00:02:09.129500+00:00","updated_at":"2026-05-18T00:02:09.129500+00:00"}