{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2018:ISH6UIVORAVBZEMMQARU4NWKCB","short_pith_number":"pith:ISH6UIVO","canonical_record":{"source":{"id":"1812.09336","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-12-21T19:02:52Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"bc1940423bee4da71c5767a042e6604be730da681ad09662306422e04b729939","abstract_canon_sha256":"3108904f648c873ff4b2167374cfd451ff3acee8bdbe0cd9f6c38cce225707f5"},"schema_version":"1.0"},"canonical_sha256":"448fea22ae882a1c918c80234e36ca1079df4f9bbab54c66790f597a203c25fc","source":{"kind":"arxiv","id":"1812.09336","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1812.09336","created_at":"2026-05-17T23:57:30Z"},{"alias_kind":"arxiv_version","alias_value":"1812.09336v1","created_at":"2026-05-17T23:57:30Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.09336","created_at":"2026-05-17T23:57:30Z"},{"alias_kind":"pith_short_12","alias_value":"ISH6UIVORAVB","created_at":"2026-05-18T12:32:31Z"},{"alias_kind":"pith_short_16","alias_value":"ISH6UIVORAVBZEMM","created_at":"2026-05-18T12:32:31Z"},{"alias_kind":"pith_short_8","alias_value":"ISH6UIVO","created_at":"2026-05-18T12:32:31Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2018:ISH6UIVORAVBZEMMQARU4NWKCB","target":"record","payload":{"canonical_record":{"source":{"id":"1812.09336","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-12-21T19:02:52Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"bc1940423bee4da71c5767a042e6604be730da681ad09662306422e04b729939","abstract_canon_sha256":"3108904f648c873ff4b2167374cfd451ff3acee8bdbe0cd9f6c38cce225707f5"},"schema_version":"1.0"},"canonical_sha256":"448fea22ae882a1c918c80234e36ca1079df4f9bbab54c66790f597a203c25fc","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:57:30.349644Z","signature_b64":"FaO18GpTNQWxzxD+24e7blz3vWjVotI4EtqAf3Yov3Y/KFoTYPE4EuDuEK+bTdjVH+vH6DJ+s+4B5TfI/PChDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"448fea22ae882a1c918c80234e36ca1079df4f9bbab54c66790f597a203c25fc","last_reissued_at":"2026-05-17T23:57:30.349019Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:57:30.349019Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1812.09336","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:57:30Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SmwJ4zJcr0Atrim6Xmxo7E3z6ybF2NXvTde7NsKvaFyvIChAQzkDOQbbcPaB1SRA3ahjGxVCr0rZBzlPZWl2AA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T10:52:56.976972Z"},"content_sha256":"10977b22c3bcd2cea28af294f04fe79de8b3cda6256585836f8f68f19214ff98","schema_version":"1.0","event_id":"sha256:10977b22c3bcd2cea28af294f04fe79de8b3cda6256585836f8f68f19214ff98"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2018:ISH6UIVORAVBZEMMQARU4NWKCB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"An Empirical Analysis of Deep Audio-Visual Models for Speech Recognition","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Devesh Walawalkar, Rohit Pillai, Yihui He","submitted_at":"2018-12-21T19:02:52Z","abstract_excerpt":"In this project, we worked on speech recognition, specifically predicting individual words based on both the video frames and audio. Empowered by convolutional neural networks, the recent speech recognition and lip reading models are comparable to human level performance. We re-implemented and made derivations of the state-of-the-art model. Then, we conducted rich experiments including the effectiveness of attention mechanism, more accurate residual network as the backbone with pre-trained weights and the sensitivity of our model with respect to audio input with/without noise."},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.09336","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:57:30Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HExKuFHRnDdfRO1OpKimwRPYk62aHApJfR9kvHczTsKlYY26UpaI/+Ayo15h3CXg3dKedcEm9ovw2AyLz8NXBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T10:52:56.977611Z"},"content_sha256":"87da5f4416a8ac1a314951b3f0795425fa17a5b2e9cf446a34c47e2a5f8b7522","schema_version":"1.0","event_id":"sha256:87da5f4416a8ac1a314951b3f0795425fa17a5b2e9cf446a34c47e2a5f8b7522"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/ISH6UIVORAVBZEMMQARU4NWKCB/bundle.json","state_url":"https://pith.science/pith/ISH6UIVORAVBZEMMQARU4NWKCB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/ISH6UIVORAVBZEMMQARU4NWKCB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T10:52:56Z","links":{"resolver":"https://pith.science/pith/ISH6UIVORAVBZEMMQARU4NWKCB","bundle":"https://pith.science/pith/ISH6UIVORAVBZEMMQARU4NWKCB/bundle.json","state":"https://pith.science/pith/ISH6UIVORAVBZEMMQARU4NWKCB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/ISH6UIVORAVBZEMMQARU4NWKCB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:ISH6UIVORAVBZEMMQARU4NWKCB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3108904f648c873ff4b2167374cfd451ff3acee8bdbe0cd9f6c38cce225707f5","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-12-21T19:02:52Z","title_canon_sha256":"bc1940423bee4da71c5767a042e6604be730da681ad09662306422e04b729939"},"schema_version":"1.0","source":{"id":"1812.09336","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1812.09336","created_at":"2026-05-17T23:57:30Z"},{"alias_kind":"arxiv_version","alias_value":"1812.09336v1","created_at":"2026-05-17T23:57:30Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.09336","created_at":"2026-05-17T23:57:30Z"},{"alias_kind":"pith_short_12","alias_value":"ISH6UIVORAVB","created_at":"2026-05-18T12:32:31Z"},{"alias_kind":"pith_short_16","alias_value":"ISH6UIVORAVBZEMM","created_at":"2026-05-18T12:32:31Z"},{"alias_kind":"pith_short_8","alias_value":"ISH6UIVO","created_at":"2026-05-18T12:32:31Z"}],"graph_snapshots":[{"event_id":"sha256:87da5f4416a8ac1a314951b3f0795425fa17a5b2e9cf446a34c47e2a5f8b7522","target":"graph","created_at":"2026-05-17T23:57:30Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"In this project, we worked on speech recognition, specifically predicting individual words based on both the video frames and audio. Empowered by convolutional neural networks, the recent speech recognition and lip reading models are comparable to human level performance. We re-implemented and made derivations of the state-of-the-art model. Then, we conducted rich experiments including the effectiveness of attention mechanism, more accurate residual network as the backbone with pre-trained weights and the sensitivity of our model with respect to audio input with/without noise.","authors_text":"Devesh Walawalkar, Rohit Pillai, Yihui He","cross_cats":["cs.CL"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-12-21T19:02:52Z","title":"An Empirical Analysis of Deep Audio-Visual Models for Speech Recognition"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.09336","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:10977b22c3bcd2cea28af294f04fe79de8b3cda6256585836f8f68f19214ff98","target":"record","created_at":"2026-05-17T23:57:30Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3108904f648c873ff4b2167374cfd451ff3acee8bdbe0cd9f6c38cce225707f5","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-12-21T19:02:52Z","title_canon_sha256":"bc1940423bee4da71c5767a042e6604be730da681ad09662306422e04b729939"},"schema_version":"1.0","source":{"id":"1812.09336","kind":"arxiv","version":1}},"canonical_sha256":"448fea22ae882a1c918c80234e36ca1079df4f9bbab54c66790f597a203c25fc","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"448fea22ae882a1c918c80234e36ca1079df4f9bbab54c66790f597a203c25fc","first_computed_at":"2026-05-17T23:57:30.349019Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:57:30.349019Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"FaO18GpTNQWxzxD+24e7blz3vWjVotI4EtqAf3Yov3Y/KFoTYPE4EuDuEK+bTdjVH+vH6DJ+s+4B5TfI/PChDA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:57:30.349644Z","signed_message":"canonical_sha256_bytes"},"source_id":"1812.09336","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:10977b22c3bcd2cea28af294f04fe79de8b3cda6256585836f8f68f19214ff98","sha256:87da5f4416a8ac1a314951b3f0795425fa17a5b2e9cf446a34c47e2a5f8b7522"],"state_sha256":"3aed226e05d474c990fc95743126f31353e4272abfb4eca89ad7a49730af6391"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1MCO1EpvVbp/vaJtN/BZD1+qNn3f0Bz5rfm8O4KiER5DB/vlO10EXedHPPVZsLuhcm/Km2N/+m3a2+zXcg+IBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T10:52:56.980515Z","bundle_sha256":"900a5ed3b349b471e007159b1aa2c44eb6526c8ee481b6cccee98dfeb95430c3"}}