{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2018:6FYMXSF6YZTVDGBD73OPRXY4QL","short_pith_number":"pith:6FYMXSF6","canonical_record":{"source":{"id":"1811.03865","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-11-09T11:30:11Z","cross_cats_sorted":[],"title_canon_sha256":"2900b0af2b27a3dc583bc6de8cc2a385c4ef740bce451211a1107b9e9ae7eaff","abstract_canon_sha256":"d825bb250e22245aac21a887057ffaafd88c357b15d608ea6f61d2244b797d42"},"schema_version":"1.0"},"canonical_sha256":"f170cbc8bec667519823fedcf8df1c82ca3a2d1cd3adc92fd5a645212705fdf6","source":{"kind":"arxiv","id":"1811.03865","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1811.03865","created_at":"2026-05-17T23:53:09Z"},{"alias_kind":"arxiv_version","alias_value":"1811.03865v2","created_at":"2026-05-17T23:53:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.03865","created_at":"2026-05-17T23:53:09Z"},{"alias_kind":"pith_short_12","alias_value":"6FYMXSF6YZTV","created_at":"2026-05-18T12:32:08Z"},{"alias_kind":"pith_short_16","alias_value":"6FYMXSF6YZTVDGBD","created_at":"2026-05-18T12:32:08Z"},{"alias_kind":"pith_short_8","alias_value":"6FYMXSF6","created_at":"2026-05-18T12:32:08Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2018:6FYMXSF6YZTVDGBD73OPRXY4QL","target":"record","payload":{"canonical_record":{"source":{"id":"1811.03865","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-11-09T11:30:11Z","cross_cats_sorted":[],"title_canon_sha256":"2900b0af2b27a3dc583bc6de8cc2a385c4ef740bce451211a1107b9e9ae7eaff","abstract_canon_sha256":"d825bb250e22245aac21a887057ffaafd88c357b15d608ea6f61d2244b797d42"},"schema_version":"1.0"},"canonical_sha256":"f170cbc8bec667519823fedcf8df1c82ca3a2d1cd3adc92fd5a645212705fdf6","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:53:09.030454Z","signature_b64":"JrmAzMtSWy6mcoxO/oDWEOyvRBN2RoK1/FNcu59kOvB5Ku/05+QV6gpR4cHjEF6RkbsgkJFwimpMwv0jozQlDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f170cbc8bec667519823fedcf8df1c82ca3a2d1cd3adc92fd5a645212705fdf6","last_reissued_at":"2026-05-17T23:53:09.029910Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:53:09.029910Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1811.03865","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:53:09Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"36RBAR7NKiRyYiS1QVS6xlLrTKYlIF0EwPO2w6uiVqvgP7pI1L61mbwLvM++z5KijN3Z82HPvEbbgqcKgqwtBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T08:11:26.037249Z"},"content_sha256":"73d1f23d0899c25643486393a9f4d85f4b9cacc18fa06a88b7a47f0263de88b6","schema_version":"1.0","event_id":"sha256:73d1f23d0899c25643486393a9f4d85f4b9cacc18fa06a88b7a47f0263de88b6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2018:6FYMXSF6YZTVDGBD73OPRXY4QL","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Multimodal Grounding for Sequence-to-Sequence Speech Recognition","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Florian Metze, Lo\\\"ic Barrault, Ozan Caglayan, Ramon Sanabria, Shruti Palaskar","submitted_at":"2018-11-09T11:30:11Z","abstract_excerpt":"Humans are capable of processing speech by making use of multiple sensory modalities. For example, the environment where a conversation takes place generally provides semantic and/or acoustic context that helps us to resolve ambiguities or to recall named entities. Motivated by this, there have been many works studying the integration of visual information into the speech recognition pipeline. Specifically, in our previous work, we propose a multistep visual adaptive training approach which improves the accuracy of an audio-based Automatic Speech Recognition (ASR) system. This approach, howeve"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.03865","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:53:09Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"eUjuWU4x0QWAYFoeAsl/D/PIGE3G7ODdvnOZ+2Vx7Z6OZfFiiJDPhXGEaOfjzjOvRdbBY025sMs+awyPnYxNBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T08:11:26.037939Z"},"content_sha256":"0a7c0bd7bc782976b5bc2dfa0d6d8907189fb7c456ecbafe24e7cde18c12aa84","schema_version":"1.0","event_id":"sha256:0a7c0bd7bc782976b5bc2dfa0d6d8907189fb7c456ecbafe24e7cde18c12aa84"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/6FYMXSF6YZTVDGBD73OPRXY4QL/bundle.json","state_url":"https://pith.science/pith/6FYMXSF6YZTVDGBD73OPRXY4QL/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/6FYMXSF6YZTVDGBD73OPRXY4QL/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T08:11:26Z","links":{"resolver":"https://pith.science/pith/6FYMXSF6YZTVDGBD73OPRXY4QL","bundle":"https://pith.science/pith/6FYMXSF6YZTVDGBD73OPRXY4QL/bundle.json","state":"https://pith.science/pith/6FYMXSF6YZTVDGBD73OPRXY4QL/state.json","well_known_bundle":"https://pith.science/.well-known/pith/6FYMXSF6YZTVDGBD73OPRXY4QL/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:6FYMXSF6YZTVDGBD73OPRXY4QL","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"d825bb250e22245aac21a887057ffaafd88c357b15d608ea6f61d2244b797d42","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-11-09T11:30:11Z","title_canon_sha256":"2900b0af2b27a3dc583bc6de8cc2a385c4ef740bce451211a1107b9e9ae7eaff"},"schema_version":"1.0","source":{"id":"1811.03865","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1811.03865","created_at":"2026-05-17T23:53:09Z"},{"alias_kind":"arxiv_version","alias_value":"1811.03865v2","created_at":"2026-05-17T23:53:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.03865","created_at":"2026-05-17T23:53:09Z"},{"alias_kind":"pith_short_12","alias_value":"6FYMXSF6YZTV","created_at":"2026-05-18T12:32:08Z"},{"alias_kind":"pith_short_16","alias_value":"6FYMXSF6YZTVDGBD","created_at":"2026-05-18T12:32:08Z"},{"alias_kind":"pith_short_8","alias_value":"6FYMXSF6","created_at":"2026-05-18T12:32:08Z"}],"graph_snapshots":[{"event_id":"sha256:0a7c0bd7bc782976b5bc2dfa0d6d8907189fb7c456ecbafe24e7cde18c12aa84","target":"graph","created_at":"2026-05-17T23:53:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Humans are capable of processing speech by making use of multiple sensory modalities. For example, the environment where a conversation takes place generally provides semantic and/or acoustic context that helps us to resolve ambiguities or to recall named entities. Motivated by this, there have been many works studying the integration of visual information into the speech recognition pipeline. Specifically, in our previous work, we propose a multistep visual adaptive training approach which improves the accuracy of an audio-based Automatic Speech Recognition (ASR) system. This approach, howeve","authors_text":"Florian Metze, Lo\\\"ic Barrault, Ozan Caglayan, Ramon Sanabria, Shruti Palaskar","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-11-09T11:30:11Z","title":"Multimodal Grounding for Sequence-to-Sequence Speech Recognition"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.03865","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:73d1f23d0899c25643486393a9f4d85f4b9cacc18fa06a88b7a47f0263de88b6","target":"record","created_at":"2026-05-17T23:53:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"d825bb250e22245aac21a887057ffaafd88c357b15d608ea6f61d2244b797d42","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-11-09T11:30:11Z","title_canon_sha256":"2900b0af2b27a3dc583bc6de8cc2a385c4ef740bce451211a1107b9e9ae7eaff"},"schema_version":"1.0","source":{"id":"1811.03865","kind":"arxiv","version":2}},"canonical_sha256":"f170cbc8bec667519823fedcf8df1c82ca3a2d1cd3adc92fd5a645212705fdf6","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"f170cbc8bec667519823fedcf8df1c82ca3a2d1cd3adc92fd5a645212705fdf6","first_computed_at":"2026-05-17T23:53:09.029910Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:53:09.029910Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"JrmAzMtSWy6mcoxO/oDWEOyvRBN2RoK1/FNcu59kOvB5Ku/05+QV6gpR4cHjEF6RkbsgkJFwimpMwv0jozQlDA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:53:09.030454Z","signed_message":"canonical_sha256_bytes"},"source_id":"1811.03865","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:73d1f23d0899c25643486393a9f4d85f4b9cacc18fa06a88b7a47f0263de88b6","sha256:0a7c0bd7bc782976b5bc2dfa0d6d8907189fb7c456ecbafe24e7cde18c12aa84"],"state_sha256":"489ff7c94009f75474b57fa3061cdec4cc99a66d499ebe2f761bafb3b91e4327"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"iKnyBSPOFoPqiWUnb1HvG5FcGqc0GUqC1G1NSPvzyUOKuT0HIGGJU5/xaU8dsam7PvGCMDg6SlDdSUXyjepcDQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T08:11:26.041831Z","bundle_sha256":"4efbe1f475db1ef42a878041aa88991ff84ded4548d304dbae0bbe14037df3a6"}}