{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2018:74JXOMBRDFLRVE6J2WVLF7W43U","short_pith_number":"pith:74JXOMBR","canonical_record":{"source":{"id":"1812.02872","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2018-12-07T01:57:42Z","cross_cats_sorted":[],"title_canon_sha256":"6367fbbe99512168142e4090cedd9d5c95a403b38c414a04feb1d57126397415","abstract_canon_sha256":"48c36e4980192c95a9d4b7d25b38dbe5052db556b31a32797f2d7b339f5c9299"},"schema_version":"1.0"},"canonical_sha256":"ff1377303119571a93c9d5aab2fedcdd2ec0ac07362228adb6f568575437faa5","source":{"kind":"arxiv","id":"1812.02872","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1812.02872","created_at":"2026-05-17T23:58:51Z"},{"alias_kind":"arxiv_version","alias_value":"1812.02872v1","created_at":"2026-05-17T23:58:51Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.02872","created_at":"2026-05-17T23:58:51Z"},{"alias_kind":"pith_short_12","alias_value":"74JXOMBRDFLR","created_at":"2026-05-18T12:32:11Z"},{"alias_kind":"pith_short_16","alias_value":"74JXOMBRDFLRVE6J","created_at":"2026-05-18T12:32:11Z"},{"alias_kind":"pith_short_8","alias_value":"74JXOMBR","created_at":"2026-05-18T12:32:11Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2018:74JXOMBRDFLRVE6J2WVLF7W43U","target":"record","payload":{"canonical_record":{"source":{"id":"1812.02872","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2018-12-07T01:57:42Z","cross_cats_sorted":[],"title_canon_sha256":"6367fbbe99512168142e4090cedd9d5c95a403b38c414a04feb1d57126397415","abstract_canon_sha256":"48c36e4980192c95a9d4b7d25b38dbe5052db556b31a32797f2d7b339f5c9299"},"schema_version":"1.0"},"canonical_sha256":"ff1377303119571a93c9d5aab2fedcdd2ec0ac07362228adb6f568575437faa5","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:58:51.589790Z","signature_b64":"yXDl+gBgH5cbb1zu7oObLh+gGRG9hl5QoLZUX+J04FZvPli0zoeBz0IGpG8nLF0BfsSf4eWQxOCJ6kyecteqCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ff1377303119571a93c9d5aab2fedcdd2ec0ac07362228adb6f568575437faa5","last_reissued_at":"2026-05-17T23:58:51.589234Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:58:51.589234Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1812.02872","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:58:51Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"I0NjtxHlCE7bkYfaQgYxhJ1AdeT4D6vHvok+8RixTwzz1OWgzn3W9P3rwIPlTv78EDJ8xuwYY8GaiSZn0NzuAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T10:09:55.361457Z"},"content_sha256":"eda345797cc9459ee5df7af34b82ac7f8396c38502e73751ddb650c155e94ef6","schema_version":"1.0","event_id":"sha256:eda345797cc9459ee5df7af34b82ac7f8396c38502e73751ddb650c155e94ef6"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2018:74JXOMBRDFLRVE6J2WVLF7W43U","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"An Attempt towards Interpretable Audio-Visual Video Captioning","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chenliang Xu, Chenxiao Guan, Justin Goodman, Marc Moore, Yapeng Tian","submitted_at":"2018-12-07T01:57:42Z","abstract_excerpt":"Automatically generating a natural language sentence to describe the content of an input video is a very challenging problem. It is an essential multimodal task in which auditory and visual contents are equally important. Although audio information has been exploited to improve video captioning in previous works, it is usually regarded as an additional feature fed into a black box fusion machine. How are the words in the generated sentences associated with the auditory and visual modalities? The problem is still not investigated. In this paper, we make the first attempt to design an interpreta"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.02872","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:58:51Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"bqPyHkxTnxanhAKDU6uOTx2YGMWdGjKAx3eMRYm0z0ZgQpEZlbJU/N9DbCeqgn3dzrHg8vuy0uSmBXc7VBvCBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T10:09:55.361971Z"},"content_sha256":"7408004bd6b395490e993cba75bec727e1f53fa9a7a4598d112d80f40b826966","schema_version":"1.0","event_id":"sha256:7408004bd6b395490e993cba75bec727e1f53fa9a7a4598d112d80f40b826966"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/74JXOMBRDFLRVE6J2WVLF7W43U/bundle.json","state_url":"https://pith.science/pith/74JXOMBRDFLRVE6J2WVLF7W43U/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/74JXOMBRDFLRVE6J2WVLF7W43U/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T10:09:55Z","links":{"resolver":"https://pith.science/pith/74JXOMBRDFLRVE6J2WVLF7W43U","bundle":"https://pith.science/pith/74JXOMBRDFLRVE6J2WVLF7W43U/bundle.json","state":"https://pith.science/pith/74JXOMBRDFLRVE6J2WVLF7W43U/state.json","well_known_bundle":"https://pith.science/.well-known/pith/74JXOMBRDFLRVE6J2WVLF7W43U/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2018:74JXOMBRDFLRVE6J2WVLF7W43U","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"48c36e4980192c95a9d4b7d25b38dbe5052db556b31a32797f2d7b339f5c9299","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2018-12-07T01:57:42Z","title_canon_sha256":"6367fbbe99512168142e4090cedd9d5c95a403b38c414a04feb1d57126397415"},"schema_version":"1.0","source":{"id":"1812.02872","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1812.02872","created_at":"2026-05-17T23:58:51Z"},{"alias_kind":"arxiv_version","alias_value":"1812.02872v1","created_at":"2026-05-17T23:58:51Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1812.02872","created_at":"2026-05-17T23:58:51Z"},{"alias_kind":"pith_short_12","alias_value":"74JXOMBRDFLR","created_at":"2026-05-18T12:32:11Z"},{"alias_kind":"pith_short_16","alias_value":"74JXOMBRDFLRVE6J","created_at":"2026-05-18T12:32:11Z"},{"alias_kind":"pith_short_8","alias_value":"74JXOMBR","created_at":"2026-05-18T12:32:11Z"}],"graph_snapshots":[{"event_id":"sha256:7408004bd6b395490e993cba75bec727e1f53fa9a7a4598d112d80f40b826966","target":"graph","created_at":"2026-05-17T23:58:51Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Automatically generating a natural language sentence to describe the content of an input video is a very challenging problem. It is an essential multimodal task in which auditory and visual contents are equally important. Although audio information has been exploited to improve video captioning in previous works, it is usually regarded as an additional feature fed into a black box fusion machine. How are the words in the generated sentences associated with the auditory and visual modalities? The problem is still not investigated. In this paper, we make the first attempt to design an interpreta","authors_text":"Chenliang Xu, Chenxiao Guan, Justin Goodman, Marc Moore, Yapeng Tian","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2018-12-07T01:57:42Z","title":"An Attempt towards Interpretable Audio-Visual Video Captioning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1812.02872","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:eda345797cc9459ee5df7af34b82ac7f8396c38502e73751ddb650c155e94ef6","target":"record","created_at":"2026-05-17T23:58:51Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"48c36e4980192c95a9d4b7d25b38dbe5052db556b31a32797f2d7b339f5c9299","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2018-12-07T01:57:42Z","title_canon_sha256":"6367fbbe99512168142e4090cedd9d5c95a403b38c414a04feb1d57126397415"},"schema_version":"1.0","source":{"id":"1812.02872","kind":"arxiv","version":1}},"canonical_sha256":"ff1377303119571a93c9d5aab2fedcdd2ec0ac07362228adb6f568575437faa5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"ff1377303119571a93c9d5aab2fedcdd2ec0ac07362228adb6f568575437faa5","first_computed_at":"2026-05-17T23:58:51.589234Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:58:51.589234Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"yXDl+gBgH5cbb1zu7oObLh+gGRG9hl5QoLZUX+J04FZvPli0zoeBz0IGpG8nLF0BfsSf4eWQxOCJ6kyecteqCg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:58:51.589790Z","signed_message":"canonical_sha256_bytes"},"source_id":"1812.02872","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:eda345797cc9459ee5df7af34b82ac7f8396c38502e73751ddb650c155e94ef6","sha256:7408004bd6b395490e993cba75bec727e1f53fa9a7a4598d112d80f40b826966"],"state_sha256":"828ee62a9bc2924dfd4808e927112c8adcd6b3596631c44d92c0ffbbc6f95a9f"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"sObXP2lPgsyrK1B/gKQwMwQF+EyVdWMvxu2I24bGC+iX50z0152Fstk+dmy94SMb9anH+As8AwLhFf3pKIlADQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T10:09:55.365574Z","bundle_sha256":"1ce5c6474ab62ebfdf6ddbbef9191f83211b08191a71d4dab7a9e280e9c83fb1"}}