{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2016:7NYEQ4SHJQT5THW3IZH4TJPMHH","short_pith_number":"pith:7NYEQ4SH","schema_version":"1.0","canonical_sha256":"fb704872474c27d99edb464fc9a5ec39eea42ee71d132842f7656656006f944b","source":{"kind":"arxiv","id":"1611.05592","version":1},"attestation_state":"computed","paper":{"title":"Multimodal Memory Modelling for Video Captioning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Junbo Wang, Liang Wang, Tieniu Tan, Wei Wang, Yan Huang","submitted_at":"2016-11-17T07:24:03Z","abstract_excerpt":"Video captioning which automatically translates video clips into natural language sentences is a very important task in computer vision. By virtue of recent deep learning technologies, e.g., convolutional neural networks (CNNs) and recurrent neural networks (RNNs), video captioning has made great progress. However, learning an effective mapping from visual sequence space to language space is still a challenging problem. In this paper, we propose a Multimodal Memory Model (M3) to describe videos, which builds a visual and textual shared memory to model the long-term visual-textual dependency an"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1611.05592","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2016-11-17T07:24:03Z","cross_cats_sorted":[],"title_canon_sha256":"ea08a718d1d9e9462b3c6db90f84860bd73e050d19f849a87ca77478f2a88fe6","abstract_canon_sha256":"adc03e0f33d5fcde382d6a8d0c6255da41e308736f854e155e2fd862c986a2ff"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:58:03.179031Z","signature_b64":"rwDKmUD+aQ9PDMuHeO80BmfguHsqgVKYnEf37xOVK5DsiCoDVnWXf4w1XnlUA9h879DdeGgXrdOKwnHzZoGjAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fb704872474c27d99edb464fc9a5ec39eea42ee71d132842f7656656006f944b","last_reissued_at":"2026-05-18T00:58:03.178570Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:58:03.178570Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Multimodal Memory Modelling for Video Captioning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Junbo Wang, Liang Wang, Tieniu Tan, Wei Wang, Yan Huang","submitted_at":"2016-11-17T07:24:03Z","abstract_excerpt":"Video captioning which automatically translates video clips into natural language sentences is a very important task in computer vision. By virtue of recent deep learning technologies, e.g., convolutional neural networks (CNNs) and recurrent neural networks (RNNs), video captioning has made great progress. However, learning an effective mapping from visual sequence space to language space is still a challenging problem. In this paper, we propose a Multimodal Memory Model (M3) to describe videos, which builds a visual and textual shared memory to model the long-term visual-textual dependency an"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1611.05592","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1611.05592","created_at":"2026-05-18T00:58:03.178645+00:00"},{"alias_kind":"arxiv_version","alias_value":"1611.05592v1","created_at":"2026-05-18T00:58:03.178645+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1611.05592","created_at":"2026-05-18T00:58:03.178645+00:00"},{"alias_kind":"pith_short_12","alias_value":"7NYEQ4SHJQT5","created_at":"2026-05-18T12:30:04.600751+00:00"},{"alias_kind":"pith_short_16","alias_value":"7NYEQ4SHJQT5THW3","created_at":"2026-05-18T12:30:04.600751+00:00"},{"alias_kind":"pith_short_8","alias_value":"7NYEQ4SH","created_at":"2026-05-18T12:30:04.600751+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH","json":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH.json","graph_json":"https://pith.science/api/pith-number/7NYEQ4SHJQT5THW3IZH4TJPMHH/graph.json","events_json":"https://pith.science/api/pith-number/7NYEQ4SHJQT5THW3IZH4TJPMHH/events.json","paper":"https://pith.science/paper/7NYEQ4SH"},"agent_actions":{"view_html":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH","download_json":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH.json","view_paper":"https://pith.science/paper/7NYEQ4SH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1611.05592&json=true","fetch_graph":"https://pith.science/api/pith-number/7NYEQ4SHJQT5THW3IZH4TJPMHH/graph.json","fetch_events":"https://pith.science/api/pith-number/7NYEQ4SHJQT5THW3IZH4TJPMHH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH/action/storage_attestation","attest_author":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH/action/author_attestation","sign_citation":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH/action/citation_signature","submit_replication":"https://pith.science/pith/7NYEQ4SHJQT5THW3IZH4TJPMHH/action/replication_record"}},"created_at":"2026-05-18T00:58:03.178645+00:00","updated_at":"2026-05-18T00:58:03.178645+00:00"}