{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:ZMW73WRQVPO4LNHMTTASZVCOL2","short_pith_number":"pith:ZMW73WRQ","schema_version":"1.0","canonical_sha256":"cb2dfdda30abddc5b4ec9cc12cd44e5ea9a3352c8935254a3e06e58fe817e339","source":{"kind":"arxiv","id":"1709.07992","version":3},"attestation_state":"computed","paper":{"title":"Visual Reference Resolution using Attention Memory for Visual Dialog","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andreas Lehrmann, Bohyung Han, Leonid Sigal, Paul Hongsuck Seo","submitted_at":"2017-09-23T02:53:48Z","abstract_excerpt":"Visual dialog is a task of answering a series of inter-dependent questions given an input image, and often requires to resolve visual references among the questions. This problem is different from visual question answering (VQA), which relies on spatial attention (a.k.a. visual grounding) estimated from an image and question pair. We propose a novel attention mechanism that exploits visual attentions in the past to resolve the current reference in the visual dialog scenario. The proposed model is equipped with an associative attention memory storing a sequence of previous (attention, key) pair"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1709.07992","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-09-23T02:53:48Z","cross_cats_sorted":[],"title_canon_sha256":"9f6bb4ad5b49bcad50706dee702a41ae5d6aeb00110c1ae4208660082d63884e","abstract_canon_sha256":"79ace571d6361786697a0bf8f09a7ac8dc8a101e6898a0bfd3ac698302e9e402"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:08:48.458538Z","signature_b64":"oEnIfLaCkkVHa0gj4BaHIIVbzGGkRBMmlS2g61pUD9jLoLvydVNtun5VMifHr2kt+6jjBjv0obegqM1xRE/VCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"cb2dfdda30abddc5b4ec9cc12cd44e5ea9a3352c8935254a3e06e58fe817e339","last_reissued_at":"2026-05-18T00:08:48.457992Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:08:48.457992Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Visual Reference Resolution using Attention Memory for Visual Dialog","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Andreas Lehrmann, Bohyung Han, Leonid Sigal, Paul Hongsuck Seo","submitted_at":"2017-09-23T02:53:48Z","abstract_excerpt":"Visual dialog is a task of answering a series of inter-dependent questions given an input image, and often requires to resolve visual references among the questions. This problem is different from visual question answering (VQA), which relies on spatial attention (a.k.a. visual grounding) estimated from an image and question pair. We propose a novel attention mechanism that exploits visual attentions in the past to resolve the current reference in the visual dialog scenario. The proposed model is equipped with an associative attention memory storing a sequence of previous (attention, key) pair"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1709.07992","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1709.07992","created_at":"2026-05-18T00:08:48.458069+00:00"},{"alias_kind":"arxiv_version","alias_value":"1709.07992v3","created_at":"2026-05-18T00:08:48.458069+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1709.07992","created_at":"2026-05-18T00:08:48.458069+00:00"},{"alias_kind":"pith_short_12","alias_value":"ZMW73WRQVPO4","created_at":"2026-05-18T12:31:59.375834+00:00"},{"alias_kind":"pith_short_16","alias_value":"ZMW73WRQVPO4LNHM","created_at":"2026-05-18T12:31:59.375834+00:00"},{"alias_kind":"pith_short_8","alias_value":"ZMW73WRQ","created_at":"2026-05-18T12:31:59.375834+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2","json":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2.json","graph_json":"https://pith.science/api/pith-number/ZMW73WRQVPO4LNHMTTASZVCOL2/graph.json","events_json":"https://pith.science/api/pith-number/ZMW73WRQVPO4LNHMTTASZVCOL2/events.json","paper":"https://pith.science/paper/ZMW73WRQ"},"agent_actions":{"view_html":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2","download_json":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2.json","view_paper":"https://pith.science/paper/ZMW73WRQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1709.07992&json=true","fetch_graph":"https://pith.science/api/pith-number/ZMW73WRQVPO4LNHMTTASZVCOL2/graph.json","fetch_events":"https://pith.science/api/pith-number/ZMW73WRQVPO4LNHMTTASZVCOL2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2/action/storage_attestation","attest_author":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2/action/author_attestation","sign_citation":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2/action/citation_signature","submit_replication":"https://pith.science/pith/ZMW73WRQVPO4LNHMTTASZVCOL2/action/replication_record"}},"created_at":"2026-05-18T00:08:48.458069+00:00","updated_at":"2026-05-18T00:08:48.458069+00:00"}