{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2016:4S2ZHFU5EMDXIJCD5Z5GZWRPZK","short_pith_number":"pith:4S2ZHFU5","schema_version":"1.0","canonical_sha256":"e4b593969d2307742443ee7a6cda2fca815719f711014f90499fe65c0b3b7e7b","source":{"kind":"arxiv","id":"1606.01847","version":3},"attestation_state":"computed","paper":{"title":"Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Akira Fukui, Anna Rohrbach, Daylen Yang, Dong Huk Park, Marcus Rohrbach, Trevor Darrell","submitted_at":"2016-06-06T17:59:56Z","abstract_excerpt":"Modeling textual or visual information with vector representations trained from large language or visual datasets has been successfully explored in recent years. However, tasks such as visual question answering require combining these vector representations with each other. Approaches to multimodal pooling include element-wise product or sum, as well as concatenation of the visual and textual representations. We hypothesize that these methods are not as expressive as an outer product of the visual and textual vectors. As the outer product is typically infeasible due to its high dimensionality,"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1606.01847","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2016-06-06T17:59:56Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"0f2ef823ee715fe2134df9d71899d99fc7bb9dee667f38f58a58b0eac435dfd4","abstract_canon_sha256":"a477140e8f312b07325b5f805786bc6194caf15f0832cf595c26168869258bbf"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:03:58.850165Z","signature_b64":"jZEfVOGUqfeSwShOuebQRYy2BO21W9viIW1pZd5uTZE6WEGN8hUoGUkNgcOq4EWdpWtU0xZnYy5Z4yVqn6VZAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e4b593969d2307742443ee7a6cda2fca815719f711014f90499fe65c0b3b7e7b","last_reissued_at":"2026-05-18T01:03:58.849434Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:03:58.849434Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Akira Fukui, Anna Rohrbach, Daylen Yang, Dong Huk Park, Marcus Rohrbach, Trevor Darrell","submitted_at":"2016-06-06T17:59:56Z","abstract_excerpt":"Modeling textual or visual information with vector representations trained from large language or visual datasets has been successfully explored in recent years. However, tasks such as visual question answering require combining these vector representations with each other. Approaches to multimodal pooling include element-wise product or sum, as well as concatenation of the visual and textual representations. We hypothesize that these methods are not as expressive as an outer product of the visual and textual vectors. As the outer product is typically infeasible due to its high dimensionality,"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1606.01847","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1606.01847","created_at":"2026-05-18T01:03:58.849557+00:00"},{"alias_kind":"arxiv_version","alias_value":"1606.01847v3","created_at":"2026-05-18T01:03:58.849557+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1606.01847","created_at":"2026-05-18T01:03:58.849557+00:00"},{"alias_kind":"pith_short_12","alias_value":"4S2ZHFU5EMDX","created_at":"2026-05-18T12:29:58.707656+00:00"},{"alias_kind":"pith_short_16","alias_value":"4S2ZHFU5EMDXIJCD","created_at":"2026-05-18T12:29:58.707656+00:00"},{"alias_kind":"pith_short_8","alias_value":"4S2ZHFU5","created_at":"2026-05-18T12:29:58.707656+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"1906.10770","citing_title":"Deep Modular Co-Attention Networks for Visual Question Answering","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"1907.02665","citing_title":"Blind Image Quality Assessment Using A Deep Bilinear Convolutional Neural Network","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2003.10286","citing_title":"PathVQA: 30000+ Questions for Medical Visual Question Answering","ref_index":23,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK","json":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK.json","graph_json":"https://pith.science/api/pith-number/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/graph.json","events_json":"https://pith.science/api/pith-number/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/events.json","paper":"https://pith.science/paper/4S2ZHFU5"},"agent_actions":{"view_html":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK","download_json":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK.json","view_paper":"https://pith.science/paper/4S2ZHFU5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1606.01847&json=true","fetch_graph":"https://pith.science/api/pith-number/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/graph.json","fetch_events":"https://pith.science/api/pith-number/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/action/storage_attestation","attest_author":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/action/author_attestation","sign_citation":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/action/citation_signature","submit_replication":"https://pith.science/pith/4S2ZHFU5EMDXIJCD5Z5GZWRPZK/action/replication_record"}},"created_at":"2026-05-18T01:03:58.849557+00:00","updated_at":"2026-05-18T01:03:58.849557+00:00"}