{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2015:TULD3PSWZUJLHFZX2W6GWOCC6O","short_pith_number":"pith:TULD3PSW","schema_version":"1.0","canonical_sha256":"9d163dbe56cd12b39737d5bc6b3842f3add2e02e9409a56fe85bd07a91cbfd50","source":{"kind":"arxiv","id":"1504.06063","version":5},"attestation_state":"computed","paper":{"title":"Multimodal Convolutional Neural Networks for Matching Image and Sentence","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.NE"],"primary_cat":"cs.CV","authors_text":"Hang Li, Lifeng Shang, Lin Ma, Zhengdong Lu","submitted_at":"2015-04-23T07:10:13Z","abstract_excerpt":"In this paper, we propose multimodal convolutional neural networks (m-CNNs) for matching image and sentence. Our m-CNN provides an end-to-end framework with convolutional architectures to exploit image representation, word composition, and the matching relations between the two modalities. More specifically, it consists of one image CNN encoding the image content, and one matching CNN learning the joint representation of image and sentence. The matching CNN composes words to different semantic fragments and learns the inter-modal relations between image and the composed fragments at different "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1504.06063","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-04-23T07:10:13Z","cross_cats_sorted":["cs.CL","cs.NE"],"title_canon_sha256":"b622f563e41976869173d0aff26ad7928d9f91854a32e00229344a55fdf5c6d7","abstract_canon_sha256":"7b4587765815b4ec1a411882622eb4251bc38b30f841f1b7e63e48930a48221d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:34:34.794664Z","signature_b64":"7Fx9XflhxWdmc5P1tcfmiFYIDyZ/JEG1BMw8RTqU5Sgo5S5pBfexOTbWh98i8eqi31F4Z99Y+YG433Ip965QDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9d163dbe56cd12b39737d5bc6b3842f3add2e02e9409a56fe85bd07a91cbfd50","last_reissued_at":"2026-05-18T01:34:34.794158Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:34:34.794158Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Multimodal Convolutional Neural Networks for Matching Image and Sentence","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.NE"],"primary_cat":"cs.CV","authors_text":"Hang Li, Lifeng Shang, Lin Ma, Zhengdong Lu","submitted_at":"2015-04-23T07:10:13Z","abstract_excerpt":"In this paper, we propose multimodal convolutional neural networks (m-CNNs) for matching image and sentence. Our m-CNN provides an end-to-end framework with convolutional architectures to exploit image representation, word composition, and the matching relations between the two modalities. More specifically, it consists of one image CNN encoding the image content, and one matching CNN learning the joint representation of image and sentence. The matching CNN composes words to different semantic fragments and learns the inter-modal relations between image and the composed fragments at different "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1504.06063","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1504.06063","created_at":"2026-05-18T01:34:34.794237+00:00"},{"alias_kind":"arxiv_version","alias_value":"1504.06063v5","created_at":"2026-05-18T01:34:34.794237+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1504.06063","created_at":"2026-05-18T01:34:34.794237+00:00"},{"alias_kind":"pith_short_12","alias_value":"TULD3PSWZUJL","created_at":"2026-05-18T12:29:42.218222+00:00"},{"alias_kind":"pith_short_16","alias_value":"TULD3PSWZUJLHFZX","created_at":"2026-05-18T12:29:42.218222+00:00"},{"alias_kind":"pith_short_8","alias_value":"TULD3PSW","created_at":"2026-05-18T12:29:42.218222+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O","json":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O.json","graph_json":"https://pith.science/api/pith-number/TULD3PSWZUJLHFZX2W6GWOCC6O/graph.json","events_json":"https://pith.science/api/pith-number/TULD3PSWZUJLHFZX2W6GWOCC6O/events.json","paper":"https://pith.science/paper/TULD3PSW"},"agent_actions":{"view_html":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O","download_json":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O.json","view_paper":"https://pith.science/paper/TULD3PSW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1504.06063&json=true","fetch_graph":"https://pith.science/api/pith-number/TULD3PSWZUJLHFZX2W6GWOCC6O/graph.json","fetch_events":"https://pith.science/api/pith-number/TULD3PSWZUJLHFZX2W6GWOCC6O/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O/action/timestamp_anchor","attest_storage":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O/action/storage_attestation","attest_author":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O/action/author_attestation","sign_citation":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O/action/citation_signature","submit_replication":"https://pith.science/pith/TULD3PSWZUJLHFZX2W6GWOCC6O/action/replication_record"}},"created_at":"2026-05-18T01:34:34.794237+00:00","updated_at":"2026-05-18T01:34:34.794237+00:00"}