{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2014:AEUK2734DUJKOTA2XFDSKQDVKN","short_pith_number":"pith:AEUK2734","schema_version":"1.0","canonical_sha256":"0128ad7f7c1d12a74c1ab9472540755368167b75d1248d248776e3bfe58e5d89","source":{"kind":"arxiv","id":"1412.7755","version":2},"attestation_state":"computed","paper":{"title":"Multiple Object Recognition with Visual Attention","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.NE"],"primary_cat":"cs.LG","authors_text":"Jimmy Ba, Koray Kavukcuoglu, Volodymyr Mnih","submitted_at":"2014-12-24T20:58:23Z","abstract_excerpt":"We present an attention-based model for recognizing multiple objects in images. The proposed model is a deep recurrent neural network trained with reinforcement learning to attend to the most relevant regions of the input image. We show that the model learns to both localize and recognize multiple objects despite being given only class labels during training. We evaluate the model on the challenging task of transcribing house number sequences from Google Street View images and show that it is both more accurate than the state-of-the-art convolutional networks and uses fewer parameters and less"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1412.7755","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2014-12-24T20:58:23Z","cross_cats_sorted":["cs.CV","cs.NE"],"title_canon_sha256":"45f4563674baaf358ceed5ade5406accc0500b7d98632fcbf3cc340765625f1e","abstract_canon_sha256":"26da5860e4d7c078557b5df506f4b89fbaf053d67f5e1915df70e1017fba41d3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:18:03.661677Z","signature_b64":"01QeyRCe72THb8ynLOz/z8NyQqg8QbgRl2ptwH98jApqgixlFnvNqZFph4o1JVORy7ixrggProOfKgUelRs7BA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0128ad7f7c1d12a74c1ab9472540755368167b75d1248d248776e3bfe58e5d89","last_reissued_at":"2026-05-18T02:18:03.660940Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:18:03.660940Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Multiple Object Recognition with Visual Attention","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.NE"],"primary_cat":"cs.LG","authors_text":"Jimmy Ba, Koray Kavukcuoglu, Volodymyr Mnih","submitted_at":"2014-12-24T20:58:23Z","abstract_excerpt":"We present an attention-based model for recognizing multiple objects in images. The proposed model is a deep recurrent neural network trained with reinforcement learning to attend to the most relevant regions of the input image. We show that the model learns to both localize and recognize multiple objects despite being given only class labels during training. We evaluate the model on the challenging task of transcribing house number sequences from Google Street View images and show that it is both more accurate than the state-of-the-art convolutional networks and uses fewer parameters and less"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1412.7755","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1412.7755","created_at":"2026-05-18T02:18:03.661065+00:00"},{"alias_kind":"arxiv_version","alias_value":"1412.7755v2","created_at":"2026-05-18T02:18:03.661065+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1412.7755","created_at":"2026-05-18T02:18:03.661065+00:00"},{"alias_kind":"pith_short_12","alias_value":"AEUK2734DUJK","created_at":"2026-05-18T12:28:19.803747+00:00"},{"alias_kind":"pith_short_16","alias_value":"AEUK2734DUJKOTA2","created_at":"2026-05-18T12:28:19.803747+00:00"},{"alias_kind":"pith_short_8","alias_value":"AEUK2734","created_at":"2026-05-18T12:28:19.803747+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":5,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"1906.12188","citing_title":"A Deep Decoder Structure Based on WordEmbedding Regression for An Encoder-Decoder Based Model for Image Captioning","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"1907.00664","citing_title":"Learning World Graphs to Accelerate Hierarchical Reinforcement Learning","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"1907.01193","citing_title":"Inverse Attention Guided Deep Crowd Counting Network","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"1907.02136","citing_title":"Learning Blended, Precise Semantic Program Embeddings","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03413","citing_title":"Learning to Theorize the World from Observation","ref_index":249,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN","json":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN.json","graph_json":"https://pith.science/api/pith-number/AEUK2734DUJKOTA2XFDSKQDVKN/graph.json","events_json":"https://pith.science/api/pith-number/AEUK2734DUJKOTA2XFDSKQDVKN/events.json","paper":"https://pith.science/paper/AEUK2734"},"agent_actions":{"view_html":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN","download_json":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN.json","view_paper":"https://pith.science/paper/AEUK2734","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1412.7755&json=true","fetch_graph":"https://pith.science/api/pith-number/AEUK2734DUJKOTA2XFDSKQDVKN/graph.json","fetch_events":"https://pith.science/api/pith-number/AEUK2734DUJKOTA2XFDSKQDVKN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN/action/storage_attestation","attest_author":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN/action/author_attestation","sign_citation":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN/action/citation_signature","submit_replication":"https://pith.science/pith/AEUK2734DUJKOTA2XFDSKQDVKN/action/replication_record"}},"created_at":"2026-05-18T02:18:03.661065+00:00","updated_at":"2026-05-18T02:18:03.661065+00:00"}