{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2015:YM7SLPAS55V52VEWGYRASKSASL","short_pith_number":"pith:YM7SLPAS","schema_version":"1.0","canonical_sha256":"c33f25bc12ef6bdd54963622092a4092e002847a431e16fa6a914edf7de232fa","source":{"kind":"arxiv","id":"1511.08228","version":3},"attestation_state":"computed","paper":{"title":"Neural GPUs Learn Algorithms","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.NE"],"primary_cat":"cs.LG","authors_text":"Ilya Sutskever, {\\L}ukasz Kaiser","submitted_at":"2015-11-25T21:17:43Z","abstract_excerpt":"Learning an algorithm from examples is a fundamental problem that has been widely studied. Recently it has been addressed using neural networks, in particular by Neural Turing Machines (NTMs). These are fully differentiable computers that use backpropagation to learn their own programming. Despite their appeal NTMs have a weakness that is caused by their sequential nature: they are not parallel and are are hard to train due to their large depth when unfolded.\n  We present a neural network architecture to address this problem: the Neural GPU. It is based on a type of convolutional gated recurre"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1511.08228","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2015-11-25T21:17:43Z","cross_cats_sorted":["cs.NE"],"title_canon_sha256":"c64a500b0abfc121837ba76066388ea47fdcf1872d472233f776f547884c8c7f","abstract_canon_sha256":"f20d600fc239b8b1e14473c8a67830ddf0ff89df548329595237be104e7fdbf6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:19:05.483310Z","signature_b64":"9j7XIxBFdqTtB51tgfV5SNMDl/CNXs4SuJQBXAC9q1v2J2ij4/i1RAgDuV8JXWzme84m+OX0T7BIaCkveYLYCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c33f25bc12ef6bdd54963622092a4092e002847a431e16fa6a914edf7de232fa","last_reissued_at":"2026-05-18T01:19:05.482763Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:19:05.482763Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Neural GPUs Learn Algorithms","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.NE"],"primary_cat":"cs.LG","authors_text":"Ilya Sutskever, {\\L}ukasz Kaiser","submitted_at":"2015-11-25T21:17:43Z","abstract_excerpt":"Learning an algorithm from examples is a fundamental problem that has been widely studied. Recently it has been addressed using neural networks, in particular by Neural Turing Machines (NTMs). These are fully differentiable computers that use backpropagation to learn their own programming. Despite their appeal NTMs have a weakness that is caused by their sequential nature: they are not parallel and are are hard to train due to their large depth when unfolded.\n  We present a neural network architecture to address this problem: the Neural GPU. It is based on a type of convolutional gated recurre"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1511.08228","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1511.08228","created_at":"2026-05-18T01:19:05.482822+00:00"},{"alias_kind":"arxiv_version","alias_value":"1511.08228v3","created_at":"2026-05-18T01:19:05.482822+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1511.08228","created_at":"2026-05-18T01:19:05.482822+00:00"},{"alias_kind":"pith_short_12","alias_value":"YM7SLPAS55V5","created_at":"2026-05-18T12:29:50.041715+00:00"},{"alias_kind":"pith_short_16","alias_value":"YM7SLPAS55V52VEW","created_at":"2026-05-18T12:29:50.041715+00:00"},{"alias_kind":"pith_short_8","alias_value":"YM7SLPAS","created_at":"2026-05-18T12:29:50.041715+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":9,"internal_anchor_count":5,"sample":[{"citing_arxiv_id":"2605.23395","citing_title":"Convex Compositional Reasoning Models","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2504.16155","citing_title":"PRIMETIME : Limits of LLMs in Temporal Primitives","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2404.07143","citing_title":"Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"1909.01066","citing_title":"Language Models as Knowledge Bases?","ref_index":236,"is_internal_anchor":true},{"citing_arxiv_id":"2602.01651","citing_title":"On the Spatiotemporal Dynamics of Generalization in Neural Networks","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"1807.03819","citing_title":"Universal Transformers","ref_index":15,"is_internal_anchor":false},{"citing_arxiv_id":"2104.13478","citing_title":"Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges","ref_index":41,"is_internal_anchor":false},{"citing_arxiv_id":"2201.02177","citing_title":"Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets","ref_index":7,"is_internal_anchor":false},{"citing_arxiv_id":"1606.06565","citing_title":"Concrete Problems in AI Safety","ref_index":80,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL","json":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL.json","graph_json":"https://pith.science/api/pith-number/YM7SLPAS55V52VEWGYRASKSASL/graph.json","events_json":"https://pith.science/api/pith-number/YM7SLPAS55V52VEWGYRASKSASL/events.json","paper":"https://pith.science/paper/YM7SLPAS"},"agent_actions":{"view_html":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL","download_json":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL.json","view_paper":"https://pith.science/paper/YM7SLPAS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1511.08228&json=true","fetch_graph":"https://pith.science/api/pith-number/YM7SLPAS55V52VEWGYRASKSASL/graph.json","fetch_events":"https://pith.science/api/pith-number/YM7SLPAS55V52VEWGYRASKSASL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL/action/storage_attestation","attest_author":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL/action/author_attestation","sign_citation":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL/action/citation_signature","submit_replication":"https://pith.science/pith/YM7SLPAS55V52VEWGYRASKSASL/action/replication_record"}},"created_at":"2026-05-18T01:19:05.482822+00:00","updated_at":"2026-05-18T01:19:05.482822+00:00"}