{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:WCHBCUAJDPJA3BI2DVHH5U5GUG","short_pith_number":"pith:WCHBCUAJ","canonical_record":{"source":{"id":"2501.00663","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2024-12-31T22:32:03Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"68ab678edefb0c80939e9ec6ad62f8f70af0a8957580f19f811a11a8a0a22891","abstract_canon_sha256":"2e206822891bb75ad3edfac5f675ae2117dd8dc18a8e770fe3e7037c8bcd6d5b"},"schema_version":"1.0"},"canonical_sha256":"b08e1150091bd20d851a1d4e7ed3a6a1b85728467986a54b1264c50b5ba05ea7","source":{"kind":"arxiv","id":"2501.00663","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2501.00663","created_at":"2026-05-17T23:39:21Z"},{"alias_kind":"arxiv_version","alias_value":"2501.00663v1","created_at":"2026-05-17T23:39:21Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2501.00663","created_at":"2026-05-17T23:39:21Z"},{"alias_kind":"pith_short_12","alias_value":"WCHBCUAJDPJA","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WCHBCUAJDPJA3BI2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WCHBCUAJ","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:WCHBCUAJDPJA3BI2DVHH5U5GUG","target":"record","payload":{"canonical_record":{"source":{"id":"2501.00663","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2024-12-31T22:32:03Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"68ab678edefb0c80939e9ec6ad62f8f70af0a8957580f19f811a11a8a0a22891","abstract_canon_sha256":"2e206822891bb75ad3edfac5f675ae2117dd8dc18a8e770fe3e7037c8bcd6d5b"},"schema_version":"1.0"},"canonical_sha256":"b08e1150091bd20d851a1d4e7ed3a6a1b85728467986a54b1264c50b5ba05ea7","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:21.526157Z","signature_b64":"n47+gfqpZdiDc9+ju32LQ7iWVFeMjvzP7VpSuaWdEJ9c4YA5i3ptPcLuWKF8Ymq2tLQPzqVU+EWmMqV4hgSHAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b08e1150091bd20d851a1d4e7ed3a6a1b85728467986a54b1264c50b5ba05ea7","last_reissued_at":"2026-05-17T23:39:21.525493Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:21.525493Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2501.00663","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:21Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"33SjyfPlqxsJ4pqUNC6W08WF2mRglqoFFKMzQkGnsZRpXkj3FhfSAvyEXYMvLY8+gZH3PJrXlMTtRFGs2AS6Bw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T05:00:00.370711Z"},"content_sha256":"2167aab5142534e42ba88845a213c66144f7f85d38465397cddf5e2f63c5cca9","schema_version":"1.0","event_id":"sha256:2167aab5142534e42ba88845a213c66144f7f85d38465397cddf5e2f63c5cca9"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:WCHBCUAJDPJA3BI2DVHH5U5GUG","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Titans: Learning to Memorize at Test Time","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Titans combine attention with a learnable neural long-term memory to handle contexts over two million tokens more effectively than Transformers or linear recurrent models.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Ali Behrouz, Peilin Zhong, Vahab Mirrokni","submitted_at":"2024-12-31T22:32:03Z","abstract_excerpt":"Over more than a decade there has been an extensive research effort on how to effectively utilize recurrent models and attention. While recurrent models aim to compress the data into a fixed-size memory (called hidden state), attention allows attending to the entire context window, capturing the direct dependencies of all tokens. This more accurate modeling of dependencies, however, comes with a quadratic cost, limiting the model to a fixed-length context. We present a new neural long-term memory module that learns to memorize historical context and helps attention to attend to the current con"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our experimental results on language modeling, common-sense reasoning, genomics, and time series tasks show that Titans are more effective than Transformers and recent modern linear recurrent models. They further can effectively scale to larger than 2M context window size with higher accuracy in needle-in-haystack tasks compared to baselines.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the neural memory module can reliably learn to store and retrieve relevant historical information without catastrophic forgetting or introducing new failure modes that offset the claimed gains, especially when the training objective does not explicitly supervise the memory contents.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Titans combine attention for current context with a learnable neural memory for long-term history, achieving better performance and scaling to over 2M-token contexts on language, reasoning, genomics, and time-series tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Titans combine attention with a learnable neural long-term memory to handle contexts over two million tokens more effectively than Transformers or linear recurrent models.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"67a82dcc8b8ecbb016e27fde910cc989bf23e7b353be2516a1cbf9b331bbe6ba"},"source":{"id":"2501.00663","kind":"arxiv","version":1},"verdict":{"id":"d0ff25ec-78b9-47b8-b1c5-e015af330588","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T22:03:30.442238Z","strongest_claim":"Our experimental results on language modeling, common-sense reasoning, genomics, and time series tasks show that Titans are more effective than Transformers and recent modern linear recurrent models. They further can effectively scale to larger than 2M context window size with higher accuracy in needle-in-haystack tasks compared to baselines.","one_line_summary":"Titans combine attention for current context with a learnable neural memory for long-term history, achieving better performance and scaling to over 2M-token contexts on language, reasoning, genomics, and time-series tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the neural memory module can reliably learn to store and retrieve relevant historical information without catastrophic forgetting or introducing new failure modes that offset the claimed gains, especially when the training objective does not explicitly supervise the memory contents.","pith_extraction_headline":"Titans combine attention with a learnable neural long-term memory to handle contexts over two million tokens more effectively than Transformers or linear recurrent models."},"references":{"count":139,"sample":[{"doi":"","year":2023,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","ref_index":1,"cited_arxiv_id":"2303.08774","is_internal_anchor":true},{"doi":"","year":2024,"title":"Linear Transformers with Learnable Kernel Functions are Better In-Context Models","work_id":"deea05cd-a116-4eb0-8665-d098e04d0402","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2016,"title":"Learning to learn by gradient descent by gradient descent","work_id":"c52938bb-b5b3-447f-a5cd-0f1078f63fa7","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Exploring length generalization in large language models","work_id":"6b3a6ccc-4c8a-4249-b864-98b23b2b57e7","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Simple linear attention language models balance the recall-throughput tradeoff","work_id":"e1002884-0294-462b-8c41-f124aa0e9c3f","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":139,"snapshot_sha256":"d89273418ef7806fc5c088b4751c8b4e6d84380f868f825f41aaed887666def8","internal_anchors":24},"formal_canon":{"evidence_count":2,"snapshot_sha256":"0b17f49521d6fff7313e3a679483c52c4f10eeb96057e83477867d6d036cde68"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"d0ff25ec-78b9-47b8-b1c5-e015af330588"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:21Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9kaMldF+vm3fQHyYoO1tbrvE0LkIJEDk6T3HSnI3Tv8rRVww2XCK/xx++Em2Qlu637cNJalpPtF6ZoLchdgJBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T05:00:00.371362Z"},"content_sha256":"cdc31469d84cc01deac0068faf0a77601d8e71e565963e9a8704961e3aebee48","schema_version":"1.0","event_id":"sha256:cdc31469d84cc01deac0068faf0a77601d8e71e565963e9a8704961e3aebee48"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WCHBCUAJDPJA3BI2DVHH5U5GUG/bundle.json","state_url":"https://pith.science/pith/WCHBCUAJDPJA3BI2DVHH5U5GUG/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WCHBCUAJDPJA3BI2DVHH5U5GUG/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T05:00:00Z","links":{"resolver":"https://pith.science/pith/WCHBCUAJDPJA3BI2DVHH5U5GUG","bundle":"https://pith.science/pith/WCHBCUAJDPJA3BI2DVHH5U5GUG/bundle.json","state":"https://pith.science/pith/WCHBCUAJDPJA3BI2DVHH5U5GUG/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WCHBCUAJDPJA3BI2DVHH5U5GUG/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:WCHBCUAJDPJA3BI2DVHH5U5GUG","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"2e206822891bb75ad3edfac5f675ae2117dd8dc18a8e770fe3e7037c8bcd6d5b","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2024-12-31T22:32:03Z","title_canon_sha256":"68ab678edefb0c80939e9ec6ad62f8f70af0a8957580f19f811a11a8a0a22891"},"schema_version":"1.0","source":{"id":"2501.00663","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2501.00663","created_at":"2026-05-17T23:39:21Z"},{"alias_kind":"arxiv_version","alias_value":"2501.00663v1","created_at":"2026-05-17T23:39:21Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2501.00663","created_at":"2026-05-17T23:39:21Z"},{"alias_kind":"pith_short_12","alias_value":"WCHBCUAJDPJA","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WCHBCUAJDPJA3BI2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WCHBCUAJ","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:cdc31469d84cc01deac0068faf0a77601d8e71e565963e9a8704961e3aebee48","target":"graph","created_at":"2026-05-17T23:39:21Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our experimental results on language modeling, common-sense reasoning, genomics, and time series tasks show that Titans are more effective than Transformers and recent modern linear recurrent models. They further can effectively scale to larger than 2M context window size with higher accuracy in needle-in-haystack tasks compared to baselines."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the neural memory module can reliably learn to store and retrieve relevant historical information without catastrophic forgetting or introducing new failure modes that offset the claimed gains, especially when the training objective does not explicitly supervise the memory contents."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Titans combine attention for current context with a learnable neural memory for long-term history, achieving better performance and scaling to over 2M-token contexts on language, reasoning, genomics, and time-series tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Titans combine attention with a learnable neural long-term memory to handle contexts over two million tokens more effectively than Transformers or linear recurrent models."}],"snapshot_sha256":"67a82dcc8b8ecbb016e27fde910cc989bf23e7b353be2516a1cbf9b331bbe6ba"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"0b17f49521d6fff7313e3a679483c52c4f10eeb96057e83477867d6d036cde68"},"paper":{"abstract_excerpt":"Over more than a decade there has been an extensive research effort on how to effectively utilize recurrent models and attention. While recurrent models aim to compress the data into a fixed-size memory (called hidden state), attention allows attending to the entire context window, capturing the direct dependencies of all tokens. This more accurate modeling of dependencies, however, comes with a quadratic cost, limiting the model to a fixed-length context. We present a new neural long-term memory module that learns to memorize historical context and helps attention to attend to the current con","authors_text":"Ali Behrouz, Peilin Zhong, Vahab Mirrokni","cross_cats":["cs.AI","cs.CL"],"headline":"Titans combine attention with a learnable neural long-term memory to handle contexts over two million tokens more effectively than Transformers or linear recurrent models.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2024-12-31T22:32:03Z","title":"Titans: Learning to Memorize at Test Time"},"references":{"count":139,"internal_anchors":24,"resolved_work":139,"sample":[{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Linear Transformers with Learnable Kernel Functions are Better In-Context Models","work_id":"deea05cd-a116-4eb0-8665-d098e04d0402","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Learning to learn by gradient descent by gradient descent","work_id":"c52938bb-b5b3-447f-a5cd-0f1078f63fa7","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Exploring length generalization in large language models","work_id":"6b3a6ccc-4c8a-4249-b864-98b23b2b57e7","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Simple linear attention language models balance the recall-throughput tradeoff","work_id":"e1002884-0294-462b-8c41-f124aa0e9c3f","year":2024}],"snapshot_sha256":"d89273418ef7806fc5c088b4751c8b4e6d84380f868f825f41aaed887666def8"},"source":{"id":"2501.00663","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T22:03:30.442238Z","id":"d0ff25ec-78b9-47b8-b1c5-e015af330588","model_set":{"reader":"grok-4.3"},"one_line_summary":"Titans combine attention for current context with a learnable neural memory for long-term history, achieving better performance and scaling to over 2M-token contexts on language, reasoning, genomics, and time-series tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Titans combine attention with a learnable neural long-term memory to handle contexts over two million tokens more effectively than Transformers or linear recurrent models.","strongest_claim":"Our experimental results on language modeling, common-sense reasoning, genomics, and time series tasks show that Titans are more effective than Transformers and recent modern linear recurrent models. They further can effectively scale to larger than 2M context window size with higher accuracy in needle-in-haystack tasks compared to baselines.","weakest_assumption":"That the neural memory module can reliably learn to store and retrieve relevant historical information without catastrophic forgetting or introducing new failure modes that offset the claimed gains, especially when the training objective does not explicitly supervise the memory contents."}},"verdict_id":"d0ff25ec-78b9-47b8-b1c5-e015af330588"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2167aab5142534e42ba88845a213c66144f7f85d38465397cddf5e2f63c5cca9","target":"record","created_at":"2026-05-17T23:39:21Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"2e206822891bb75ad3edfac5f675ae2117dd8dc18a8e770fe3e7037c8bcd6d5b","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2024-12-31T22:32:03Z","title_canon_sha256":"68ab678edefb0c80939e9ec6ad62f8f70af0a8957580f19f811a11a8a0a22891"},"schema_version":"1.0","source":{"id":"2501.00663","kind":"arxiv","version":1}},"canonical_sha256":"b08e1150091bd20d851a1d4e7ed3a6a1b85728467986a54b1264c50b5ba05ea7","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b08e1150091bd20d851a1d4e7ed3a6a1b85728467986a54b1264c50b5ba05ea7","first_computed_at":"2026-05-17T23:39:21.525493Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:21.525493Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"n47+gfqpZdiDc9+ju32LQ7iWVFeMjvzP7VpSuaWdEJ9c4YA5i3ptPcLuWKF8Ymq2tLQPzqVU+EWmMqV4hgSHAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:21.526157Z","signed_message":"canonical_sha256_bytes"},"source_id":"2501.00663","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2167aab5142534e42ba88845a213c66144f7f85d38465397cddf5e2f63c5cca9","sha256:cdc31469d84cc01deac0068faf0a77601d8e71e565963e9a8704961e3aebee48"],"state_sha256":"ef64691302f5232b666cb9421a54850d837a22b7f53053cd89727909625fdfae"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OZOEBymWn73bhIybMT2tgpzqMgK8uhuxPvtwXEG3US0sqldmVBp4BVT43B+CGf8hrXd8lsRZpbmUEbp0fGAfAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T05:00:00.375153Z","bundle_sha256":"e597d960d09385b154256fe071fa6966424f1728e5d1198b4a456ac73124f301"}}