{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:OKUXID3IZPOT5HL2TRVXZMW3ZK","short_pith_number":"pith:OKUXID3I","schema_version":"1.0","canonical_sha256":"72a9740f68cbdd3e9d7a9c6b7cb2dbca80f883d205f4eca7a7ca65488e2bee22","source":{"kind":"arxiv","id":"2506.05233","version":2},"attestation_state":"computed","paper":{"title":"MesaNet: Sequence Modeling by Locally Optimal Test-Time Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Alexander Meulemans, Blaise Ag\\\"uera y Arcas, Charlotte Frenkel, Guillaume Lajoie, Jo\\~ao Sacramento, Johannes von Oswald, Kaitlin Maile, Luca Versari, Maximilian Schlegel, Nino Scherrer, Oliver Sieberling, Razvan Pascanu, Rif A. Saurous, Sarthak Mittal, Seijin Kobayashi, Songlin Yang, Yanick Schimpf","submitted_at":"2025-06-05T16:50:23Z","abstract_excerpt":"Sequence modeling is currently dominated by causal transformer architectures that use softmax self-attention. Although widely adopted, transformers require scaling memory and compute linearly during inference. A recent stream of work linearized the softmax operation, resulting in powerful recurrent neural network (RNN) models with constant memory and compute costs such as DeltaNet, Mamba or xLSTM. These models can be unified by noting that their recurrent layer dynamics can all be derived from an in-context regression objective, approximately optimized through an online learning rule. Here, we"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2506.05233","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-06-05T16:50:23Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"a1f453fd9288457390b873f8bc0928b651c9375fa729920a102e14421c7879f2","abstract_canon_sha256":"af208f80fe53afbf084fe8cb7378bf097e76e23a6d535a2ea47f0a314eae07c4"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-04T01:09:37.640749Z","signature_b64":"OzBxk9ppRy1iNdS2w6kL4PAIJ8fMbEbQUSdLTNb8v+rxcKukGZAl5MA5B6KDhG0gcq78sgU9Kdo36eqHIPC5DA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"72a9740f68cbdd3e9d7a9c6b7cb2dbca80f883d205f4eca7a7ca65488e2bee22","last_reissued_at":"2026-06-04T01:09:37.639906Z","signature_status":"signed_v1","first_computed_at":"2026-06-04T01:09:37.639906Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MesaNet: Sequence Modeling by Locally Optimal Test-Time Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Alexander Meulemans, Blaise Ag\\\"uera y Arcas, Charlotte Frenkel, Guillaume Lajoie, Jo\\~ao Sacramento, Johannes von Oswald, Kaitlin Maile, Luca Versari, Maximilian Schlegel, Nino Scherrer, Oliver Sieberling, Razvan Pascanu, Rif A. Saurous, Sarthak Mittal, Seijin Kobayashi, Songlin Yang, Yanick Schimpf","submitted_at":"2025-06-05T16:50:23Z","abstract_excerpt":"Sequence modeling is currently dominated by causal transformer architectures that use softmax self-attention. Although widely adopted, transformers require scaling memory and compute linearly during inference. A recent stream of work linearized the softmax operation, resulting in powerful recurrent neural network (RNN) models with constant memory and compute costs such as DeltaNet, Mamba or xLSTM. These models can be unified by noting that their recurrent layer dynamics can all be derived from an in-context regression objective, approximately optimized through an online learning rule. Here, we"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2506.05233","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2506.05233/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2506.05233","created_at":"2026-06-04T01:09:37.639996+00:00"},{"alias_kind":"arxiv_version","alias_value":"2506.05233v2","created_at":"2026-06-04T01:09:37.639996+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.05233","created_at":"2026-06-04T01:09:37.639996+00:00"},{"alias_kind":"pith_short_12","alias_value":"OKUXID3IZPOT","created_at":"2026-06-04T01:09:37.639996+00:00"},{"alias_kind":"pith_short_16","alias_value":"OKUXID3IZPOT5HL2","created_at":"2026-06-04T01:09:37.639996+00:00"},{"alias_kind":"pith_short_8","alias_value":"OKUXID3I","created_at":"2026-06-04T01:09:37.639996+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":9,"internal_anchor_count":9,"sample":[{"citing_arxiv_id":"2511.21016","citing_title":"Gated KalmaNet: A Fading Memory Layer Through Test-Time Ridge Regression","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2510.27258","citing_title":"Higher-order Linear Attention","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2509.19349","citing_title":"ShinkaEvolve: Towards Open-Ended And Sample-Efficient Program Evolution","ref_index":192,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13473","citing_title":"OSDN: Improving Delta Rule with Provable Online Preconditioning in Linear Attention","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2506.13585","citing_title":"MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08301","citing_title":"Priming: Hybrid State Space Models From Pre-trained Transformers","ref_index":87,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05838","citing_title":"MDN: Parallelizing Stepwise Momentum for Delta Linear Attention","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06997","citing_title":"Echo: KV-Cache-Free Associative Recall with Spectral Koopman Operators","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21100","citing_title":"Preconditioned DeltaNet: Curvature-aware Sequence Modeling for Linear Recurrences","ref_index":55,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK","json":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK.json","graph_json":"https://pith.science/api/pith-number/OKUXID3IZPOT5HL2TRVXZMW3ZK/graph.json","events_json":"https://pith.science/api/pith-number/OKUXID3IZPOT5HL2TRVXZMW3ZK/events.json","paper":"https://pith.science/paper/OKUXID3I"},"agent_actions":{"view_html":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK","download_json":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK.json","view_paper":"https://pith.science/paper/OKUXID3I","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2506.05233&json=true","fetch_graph":"https://pith.science/api/pith-number/OKUXID3IZPOT5HL2TRVXZMW3ZK/graph.json","fetch_events":"https://pith.science/api/pith-number/OKUXID3IZPOT5HL2TRVXZMW3ZK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK/action/storage_attestation","attest_author":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK/action/author_attestation","sign_citation":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK/action/citation_signature","submit_replication":"https://pith.science/pith/OKUXID3IZPOT5HL2TRVXZMW3ZK/action/replication_record"}},"created_at":"2026-06-04T01:09:37.639996+00:00","updated_at":"2026-06-04T01:09:37.639996+00:00"}