{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:4TBBZIWAJL7HUCXXWPKVFDUYVK","short_pith_number":"pith:4TBBZIWA","canonical_record":{"source":{"id":"2605.06501","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:18:55Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"8913458feb47a337b7afc7d86defb8185da37246dcd073a060747bc418b14666","abstract_canon_sha256":"c2768fe58619b368d91bdd52db62e66d6501c339f4f000ebd5c531ba43242e63"},"schema_version":"1.0"},"canonical_sha256":"e4c21ca2c04afe7a0af7b3d5528e98aa82282c8eb4224e702497da22c84f8755","source":{"kind":"arxiv","id":"2605.06501","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.06501","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"arxiv_version","alias_value":"2605.06501v2","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.06501","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"pith_short_12","alias_value":"4TBBZIWAJL7H","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"pith_short_16","alias_value":"4TBBZIWAJL7HUCXX","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"pith_short_8","alias_value":"4TBBZIWA","created_at":"2026-05-20T01:05:15Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:4TBBZIWAJL7HUCXXWPKVFDUYVK","target":"record","payload":{"canonical_record":{"source":{"id":"2605.06501","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:18:55Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"8913458feb47a337b7afc7d86defb8185da37246dcd073a060747bc418b14666","abstract_canon_sha256":"c2768fe58619b368d91bdd52db62e66d6501c339f4f000ebd5c531ba43242e63"},"schema_version":"1.0"},"canonical_sha256":"e4c21ca2c04afe7a0af7b3d5528e98aa82282c8eb4224e702497da22c84f8755","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:05:15.652311Z","signature_b64":"dt2cGgz5ea621GY4PYEEzkEHAGoWYnp1IvEkCrXRjs80mRXHykO3G3r0kY6IW3ngsi2KnmzQjSUDnkiGKb4IAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e4c21ca2c04afe7a0af7b3d5528e98aa82282c8eb4224e702497da22c84f8755","last_reissued_at":"2026-05-20T01:05:15.651667Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:05:15.651667Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.06501","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"+UGZ6tHXiB4enXxG0fvwPytHuJWEIrCwH6Bbs0gOk0Q6DeqM/phaFphJPcMOQghGu4ijwK4VxzT0DMxcHl0OAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T13:58:05.877198Z"},"content_sha256":"348c5b1355eeddfa541317c2fe788e5279b9f24373e2751bc267948f06da5908","schema_version":"1.0","event_id":"sha256:348c5b1355eeddfa541317c2fe788e5279b9f24373e2751bc267948f06da5908"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:4TBBZIWAJL7HUCXXWPKVFDUYVK","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Cubit: Token Mixer with Kernel Ridge Regression","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Cubit replaces the Transformer's attention with kernel ridge regression to improve long-sequence modeling.","cross_cats":["cs.CL"],"primary_cat":"cs.LG","authors_text":"Anderson Schneider, Chuanyang Zheng, Jiankai Sun, Liangchen Tan, Mac Schwager, XiaoDong Liu, Yihang Gao, Yuehao Wang, Yuriy Nevmyvaka","submitted_at":"2026-05-07T16:18:55Z","abstract_excerpt":"Since its introduction in 2017, the Transformer has become one of the most widely adopted architectures in modern deep learning. Despite extensive efforts to improve positional encoding, attention mechanisms, and feed-forward networks, the core token-mixing mechanism in Transformers remains attention. In this work, we show that the attention module in Transformers can be interpreted as performing Nadaraya-Watson regression, where it computes similarities between tokens and aggregates the corresponding values accordingly. Motivated by this perspective, we propose Cubit, a potential next-generat"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Cubit, as a KRR-based architecture, provides a stronger mathematical foundation than the vanilla Transformer, whose attention mechanism corresponds to Nadaraya-Watson regression. The experimental results suggest that Cubit may exhibit stronger long-sequence modeling capability. In particular, its performance gain over the Transformer appears to increase as the training sequence length grows.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the closed-form KRR solution can be stably integrated into a deep network via Limited-Range Rescale and that observed performance differences are caused by the regression formulation rather than hyperparameter choices or implementation details.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Cubit replaces Transformer attention with Kernel Ridge Regression token mixing and shows potential gains on longer sequences.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Cubit replaces the Transformer's attention with kernel ridge regression to improve long-sequence modeling.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"4f81a8dab054348b403ecf27f71f53a2f6cf675931b99e5f67b0cafeac7b178f"},"source":{"id":"2605.06501","kind":"arxiv","version":2},"verdict":{"id":"0bb87409-274a-45bd-80b5-2539b2db4bff","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-08T12:34:11.565589Z","strongest_claim":"Cubit, as a KRR-based architecture, provides a stronger mathematical foundation than the vanilla Transformer, whose attention mechanism corresponds to Nadaraya-Watson regression. The experimental results suggest that Cubit may exhibit stronger long-sequence modeling capability. In particular, its performance gain over the Transformer appears to increase as the training sequence length grows.","one_line_summary":"Cubit replaces Transformer attention with Kernel Ridge Regression token mixing and shows potential gains on longer sequences.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the closed-form KRR solution can be stably integrated into a deep network via Limited-Range Rescale and that observed performance differences are caused by the regression formulation rather than hyperparameter choices or implementation details.","pith_extraction_headline":"Cubit replaces the Transformer's attention with kernel ridge regression to improve long-sequence modeling."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.06501/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T18:01:19.742068Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T12:37:05.299459Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"194291d0f6faa3cfb073ecf861eff375d8304f66490d26d9b60684b986506a13"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"0bb87409-274a-45bd-80b5-2539b2db4bff"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SF10tliGXxoKCEmTfldhgmaeEhT5TnWXIao8j+Vw2CeeGKNEkOojrn+6U7myYFrBwdmqtwi+Yawv5YkoA/IfAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T13:58:05.877755Z"},"content_sha256":"a6b2b0fd2920f168e3cdbbad416b964f0520ea2ec11a3a180ebaf5af3a0b8ecc","schema_version":"1.0","event_id":"sha256:a6b2b0fd2920f168e3cdbbad416b964f0520ea2ec11a3a180ebaf5af3a0b8ecc"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/4TBBZIWAJL7HUCXXWPKVFDUYVK/bundle.json","state_url":"https://pith.science/pith/4TBBZIWAJL7HUCXXWPKVFDUYVK/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/4TBBZIWAJL7HUCXXWPKVFDUYVK/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T13:58:05Z","links":{"resolver":"https://pith.science/pith/4TBBZIWAJL7HUCXXWPKVFDUYVK","bundle":"https://pith.science/pith/4TBBZIWAJL7HUCXXWPKVFDUYVK/bundle.json","state":"https://pith.science/pith/4TBBZIWAJL7HUCXXWPKVFDUYVK/state.json","well_known_bundle":"https://pith.science/.well-known/pith/4TBBZIWAJL7HUCXXWPKVFDUYVK/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:4TBBZIWAJL7HUCXXWPKVFDUYVK","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"c2768fe58619b368d91bdd52db62e66d6501c339f4f000ebd5c531ba43242e63","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:18:55Z","title_canon_sha256":"8913458feb47a337b7afc7d86defb8185da37246dcd073a060747bc418b14666"},"schema_version":"1.0","source":{"id":"2605.06501","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.06501","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"arxiv_version","alias_value":"2605.06501v2","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.06501","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"pith_short_12","alias_value":"4TBBZIWAJL7H","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"pith_short_16","alias_value":"4TBBZIWAJL7HUCXX","created_at":"2026-05-20T01:05:15Z"},{"alias_kind":"pith_short_8","alias_value":"4TBBZIWA","created_at":"2026-05-20T01:05:15Z"}],"graph_snapshots":[{"event_id":"sha256:a6b2b0fd2920f168e3cdbbad416b964f0520ea2ec11a3a180ebaf5af3a0b8ecc","target":"graph","created_at":"2026-05-20T01:05:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Cubit, as a KRR-based architecture, provides a stronger mathematical foundation than the vanilla Transformer, whose attention mechanism corresponds to Nadaraya-Watson regression. The experimental results suggest that Cubit may exhibit stronger long-sequence modeling capability. In particular, its performance gain over the Transformer appears to increase as the training sequence length grows."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the closed-form KRR solution can be stably integrated into a deep network via Limited-Range Rescale and that observed performance differences are caused by the regression formulation rather than hyperparameter choices or implementation details."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Cubit replaces Transformer attention with Kernel Ridge Regression token mixing and shows potential gains on longer sequences."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Cubit replaces the Transformer's attention with kernel ridge regression to improve long-sequence modeling."}],"snapshot_sha256":"4f81a8dab054348b403ecf27f71f53a2f6cf675931b99e5f67b0cafeac7b178f"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T18:01:19.742068Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T12:37:05.299459Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.06501/integrity.json","findings":[],"snapshot_sha256":"194291d0f6faa3cfb073ecf861eff375d8304f66490d26d9b60684b986506a13","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Since its introduction in 2017, the Transformer has become one of the most widely adopted architectures in modern deep learning. Despite extensive efforts to improve positional encoding, attention mechanisms, and feed-forward networks, the core token-mixing mechanism in Transformers remains attention. In this work, we show that the attention module in Transformers can be interpreted as performing Nadaraya-Watson regression, where it computes similarities between tokens and aggregates the corresponding values accordingly. Motivated by this perspective, we propose Cubit, a potential next-generat","authors_text":"Anderson Schneider, Chuanyang Zheng, Jiankai Sun, Liangchen Tan, Mac Schwager, XiaoDong Liu, Yihang Gao, Yuehao Wang, Yuriy Nevmyvaka","cross_cats":["cs.CL"],"headline":"Cubit replaces the Transformer's attention with kernel ridge regression to improve long-sequence modeling.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:18:55Z","title":"Cubit: Token Mixer with Kernel Ridge Regression"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.06501","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-08T12:34:11.565589Z","id":"0bb87409-274a-45bd-80b5-2539b2db4bff","model_set":{"reader":"grok-4.3"},"one_line_summary":"Cubit replaces Transformer attention with Kernel Ridge Regression token mixing and shows potential gains on longer sequences.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Cubit replaces the Transformer's attention with kernel ridge regression to improve long-sequence modeling.","strongest_claim":"Cubit, as a KRR-based architecture, provides a stronger mathematical foundation than the vanilla Transformer, whose attention mechanism corresponds to Nadaraya-Watson regression. The experimental results suggest that Cubit may exhibit stronger long-sequence modeling capability. In particular, its performance gain over the Transformer appears to increase as the training sequence length grows.","weakest_assumption":"That the closed-form KRR solution can be stably integrated into a deep network via Limited-Range Rescale and that observed performance differences are caused by the regression formulation rather than hyperparameter choices or implementation details."}},"verdict_id":"0bb87409-274a-45bd-80b5-2539b2db4bff"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:348c5b1355eeddfa541317c2fe788e5279b9f24373e2751bc267948f06da5908","target":"record","created_at":"2026-05-20T01:05:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"c2768fe58619b368d91bdd52db62e66d6501c339f4f000ebd5c531ba43242e63","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:18:55Z","title_canon_sha256":"8913458feb47a337b7afc7d86defb8185da37246dcd073a060747bc418b14666"},"schema_version":"1.0","source":{"id":"2605.06501","kind":"arxiv","version":2}},"canonical_sha256":"e4c21ca2c04afe7a0af7b3d5528e98aa82282c8eb4224e702497da22c84f8755","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"e4c21ca2c04afe7a0af7b3d5528e98aa82282c8eb4224e702497da22c84f8755","first_computed_at":"2026-05-20T01:05:15.651667Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:05:15.651667Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"dt2cGgz5ea621GY4PYEEzkEHAGoWYnp1IvEkCrXRjs80mRXHykO3G3r0kY6IW3ngsi2KnmzQjSUDnkiGKb4IAQ==","signature_status":"signed_v1","signed_at":"2026-05-20T01:05:15.652311Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.06501","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:348c5b1355eeddfa541317c2fe788e5279b9f24373e2751bc267948f06da5908","sha256:a6b2b0fd2920f168e3cdbbad416b964f0520ea2ec11a3a180ebaf5af3a0b8ecc"],"state_sha256":"67521e80bc2315616d255a1d6b00b20d6e630a6dc91f4233a0951c122c9b5182"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"uF0ajFO5XiN2E4nJ8fTRMCtjkg5GM9lJoSHizbEitNy+J1MHxavDeIyOe2P+plyYn5Vy/6OzSK9RNo60j4ObCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T13:58:05.880345Z","bundle_sha256":"dce7c12b5ec0ffaefb10f8ef0a5d161e5a8b0ea2ca548deb2d2df528c2b5462a"}}