{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:TN6TYKXHBTFO5YS4ZQKFT4CEP5","short_pith_number":"pith:TN6TYKXH","schema_version":"1.0","canonical_sha256":"9b7d3c2ae70ccaeee25ccc1459f0447f7e2a46c7cf1ca4d52ab35e26f0bf7927","source":{"kind":"arxiv","id":"2505.23884","version":1},"attestation_state":"computed","paper":{"title":"Test-Time Training Done Right","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Large-chunk updates during inference make test-time training efficient enough to scale nonlinear states to 40 percent of model parameters.","cross_cats":["cs.CL","cs.CV"],"primary_cat":"cs.LG","authors_text":"Fujun Luan, Hao Tan, Kai Zhang, Kalyan Sunkavalli, Sai Bi, Songlin Yang, Tianyuan Zhang, William T. Freeman, Yicong Hong","submitted_at":"2025-05-29T17:50:34Z","abstract_excerpt":"Test-Time Training (TTT) models context dependencies by adapting part of the model's weights (referred to as fast weights) during inference. This fast weight, akin to recurrent states in RNNs, stores temporary memories of past tokens in the current sequence. Existing TTT methods struggled to show effectiveness in handling long-context data, due to their inefficiency on modern GPUs. The TTT layers in many of these approaches operate with extremely low FLOPs utilization (often <5%) because they deliberately apply small online minibatch sizes (e.g., updating fast weights every 16 or 64 tokens). M"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2505.23884","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-05-29T17:50:34Z","cross_cats_sorted":["cs.CL","cs.CV"],"title_canon_sha256":"8d890dd60c819654346bdec1702e4247b845985dd9767bc783261caa25ebde1e","abstract_canon_sha256":"3d69dd83bd8506725dbd63ec554147402277ee729a95527a096d51c9e74cc2b2"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.037625Z","signature_b64":"/Zz0yogIeqW6/Q+7fscOYM5QfHNzOvmqXGKfq2687NGONClDKi5q9IrNhVxKD4XXK7PAD+pfJxRLVwB/uNmGAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9b7d3c2ae70ccaeee25ccc1459f0447f7e2a46c7cf1ca4d52ab35e26f0bf7927","last_reissued_at":"2026-05-17T23:38:48.037169Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.037169Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Test-Time Training Done Right","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Large-chunk updates during inference make test-time training efficient enough to scale nonlinear states to 40 percent of model parameters.","cross_cats":["cs.CL","cs.CV"],"primary_cat":"cs.LG","authors_text":"Fujun Luan, Hao Tan, Kai Zhang, Kalyan Sunkavalli, Sai Bi, Songlin Yang, Tianyuan Zhang, William T. Freeman, Yicong Hong","submitted_at":"2025-05-29T17:50:34Z","abstract_excerpt":"Test-Time Training (TTT) models context dependencies by adapting part of the model's weights (referred to as fast weights) during inference. This fast weight, akin to recurrent states in RNNs, stores temporary memories of past tokens in the current sequence. Existing TTT methods struggled to show effectiveness in handling long-context data, due to their inefficiency on modern GPUs. The TTT layers in many of these approaches operate with extremely low FLOPs utilization (often <5%) because they deliberately apply small online minibatch sizes (e.g., updating fast weights every 16 or 64 tokens). M"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"LaCT improves hardware utilization by orders of magnitude, facilitates scaling of nonlinear state size (up to 40% of model parameters), and enables 14B-parameter AR video diffusion on 56K tokens and 1M-token novel view synthesis without custom kernels.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That performing weight updates on extremely large chunks (2K–1M tokens) preserves or improves modeling quality compared with the fine-grained causal updates used in prior TTT work.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Large-chunk online updates during inference let test-time training scale state capacity to 40% of model size and handle contexts up to 1M tokens without custom kernels.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Large-chunk updates during inference make test-time training efficient enough to scale nonlinear states to 40 percent of model parameters.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"45fd7ad930b4f0f9de25eda15400f3da3be0caaecde495345ee79f5310d14927"},"source":{"id":"2505.23884","kind":"arxiv","version":1},"verdict":{"id":"38c3eaa0-b718-4de0-af64-d6d1fe20b0d4","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T11:21:09.142492Z","strongest_claim":"LaCT improves hardware utilization by orders of magnitude, facilitates scaling of nonlinear state size (up to 40% of model parameters), and enables 14B-parameter AR video diffusion on 56K tokens and 1M-token novel view synthesis without custom kernels.","one_line_summary":"Large-chunk online updates during inference let test-time training scale state capacity to 40% of model size and handle contexts up to 1M tokens without custom kernels.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That performing weight updates on extremely large chunks (2K–1M tokens) preserves or improves modeling quality compared with the fine-grained causal updates used in prior TTT work.","pith_extraction_headline":"Large-chunk updates during inference make test-time training efficient enough to scale nonlinear states to 40 percent of model parameters."},"references":{"count":75,"sample":[{"doi":"","year":2017,"title":"Attention is all you need","work_id":"7a952c5f-f5c4-4a0a-bc7f-ed068389d046","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Learning to (Learn at Test Time): RNNs with Expressive Hidden States","work_id":"c682430c-e7a2-4699-b82d-55287448dbba","ref_index":2,"cited_arxiv_id":"2407.04620","is_internal_anchor":true},{"doi":"","year":2021,"title":"Linear transformers are secretly fast weight programmers","work_id":"ae448930-6886-42c9-805c-97ecbc17cbbc","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Ke Alexander Wang, Jiaxin Shi, and Emily B. Fox. Test-time regression: a unifying framework for designing sequence models with associative memory, 2025","work_id":"44a6409a-3393-4270-aa11-8d7bc801effe","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Titans: Learning to Memorize at Test Time","work_id":"fb2b7625-b733-43cb-af52-00b0a31a8d7f","ref_index":5,"cited_arxiv_id":"2501.00663","is_internal_anchor":true}],"resolved_work":75,"snapshot_sha256":"4ba522b94a6af93d8af7b6df2144a50c6d67ae7d1aa6c0b12c145ece4f5d9063","internal_anchors":19},"formal_canon":{"evidence_count":3,"snapshot_sha256":"0a1c32f2d7dca4ba20ad95b79fae0d9fbcbbdf98a56bfd7bfcbd04eca39c731f"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2505.23884","created_at":"2026-05-17T23:38:48.037245+00:00"},{"alias_kind":"arxiv_version","alias_value":"2505.23884v1","created_at":"2026-05-17T23:38:48.037245+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.23884","created_at":"2026-05-17T23:38:48.037245+00:00"},{"alias_kind":"pith_short_12","alias_value":"TN6TYKXHBTFO","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"TN6TYKXHBTFO5YS4","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"TN6TYKXH","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":30,"internal_anchor_count":30,"sample":[{"citing_arxiv_id":"2603.04639","citing_title":"RoboMME: Benchmarking and Understanding Memory for Robotic Generalist Policies","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20708","citing_title":"Rethinking Cross-Layer Information Routing in Diffusion Transformers","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06270","citing_title":"Spark3R: Asymmetric Token Reduction Makes Fast Feed-Forward 3D Reconstruction","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15824","citing_title":"FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17478","citing_title":"Mamba-VGGT: Persistent Long-Sequence Video Geometry Grounded Transformer via External Sliding Window Mamba Memory","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18739","citing_title":"LongLive-2.0: An NVFP4 Parallel Infrastructure for Long Video Generation","ref_index":76,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19444","citing_title":"When the Majority Votes Wrong, the Intervention Timing for Test-Time Reinforcement Learning Hides in the Extinction Window","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2509.22630","citing_title":"StateX: Enhancing RNN Recall via Post-training State Expansion","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2510.24718","citing_title":"Generative View Stitching","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2509.26645","citing_title":"TTT3R: 3D Reconstruction as Test-Time Training","ref_index":103,"is_internal_anchor":true},{"citing_arxiv_id":"2512.01643","citing_title":"ViT$^3$: Unlocking Test-Time Training in Vision","ref_index":77,"is_internal_anchor":true},{"citing_arxiv_id":"2512.10267","citing_title":"Long-LRM++: Preserving Fine Details in Feed-Forward Wide-Coverage Reconstruction","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2509.25161","citing_title":"Rolling Forcing: Autoregressive Long Video Diffusion in Real Time","ref_index":110,"is_internal_anchor":true},{"citing_arxiv_id":"2602.07775","citing_title":"Rolling Sink: Bridging Limited-Horizon Training and Open-Ended Testing in Autoregressive Video Diffusion","ref_index":111,"is_internal_anchor":true},{"citing_arxiv_id":"2602.21204","citing_title":"Test-Time Training with KV Binding Is Secretly Linear Attention","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2603.04385","citing_title":"ZipMap: Linear-Time Stateful 3D Reconstruction via Test-Time Training","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22241","citing_title":"MemDLM: Memory-Enhanced DLM Training","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2603.29002","citing_title":"Understand and Accelerate Memory Processing Pipeline for Disaggregated LLM Inference","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08587","citing_title":"Kaczmarz Linear Attention","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06270","citing_title":"Spark3R: Asymmetric Token Reduction Makes Fast Feed-Forward 3D Reconstruction","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05838","citing_title":"MDN: Parallelizing Stepwise Momentum for Delta Linear Attention","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13036","citing_title":"Lyra 2.0: Explorable Generative 3D Worlds","ref_index":137,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08542","citing_title":"Scal3R: Scalable Test-Time Training for Large-Scale 3D Reconstruction","ref_index":100,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08846","citing_title":"Dictionary-Aligned Concept Control for Safeguarding Multimodal LLMs","ref_index":128,"is_internal_anchor":true},{"citing_arxiv_id":"2506.08009","citing_title":"Self Forcing: Bridging the Train-Test Gap in Autoregressive Video Diffusion","ref_index":106,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":3,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5","json":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5.json","graph_json":"https://pith.science/api/pith-number/TN6TYKXHBTFO5YS4ZQKFT4CEP5/graph.json","events_json":"https://pith.science/api/pith-number/TN6TYKXHBTFO5YS4ZQKFT4CEP5/events.json","paper":"https://pith.science/paper/TN6TYKXH"},"agent_actions":{"view_html":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5","download_json":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5.json","view_paper":"https://pith.science/paper/TN6TYKXH","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2505.23884&json=true","fetch_graph":"https://pith.science/api/pith-number/TN6TYKXHBTFO5YS4ZQKFT4CEP5/graph.json","fetch_events":"https://pith.science/api/pith-number/TN6TYKXHBTFO5YS4ZQKFT4CEP5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5/action/storage_attestation","attest_author":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5/action/author_attestation","sign_citation":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5/action/citation_signature","submit_replication":"https://pith.science/pith/TN6TYKXHBTFO5YS4ZQKFT4CEP5/action/replication_record"}},"created_at":"2026-05-17T23:38:48.037245+00:00","updated_at":"2026-05-17T23:38:48.037245+00:00"}