{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:BJQSU5IAHQ3UUDLLXFHSU5PRVH","short_pith_number":"pith:BJQSU5IA","schema_version":"1.0","canonical_sha256":"0a612a75003c374a0d6bb94f2a75f1a9e4a56978cee08ee17fa06c7ee83a2611","source":{"kind":"arxiv","id":"2502.11089","version":2},"attestation_state":"computed","paper":{"title":"Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"NSA introduces a natively trainable sparse attention that matches full attention performance on long contexts while delivering major speedups.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Chong Ruan, Damai Dai, Huazuo Gao, Jingyang Yuan, Junyu Luo, Lean Wang, Liang Zhao, Ming Zhang, Wangding Zeng, Wenfeng Liang, Yuqing Wang, Y. X. Wei, Zhenda Xie, Zhengyan Zhang, Zhiping Xiao","submitted_at":"2025-02-16T11:53:44Z","abstract_excerpt":"Long-context modeling is crucial for next-generation language models, yet the high computational cost of standard attention mechanisms poses significant computational challenges. Sparse attention offers a promising direction for improving efficiency while maintaining model capabilities. We present NSA, a Natively trainable Sparse Attention mechanism that integrates algorithmic innovations with hardware-aligned optimizations to achieve efficient long-context modeling. NSA employs a dynamic hierarchical sparse strategy, combining coarse-grained token compression with fine-grained token selection"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2502.11089","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-02-16T11:53:44Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"1ee096cb16e96cc1bce6e3699add8767e82e700c718bc04601268dca1b884e55","abstract_canon_sha256":"5c3871eff56bff92b8a0a30d0db50d9978710d95f1d6dbf729b36efb96d7340f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:46.196552Z","signature_b64":"+7fJa74wm+RgJ8zVBoXQbE+q5HelXGSFe3TbvdMCADpc6Jser91aBk+n1HFP0zgdW9JCz3t6zrEXuUJpRrnyDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0a612a75003c374a0d6bb94f2a75f1a9e4a56978cee08ee17fa06c7ee83a2611","last_reissued_at":"2026-05-17T23:38:46.196120Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:46.196120Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"NSA introduces a natively trainable sparse attention that matches full attention performance on long contexts while delivering major speedups.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Chong Ruan, Damai Dai, Huazuo Gao, Jingyang Yuan, Junyu Luo, Lean Wang, Liang Zhao, Ming Zhang, Wangding Zeng, Wenfeng Liang, Yuqing Wang, Y. X. Wei, Zhenda Xie, Zhengyan Zhang, Zhiping Xiao","submitted_at":"2025-02-16T11:53:44Z","abstract_excerpt":"Long-context modeling is crucial for next-generation language models, yet the high computational cost of standard attention mechanisms poses significant computational challenges. Sparse attention offers a promising direction for improving efficiency while maintaining model capabilities. We present NSA, a Natively trainable Sparse Attention mechanism that integrates algorithmic innovations with hardware-aligned optimizations to achieve efficient long-context modeling. NSA employs a dynamic hierarchical sparse strategy, combining coarse-grained token compression with fine-grained token selection"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experiments show the model pretrained with NSA maintains or exceeds Full Attention models across general benchmarks, long-context tasks, and instruction-based reasoning, while achieving substantial speedups over Full Attention on 64k-length sequences across decoding, forward, and backward propagation.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the dynamic hierarchical sparse strategy (coarse compression plus fine selection) preserves both global context awareness and local precision without introducing systematic biases that would degrade performance on unseen long-context distributions.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"NSA is a hardware-aligned sparse attention mechanism that enables end-to-end trainable long-context modeling by combining coarse token compression with fine-grained selection.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"NSA introduces a natively trainable sparse attention that matches full attention performance on long contexts while delivering major speedups.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"6b3d73f22a95be9348650313a8803a1ed44cfbbb8dbbc57619a748c5a6de06be"},"source":{"id":"2502.11089","kind":"arxiv","version":2},"verdict":{"id":"9296ca77-a347-49d3-af7e-5de7c3339b88","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T23:43:08.833294Z","strongest_claim":"Experiments show the model pretrained with NSA maintains or exceeds Full Attention models across general benchmarks, long-context tasks, and instruction-based reasoning, while achieving substantial speedups over Full Attention on 64k-length sequences across decoding, forward, and backward propagation.","one_line_summary":"NSA is a hardware-aligned sparse attention mechanism that enables end-to-end trainable long-context modeling by combining coarse token compression with fine-grained selection.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the dynamic hierarchical sparse strategy (coarse compression plus fine selection) preserves both global context awareness and local precision without introducing systematic biases that would degrade performance on unseen long-context distributions.","pith_extraction_headline":"NSA introduces a natively trainable sparse attention that matches full attention performance on long contexts while delivering major speedups."},"references":{"count":65,"sample":[{"doi":"","year":2024,"title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","work_id":"1e1df141-cac8-47fd-b068-c4c96e51e331","ref_index":10,"cited_arxiv_id":"2405.04434","is_internal_anchor":true},{"doi":"","year":2025,"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","ref_index":11,"cited_arxiv_id":"2501.12948","is_internal_anchor":true},{"doi":"","year":2023,"title":"G. Kamradt. LLMTest NeedleInAHaystack . GitHub repository, 2023. URL https://github.com/gkamradt/LLMTest_NeedleInAHaystack. Accessed: [Insert Access Date Here]","work_id":"bd0cb6f6-cd09-4320-9bce-7ea216ab6b50","ref_index":22,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"J. S. Park, J. C. O'Brien, C. J. Cai, M. R. Morris, P. Liang, and M. S. Bernstein. Generative agents: Interactive simulacra of human behavior. In S. Follmer, J. Han, J. Steimle, and N. H. Riche, edito","work_id":"e27cce8a-f8c3-4deb-83b6-ae0bbf253c46","ref_index":26,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"B. Peng, J. Quesnelle, H. Fan, and E. Shippole. Yarn: Efficient context window extension of large language models. In ICLR . OpenReview.net, 2024","work_id":"6bae5175-6cd1-4631-a964-7b1984532309","ref_index":27,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":65,"snapshot_sha256":"2c0626aaef195172790711f872b821201df63ce4cd888875a85a878a66d0a2b0","internal_anchors":24},"formal_canon":{"evidence_count":2,"snapshot_sha256":"812bf6ac4b00b9505aba82c7f1ea18232561654ac7cc322ff60e51eb70083e11"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2502.11089","created_at":"2026-05-17T23:38:46.196188+00:00"},{"alias_kind":"arxiv_version","alias_value":"2502.11089v2","created_at":"2026-05-17T23:38:46.196188+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.11089","created_at":"2026-05-17T23:38:46.196188+00:00"},{"alias_kind":"pith_short_12","alias_value":"BJQSU5IAHQ3U","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"BJQSU5IAHQ3UUDLL","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"BJQSU5IA","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":26,"internal_anchor_count":26,"sample":[{"citing_arxiv_id":"2512.13368","citing_title":"BlossomRec: Block-level Fused Sparse Attention Mechanism for Sequential Recommendations","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2510.18830","citing_title":"MTraining: Distributed Dynamic Sparse Attention for Efficient Ultra-Long Context Training","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2512.19179","citing_title":"CascadeInfer: Length-Aware Scheduling of LLM Serving with Low Latency and Load Balancing","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19652","citing_title":"Characterizing Real-World Bugs in Tile Programs for Automated Bug Detection","ref_index":90,"is_internal_anchor":true},{"citing_arxiv_id":"2508.16703","citing_title":"ShadowNPU: System and Algorithm Co-design for NPU-Centric On-Device LLM Inference","ref_index":79,"is_internal_anchor":true},{"citing_arxiv_id":"2510.09883","citing_title":"DELTA: Dynamic Layer-Aware Token Attention for Efficient Long-Context Reasoning","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2510.18245","citing_title":"Scaling Laws Meet Model Architecture: Toward Inference-Efficient LLMs","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2511.03092","citing_title":"SnapStream: Efficient Long Sequence Decoding on Dataflow Accelerators","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2512.12131","citing_title":"BOOST: BOttleneck-Optimized Scalable Training Framework for Low-Rank Large Language Models","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2512.12087","citing_title":"BLASST: Dynamic BLocked Attention Sparsity via Softmax Thresholding","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2507.02259","citing_title":"MemAgent: Reshaping Long-Context LLM with Multi-Conv RL-based Memory Agent","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14037","citing_title":"Self-Pruned Key-Value Attention: Learning When to Write by Predicting Future Utility","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2603.25777","citing_title":"Challenges and opportunities for AI to help deliver fusion energy","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13465","citing_title":"Z-Order Transformer for Feed-Forward Gaussian Splatting","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2510.26692","citing_title":"Kimi Linear: An Expressive, Efficient Attention Architecture","ref_index":122,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11733","citing_title":"Position: LLM Inference Should Be Evaluated as Energy-to-Token Production","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2506.13585","citing_title":"MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2505.06708","citing_title":"Gated Attention for Large Language Models: Non-linearity, Sparsity, and Attention-Sink-Free","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22312","citing_title":"Guess-Verify-Refine: Data-Aware Top-K for Sparse-Attention Decoding on Blackwell via Temporal Correlation","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06554","citing_title":"Long Context Pre-Training with Lighthouse Attention","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05365","citing_title":"ZAYA1-8B Technical Report","ref_index":145,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18137","citing_title":"AQPIM: Breaking the PIM Capacity Wall for LLMs with In-Memory Activation Quantization","ref_index":72,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07363","citing_title":"MISA: Mixture of Indexer Sparse Attention for Long-Context LLM Inference","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06169","citing_title":"In-Place Test-Time Training","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18002","citing_title":"Neural Garbage Collection: Learning to Forget while Learning to Reason","ref_index":15,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH","json":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH.json","graph_json":"https://pith.science/api/pith-number/BJQSU5IAHQ3UUDLLXFHSU5PRVH/graph.json","events_json":"https://pith.science/api/pith-number/BJQSU5IAHQ3UUDLLXFHSU5PRVH/events.json","paper":"https://pith.science/paper/BJQSU5IA"},"agent_actions":{"view_html":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH","download_json":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH.json","view_paper":"https://pith.science/paper/BJQSU5IA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2502.11089&json=true","fetch_graph":"https://pith.science/api/pith-number/BJQSU5IAHQ3UUDLLXFHSU5PRVH/graph.json","fetch_events":"https://pith.science/api/pith-number/BJQSU5IAHQ3UUDLLXFHSU5PRVH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH/action/storage_attestation","attest_author":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH/action/author_attestation","sign_citation":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH/action/citation_signature","submit_replication":"https://pith.science/pith/BJQSU5IAHQ3UUDLLXFHSU5PRVH/action/replication_record"}},"created_at":"2026-05-17T23:38:46.196188+00:00","updated_at":"2026-05-17T23:38:46.196188+00:00"}