{"work":{"id":"160ea164-b1d4-4adb-8ccb-a4655d8a0bb4","openalex_id":null,"doi":null,"arxiv_id":"1911.02150","raw_key":null,"title":"Fast Transformer Decoding: One Write-Head is All You Need","authors":null,"authors_text":"Noam Shazeer","year":2019,"venue":"cs.NE","abstract":"Multi-head attention layers, as used in the Transformer neural sequence model, are a powerful alternative to RNNs for moving information across and between sequences. While training these layers is generally fast and simple, due to parallelizability across the length of the sequence, incremental inference (where such paralleization is impossible) is often slow, due to the memory-bandwidth cost of repeatedly loading the large \"keys\" and \"values\" tensors. We propose a variant called multi-query attention, where the keys and values are shared across all of the different attention \"heads\", greatly reducing the size of these tensors and hence the memory bandwidth requirements of incremental decoding. We verify experimentally that the resulting models can indeed be much faster to decode, and incur only minor quality degradation from the baseline.","external_url":"https://arxiv.org/abs/1911.02150","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T08:43:15.921059+00:00","pith_arxiv_id":"1911.02150","created_at":"2026-05-09T06:15:37.701667+00:00","updated_at":"2026-06-29T08:43:15.921059+00:00","title_quality_ok":true,"display_title":"Fast Transformer Decoding: One Write-Head is All You Need","render_title":"Fast Transformer Decoding: One Write-Head is All You Need"},"hub":{"state":{"work_id":"160ea164-b1d4-4adb-8ccb-a4655d8a0bb4","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":90,"external_cited_by_count":null,"distinct_field_count":11,"first_pith_cited_at":"2020-06-30T10:42:02+00:00","last_pith_cited_at":"2026-05-30T05:05:24+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T10:48:36.909278+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":17},{"context_role":"method","n":6},{"context_role":"dataset","n":1}],"polarity_counts":[{"context_polarity":"background","n":16},{"context_polarity":"use_method","n":5},{"context_polarity":"unclear","n":2},{"context_polarity":"use_dataset","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T13:31:02.313374+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints","work_id":"b73ad5b2-e553-4c71-b0c9-67e67ba7b158","shared_citers":19},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":13},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":12},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":11},{"title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","work_id":"c888e6d1-0b1d-43d6-9ef5-f0912a0efa1b","shared_citers":11},{"title":"Longformer: The Long-Document Transformer","work_id":"abea7a44-6668-4de7-aab6-f53a6e5aa088","shared_citers":10},{"title":"RoFormer: Enhanced Transformer with Rotary Position Embedding","work_id":"4e5eee26-cd04-4c7a-988f-3e6d1a1f0eb9","shared_citers":10},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":9},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":9},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":9},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":9},{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":9},{"title":"Efficient Streaming Language Models with Attention Sinks","work_id":"a8d25452-c237-48c9-88a4-682717c3979a","shared_citers":8},{"title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning","work_id":"fff3953b-5efb-4753-bee4-002f59995810","shared_citers":8},{"title":"Linformer: Self-Attention with Linear Complexity","work_id":"4b717b51-6098-45d0-8e9e-b69bef651bc3","shared_citers":8},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":8},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":8},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":8},{"title":"GLU Variants Improve Transformer","work_id":"17d0763c-1016-41ab-a478-478e890765eb","shared_citers":7},{"title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers","work_id":"19ed8c44-202a-48f6-8169-637d5a5f2408","shared_citers":7},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":7},{"title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","work_id":"1e1df141-cac8-47fd-b068-c4c96e51e331","shared_citers":6},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":6},{"title":"Generating Long Sequences with Sparse Transformers","work_id":"c5b81688-45ee-4a9a-b095-e6290f45cb6c","shared_citers":6}],"time_series":[{"n":1,"year":2020},{"n":2,"year":2022},{"n":6,"year":2023},{"n":11,"year":2024},{"n":2,"year":2025},{"n":29,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T13:41:19.045007+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T13:31:10.706990+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Fast Transformer Decoding: One Write-Head is All You Need","claims":[{"claim_text":"Multi-head attention layers, as used in the Transformer neural sequence model, are a powerful alternative to RNNs for moving information across and between sequences. While training these layers is generally fast and simple, due to parallelizability across the length of the sequence, incremental inference (where such paralleization is impossible) is often slow, due to the memory-bandwidth cost of repeatedly loading the large \"keys\" and \"values\" tensors. We propose a variant called multi-query attention, where the keys and values are shared across all of the different attention \"heads\", greatly","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Fast Transformer Decoding: One Write-Head is All You Need because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T13:41:08.864774+00:00"}},"summary":{"title":"Fast Transformer Decoding: One Write-Head is All You Need","claims":[{"claim_text":"Multi-head attention layers, as used in the Transformer neural sequence model, are a powerful alternative to RNNs for moving information across and between sequences. While training these layers is generally fast and simple, due to parallelizability across the length of the sequence, incremental inference (where such paralleization is impossible) is often slow, due to the memory-bandwidth cost of repeatedly loading the large \"keys\" and \"values\" tensors. We propose a variant called multi-query attention, where the keys and values are shared across all of the different attention \"heads\", greatly","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Fast Transformer Decoding: One Write-Head is All You Need because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints","work_id":"b73ad5b2-e553-4c71-b0c9-67e67ba7b158","shared_citers":19},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":13},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":12},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":11},{"title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","work_id":"c888e6d1-0b1d-43d6-9ef5-f0912a0efa1b","shared_citers":11},{"title":"Longformer: The Long-Document Transformer","work_id":"abea7a44-6668-4de7-aab6-f53a6e5aa088","shared_citers":10},{"title":"RoFormer: Enhanced Transformer with Rotary Position Embedding","work_id":"4e5eee26-cd04-4c7a-988f-3e6d1a1f0eb9","shared_citers":10},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":9},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":9},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":9},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":9},{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":9},{"title":"Efficient Streaming Language Models with Attention Sinks","work_id":"a8d25452-c237-48c9-88a4-682717c3979a","shared_citers":8},{"title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning","work_id":"fff3953b-5efb-4753-bee4-002f59995810","shared_citers":8},{"title":"Linformer: Self-Attention with Linear Complexity","work_id":"4b717b51-6098-45d0-8e9e-b69bef651bc3","shared_citers":8},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":8},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":8},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":8},{"title":"GLU Variants Improve Transformer","work_id":"17d0763c-1016-41ab-a478-478e890765eb","shared_citers":7},{"title":"GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers","work_id":"19ed8c44-202a-48f6-8169-637d5a5f2408","shared_citers":7},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":7},{"title":"DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model","work_id":"1e1df141-cac8-47fd-b068-c4c96e51e331","shared_citers":6},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":6},{"title":"Generating Long Sequences with Sparse Transformers","work_id":"c5b81688-45ee-4a9a-b095-e6290f45cb6c","shared_citers":6}],"time_series":[{"n":1,"year":2020},{"n":2,"year":2022},{"n":6,"year":2023},{"n":11,"year":2024},{"n":2,"year":2025},{"n":29,"year":2026}],"dependency_candidates":[]},"authors":[]}}