{"work":{"id":"17d0763c-1016-41ab-a478-478e890765eb","openalex_id":null,"doi":null,"arxiv_id":"2002.05202","raw_key":null,"title":"GLU Variants Improve Transformer","authors":null,"authors_text":"Noam Shazeer","year":2020,"venue":"cs.LG","abstract":"Gated Linear Units (arXiv:1612.08083) consist of the component-wise product of two linear projections, one of which is first passed through a sigmoid function. Variations on GLU are possible, using different nonlinear (or even linear) functions in place of sigmoid. We test these variants in the feed-forward sublayers of the Transformer (arXiv:1706.03762) sequence-to-sequence model, and find that some of them yield quality improvements over the typically-used ReLU or GELU activations.","external_url":"https://arxiv.org/abs/2002.05202","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T14:13:30.408760+00:00","pith_arxiv_id":"2002.05202","created_at":"2026-05-09T04:17:20.279279+00:00","updated_at":"2026-06-29T14:13:30.408760+00:00","title_quality_ok":false,"display_title":"GLU Variants Improve Transformer","render_title":"GLU Variants Improve Transformer"},"hub":{"state":{"work_id":"17d0763c-1016-41ab-a478-478e890765eb","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":213,"external_cited_by_count":null,"distinct_field_count":22,"first_pith_cited_at":"2020-02-10T18:55:58+00:00","last_pith_cited_at":"2026-06-25T10:44:48+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T14:08:56.711080+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":30},{"context_role":"method","n":24},{"context_role":"dataset","n":2},{"context_role":"extension","n":1}],"polarity_counts":[{"context_polarity":"background","n":27},{"context_polarity":"use_method","n":23},{"context_polarity":"unclear","n":4},{"context_polarity":"use_dataset","n":2},{"context_polarity":"extend","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"GLU Variants Improve Transformer","claims":[{"claim_text":"Gated Linear Units (arXiv:1612.08083) consist of the component-wise product of two linear projections, one of which is first passed through a sigmoid function. Variations on GLU are possible, using different nonlinear (or even linear) functions in place of sigmoid. We test these variants in the feed-forward sublayers of the Transformer (arXiv:1706.03762) sequence-to-sequence model, and find that some of them yield quality improvements over the typically-used ReLU or GELU activations.","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GLU Variants Improve Transformer because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T01:54:19.386374+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"d555cf92-dbb8-4cd6-b9cc-3a82d85183de","orcid":null,"display_name":"Noam Shazeer"}]},"error":null,"updated_at":"2026-05-14T01:54:02.734064+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T01:54:08.788023+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":26},{"title":"RoFormer: Enhanced Transformer with Rotary Position Embedding","work_id":"4e5eee26-cd04-4c7a-988f-3e6d1a1f0eb9","shared_citers":20},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":19},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":19},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":18},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":18},{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":17},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":16},{"title":"Gaussian Error Linear Units (GELUs)","work_id":"0466fd22-03a1-4a61-af0a-a900e77bb023","shared_citers":16},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":16},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":16},{"title":"Layer Normalization","work_id":"20a2d720-0046-4c7c-bcd6-327ec8143f69","shared_citers":14},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":13},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":12},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":12},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":11},{"title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints","work_id":"b73ad5b2-e553-4c71-b0c9-67e67ba7b158","shared_citers":11},{"title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","work_id":"4ee75248-1199-492c-a52f-6661e0f4adff","shared_citers":11},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":11},{"title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","work_id":"c888e6d1-0b1d-43d6-9ef5-f0912a0efa1b","shared_citers":11},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":11},{"title":"Attention Is All You Need","work_id":"baafb5a2-5272-43bc-932f-09fa9ffe5316","shared_citers":10},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":10},{"title":"doi: 10.18653/v1/D18-2012","work_id":"81a6320b-c2e1-4d74-a03e-9e1ff6bbed8d","shared_citers":10}],"time_series":[{"n":1,"year":2021},{"n":3,"year":2022},{"n":5,"year":2023},{"n":14,"year":2024},{"n":6,"year":2025},{"n":82,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T01:54:14.718682+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T01:54:23.941404+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"GLU Variants Improve Transformer","claims":[{"claim_text":"Gated Linear Units (arXiv:1612.08083) consist of the component-wise product of two linear projections, one of which is first passed through a sigmoid function. Variations on GLU are possible, using different nonlinear (or even linear) functions in place of sigmoid. We test these variants in the feed-forward sublayers of the Transformer (arXiv:1706.03762) sequence-to-sequence model, and find that some of them yield quality improvements over the typically-used ReLU or GELU activations.","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GLU Variants Improve Transformer because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T01:54:23.946056+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"GLU Variants Improve Transformer","claims":[{"claim_text":"Gated Linear Units (arXiv:1612.08083) consist of the component-wise product of two linear projections, one of which is first passed through a sigmoid function. Variations on GLU are possible, using different nonlinear (or even linear) functions in place of sigmoid. We test these variants in the feed-forward sublayers of the Transformer (arXiv:1706.03762) sequence-to-sequence model, and find that some of them yield quality improvements over the typically-used ReLU or GELU activations.","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GLU Variants Improve Transformer because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T01:54:02.474745+00:00"}},"summary":{"title":"GLU Variants Improve Transformer","claims":[{"claim_text":"Gated Linear Units (arXiv:1612.08083) consist of the component-wise product of two linear projections, one of which is first passed through a sigmoid function. Variations on GLU are possible, using different nonlinear (or even linear) functions in place of sigmoid. We test these variants in the feed-forward sublayers of the Transformer (arXiv:1706.03762) sequence-to-sequence model, and find that some of them yield quality improvements over the typically-used ReLU or GELU activations.","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GLU Variants Improve Transformer because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":26},{"title":"RoFormer: Enhanced Transformer with Rotary Position Embedding","work_id":"4e5eee26-cd04-4c7a-988f-3e6d1a1f0eb9","shared_citers":20},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":19},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":19},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":18},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":18},{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":17},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":16},{"title":"Gaussian Error Linear Units (GELUs)","work_id":"0466fd22-03a1-4a61-af0a-a900e77bb023","shared_citers":16},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":16},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":16},{"title":"Layer Normalization","work_id":"20a2d720-0046-4c7c-bcd6-327ec8143f69","shared_citers":14},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":13},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":12},{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":12},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":11},{"title":"GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints","work_id":"b73ad5b2-e553-4c71-b0c9-67e67ba7b158","shared_citers":11},{"title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","work_id":"4ee75248-1199-492c-a52f-6661e0f4adff","shared_citers":11},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":11},{"title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","work_id":"c888e6d1-0b1d-43d6-9ef5-f0912a0efa1b","shared_citers":11},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":11},{"title":"Attention Is All You Need","work_id":"baafb5a2-5272-43bc-932f-09fa9ffe5316","shared_citers":10},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":10},{"title":"doi: 10.18653/v1/D18-2012","work_id":"81a6320b-c2e1-4d74-a03e-9e1ff6bbed8d","shared_citers":10}],"time_series":[{"n":1,"year":2021},{"n":3,"year":2022},{"n":5,"year":2023},{"n":14,"year":2024},{"n":6,"year":2025},{"n":82,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"d555cf92-dbb8-4cd6-b9cc-3a82d85183de","orcid":null,"display_name":"Noam Shazeer","source":"manual","import_confidence":0.72}]}}