{"work":{"id":"4dd94e2f-2b27-4cbf-88a0-4910f0772a57","openalex_id":null,"doi":null,"arxiv_id":"2408.00118","raw_key":null,"title":"Gemma 2: Improving Open Language Models at a Practical Size","authors":null,"authors_text":"Gemma Team: Morgane Riviere, Shreya Pathak, Pier Giuseppe Sessa, Cassidy Hardin, Surya Bhupatiraju, L\\'eonard Hussenot","year":2024,"venue":"cs.CL","abstract":"In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3 times bigger. We release all our models to the community.","external_url":"https://arxiv.org/abs/2408.00118","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T12:23:24.481670+00:00","pith_arxiv_id":"2408.00118","created_at":"2026-05-09T05:45:22.428099+00:00","updated_at":"2026-06-29T12:23:24.481670+00:00","title_quality_ok":true,"display_title":"Gemma 2: Improving Open Language Models at a Practical Size","render_title":"Gemma 2: Improving Open Language Models at a Practical Size"},"hub":{"state":{"work_id":"4dd94e2f-2b27-4cbf-88a0-4910f0772a57","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":230,"external_cited_by_count":null,"distinct_field_count":18,"first_pith_cited_at":"2023-03-31T17:28:46+00:00","last_pith_cited_at":"2026-06-16T15:33:49+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T12:28:45.746418+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":23},{"context_role":"method","n":6},{"context_role":"baseline","n":2},{"context_role":"dataset","n":1},{"context_role":"other","n":1}],"polarity_counts":[{"context_polarity":"background","n":21},{"context_polarity":"use_method","n":6},{"context_polarity":"unclear","n":3},{"context_polarity":"baseline","n":2},{"context_polarity":"use_dataset","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Gemma 2: Improving Open Language Models at a Practical Size","claims":[{"claim_text":"In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer compe","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Gemma 2: Improving Open Language Models at a Practical Size because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T01:14:12.617581+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"4ae7be16-33f4-4ede-bb46-caaaf2d155cd","orcid":null,"display_name":"Gemma Team: Morgane Riviere"},{"id":"b3928e5b-c66a-4e05-bbaf-ec0fd8a77930","orcid":null,"display_name":"Shreya Pathak"},{"id":"a4162185-272d-4479-8134-de3e840a2bb6","orcid":null,"display_name":"Pier Giuseppe Sessa"},{"id":"7ac1c3f7-453d-4f58-96bc-86ed4727d11d","orcid":null,"display_name":"Cassidy Hardin"},{"id":"9bb70fa2-ecf3-4959-b786-94ce7ef2b6ca","orcid":null,"display_name":"Surya Bhupatiraju"},{"id":"c7a56105-2dfc-4175-9ca3-4e0040621d65","orcid":null,"display_name":"L\\'eonard Hussenot"}]},"error":null,"updated_at":"2026-05-14T01:14:12.611504+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T01:14:07.070825+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":50},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":33},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":28},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":27},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":18},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":16},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":14},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":14},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":11},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":11},{"title":"Qwen2 Technical Report","work_id":"a1857881-ab9b-4b80-9b5f-9ae4b5c2566d","shared_citers":11},{"title":"Gemma 3 Technical Report","work_id":"f93e08bf-9e96-409b-8ac6-b8385fd17fd7","shared_citers":10},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":9},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":9},{"title":"Sparse Autoencoders Find Highly Interpretable Features in Language Models","work_id":"51960d72-c69f-4db8-8efd-e90e8b4d9524","shared_citers":9},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":8},{"title":"Gemma: Open Models Based on Gemini Research and Technology","work_id":"a9ea2870-df28-40b8-a9e0-a7e9a116f793","shared_citers":8},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":8},{"title":"Phi-4 Technical Report","work_id":"b6274271-7af9-4ee8-993b-ba1ba4205ba8","shared_citers":8},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":8},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":8},{"title":"Representation Engineering: A Top-Down Approach to AI Transparency","work_id":"45b326e2-e962-41a5-a542-2559e103a19b","shared_citers":8},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":8},{"title":"Steering Language Models With Activation Engineering","work_id":"d525fe06-5560-4e97-86fc-7a0e551f5b17","shared_citers":8}],"time_series":[{"n":1,"year":2023},{"n":3,"year":2024},{"n":7,"year":2025},{"n":102,"year":2026}]},"error":null,"updated_at":"2026-05-14T01:14:18.076892+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T01:14:02.518507+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Gemma 2: Improving Open Language Models at a Practical Size","claims":[{"claim_text":"In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer compe","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Gemma 2: Improving Open Language Models at a Practical Size because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T01:14:02.529573+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Gemma 2: Improving Open Language Models at a Practical Size","claims":[{"claim_text":"In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer compe","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Gemma 2: Improving Open Language Models at a Practical Size because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T01:14:12.626112+00:00"}},"summary":{"title":"Gemma 2: Improving Open Language Models at a Practical Size","claims":[{"claim_text":"In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. The resulting models deliver the best performance for their size, and even offer compe","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Gemma 2: Improving Open Language Models at a Practical Size because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":50},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":33},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":28},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":27},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":18},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":16},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":14},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":14},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":11},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":11},{"title":"Qwen2 Technical Report","work_id":"a1857881-ab9b-4b80-9b5f-9ae4b5c2566d","shared_citers":11},{"title":"Gemma 3 Technical Report","work_id":"f93e08bf-9e96-409b-8ac6-b8385fd17fd7","shared_citers":10},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":9},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":9},{"title":"Sparse Autoencoders Find Highly Interpretable Features in Language Models","work_id":"51960d72-c69f-4db8-8efd-e90e8b4d9524","shared_citers":9},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":8},{"title":"Gemma: Open Models Based on Gemini Research and Technology","work_id":"a9ea2870-df28-40b8-a9e0-a7e9a116f793","shared_citers":8},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":8},{"title":"Phi-4 Technical Report","work_id":"b6274271-7af9-4ee8-993b-ba1ba4205ba8","shared_citers":8},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":8},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":8},{"title":"Representation Engineering: A Top-Down Approach to AI Transparency","work_id":"45b326e2-e962-41a5-a542-2559e103a19b","shared_citers":8},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":8},{"title":"Steering Language Models With Activation Engineering","work_id":"d525fe06-5560-4e97-86fc-7a0e551f5b17","shared_citers":8}],"time_series":[{"n":1,"year":2023},{"n":3,"year":2024},{"n":7,"year":2025},{"n":102,"year":2026}]},"authors":[{"id":"7ac1c3f7-453d-4f58-96bc-86ed4727d11d","orcid":null,"display_name":"Cassidy Hardin","source":"manual","import_confidence":0.72},{"id":"4ae7be16-33f4-4ede-bb46-caaaf2d155cd","orcid":null,"display_name":"Gemma Team: Morgane Riviere","source":"manual","import_confidence":0.72},{"id":"c7a56105-2dfc-4175-9ca3-4e0040621d65","orcid":null,"display_name":"L\\'eonard Hussenot","source":"manual","import_confidence":0.72},{"id":"a4162185-272d-4479-8134-de3e840a2bb6","orcid":null,"display_name":"Pier Giuseppe Sessa","source":"manual","import_confidence":0.72},{"id":"b3928e5b-c66a-4e05-bbaf-ec0fd8a77930","orcid":null,"display_name":"Shreya Pathak","source":"manual","import_confidence":0.72},{"id":"9bb70fa2-ecf3-4959-b786-94ce7ef2b6ca","orcid":null,"display_name":"Surya Bhupatiraju","source":"manual","import_confidence":0.72}]}}