{"work":{"id":"f93e08bf-9e96-409b-8ac6-b8385fd17fd7","openalex_id":null,"doi":null,"arxiv_id":"2503.19786","raw_key":null,"title":"Gemma 3 Technical Report","authors":null,"authors_text":"Gemma Team","year":2025,"venue":"cs.CL","abstract":"We introduce Gemma 3, a multimodal addition to the Gemma family of lightweight open models, ranging in scale from 1 to 27 billion parameters. This version introduces vision understanding abilities, a wider coverage of languages and longer context - at least 128K tokens. We also change the architecture of the model to reduce the KV-cache memory that tends to explode with long context. This is achieved by increasing the ratio of local to global attention layers, and keeping the span on local attention short. The Gemma 3 models are trained with distillation and achieve superior performance to Gemma 2 for both pre-trained and instruction finetuned versions. In particular, our novel post-training recipe significantly improves the math, chat, instruction-following and multilingual abilities, making Gemma3-4B-IT competitive with Gemma2-27B-IT and Gemma3-27B-IT comparable to Gemini-1.5-Pro across benchmarks. We release all our models to the community.","external_url":"https://arxiv.org/abs/2503.19786","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T08:15:34.009996+00:00","pith_arxiv_id":"2503.19786","created_at":"2026-05-08T20:14:08.829658+00:00","updated_at":"2026-06-05T21:23:00.469572+00:00","title_quality_ok":false,"display_title":"Gemma 3 Technical Report","render_title":"Gemma 3 Technical Report"},"hub":{"state":{"work_id":"f93e08bf-9e96-409b-8ac6-b8385fd17fd7","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":352,"external_cited_by_count":null,"distinct_field_count":26,"first_pith_cited_at":"2024-10-16T20:51:42+00:00","last_pith_cited_at":"2026-05-22T17:56:37+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-06T21:31:02.595887+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":49},{"context_role":"baseline","n":10},{"context_role":"other","n":4},{"context_role":"dataset","n":3},{"context_role":"method","n":2}],"polarity_counts":[{"context_polarity":"background","n":47},{"context_polarity":"baseline","n":10},{"context_polarity":"unclear","n":6},{"context_polarity":"use_dataset","n":3},{"context_polarity":"use_method","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Gemma 3 Technical Report","claims":[{"claim_text":"We introduce Gemma 3, a multimodal addition to the Gemma family of lightweight open models, ranging in scale from 1 to 27 billion parameters. This version introduces vision understanding abilities, a wider coverage of languages and longer context - at least 128K tokens. We also change the architecture of the model to reduce the KV-cache memory that tends to explode with long context. This is achieved by increasing the ratio of local to global attention layers, and keeping the span on local attention short. The Gemma 3 models are trained with distillation and achieve superior performance to Gem","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Gemma 3 Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T21:03:37.287304+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"27d108d1-4381-4e6a-9808-1ef8cbd36872","orcid":null,"display_name":"Gemma Team"}]},"error":null,"updated_at":"2026-05-13T20:53:34.983964+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T20:53:36.373922+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":71},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":64},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":28},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":25},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":22},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":19},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":19},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":18},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":17},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":17},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":16},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":16},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":15},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":13},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":12},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":11},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":11},{"title":"gpt-oss-120b & gpt-oss-20b Model Card","work_id":"178c1f7e-4f19-4392-a45d-45a6dfa88ead","shared_citers":11},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":11},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":11},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":11},{"title":"Gemma 2: Improving Open Language Models at a Practical Size","work_id":"4dd94e2f-2b27-4cbf-88a0-4910f0772a57","shared_citers":10},{"title":"Language Models (Mostly) Know What They Know","work_id":"8ca58a10-da41-4f70-baae-7e449512e345","shared_citers":9},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":9}],"time_series":[{"n":3,"year":2025},{"n":170,"year":2026}]},"error":null,"updated_at":"2026-05-13T20:53:35.387282+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T20:53:34.751395+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Gemma 3 Technical Report","claims":[{"claim_text":"We introduce Gemma 3, a multimodal addition to the Gemma family of lightweight open models, ranging in scale from 1 to 27 billion parameters. This version introduces vision understanding abilities, a wider coverage of languages and longer context - at least 128K tokens. We also change the architecture of the model to reduce the KV-cache memory that tends to explode with long context. This is achieved by increasing the ratio of local to global attention layers, and keeping the span on local attention short. The Gemma 3 models are trained with distillation and achieve superior performance to Gem","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Gemma 3 Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T20:53:34.986131+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Gemma 3 Technical Report","claims":[{"claim_text":"We introduce Gemma 3, a multimodal addition to the Gemma family of lightweight open models, ranging in scale from 1 to 27 billion parameters. This version introduces vision understanding abilities, a wider coverage of languages and longer context - at least 128K tokens. We also change the architecture of the model to reduce the KV-cache memory that tends to explode with long context. This is achieved by increasing the ratio of local to global attention layers, and keeping the span on local attention short. The Gemma 3 models are trained with distillation and achieve superior performance to Gem","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Gemma 3 Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T20:53:32.300495+00:00"}},"summary":{"title":"Gemma 3 Technical Report","claims":[{"claim_text":"We introduce Gemma 3, a multimodal addition to the Gemma family of lightweight open models, ranging in scale from 1 to 27 billion parameters. This version introduces vision understanding abilities, a wider coverage of languages and longer context - at least 128K tokens. We also change the architecture of the model to reduce the KV-cache memory that tends to explode with long context. This is achieved by increasing the ratio of local to global attention layers, and keeping the span on local attention short. The Gemma 3 models are trained with distillation and achieve superior performance to Gem","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Gemma 3 Technical Report because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":71},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":64},{"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","shared_citers":28},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":25},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":22},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":19},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":19},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":18},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":17},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":17},{"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","shared_citers":16},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":16},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":15},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":13},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":12},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":11},{"title":"Gemini: A Family of Highly Capable Multimodal Models","work_id":"83f7c85b-3f11-450f-ac0c-64d9745220b2","shared_citers":11},{"title":"gpt-oss-120b & gpt-oss-20b Model Card","work_id":"178c1f7e-4f19-4392-a45d-45a6dfa88ead","shared_citers":11},{"title":"InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency","work_id":"b8f5e260-fff5-444e-bcf5-2c42cfefd83d","shared_citers":11},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":11},{"title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution","work_id":"8abcfe4f-e0fb-44b7-9123-448fac95f90a","shared_citers":11},{"title":"Gemma 2: Improving Open Language Models at a Practical Size","work_id":"4dd94e2f-2b27-4cbf-88a0-4910f0772a57","shared_citers":10},{"title":"Language Models (Mostly) Know What They Know","work_id":"8ca58a10-da41-4f70-baae-7e449512e345","shared_citers":9},{"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","shared_citers":9}],"time_series":[{"n":3,"year":2025},{"n":170,"year":2026}]},"authors":[{"id":"27d108d1-4381-4e6a-9808-1ef8cbd36872","orcid":null,"display_name":"Gemma Team","source":"manual","import_confidence":0.72}]}}