{"work":{"id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","openalex_id":null,"doi":null,"arxiv_id":"2310.06825","raw_key":null,"title":"Mistral 7B","authors":null,"authors_text":"Mistral 7B , author =","year":2023,"venue":"cs.CL","abstract":"We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and automated benchmarks. Our models are released under the Apache 2.0 license.","external_url":"https://arxiv.org/abs/2310.06825","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T14:03:29.938422+00:00","pith_arxiv_id":"2310.06825","created_at":"2026-05-09T01:54:34.002860+00:00","updated_at":"2026-06-29T14:03:29.938422+00:00","title_quality_ok":false,"display_title":"Mistral 7B","render_title":"Mistral 7B"},"hub":{"state":{"work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":496,"external_cited_by_count":null,"distinct_field_count":29,"first_pith_cited_at":"2023-05-02T05:46:48+00:00","last_pith_cited_at":"2026-06-26T13:03:29+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T14:48:57.623872+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":57},{"context_role":"method","n":15},{"context_role":"baseline","n":10},{"context_role":"other","n":6},{"context_role":"dataset","n":2}],"polarity_counts":[{"context_polarity":"background","n":55},{"context_polarity":"use_method","n":15},{"context_polarity":"baseline","n":10},{"context_polarity":"unclear","n":8},{"context_polarity":"use_dataset","n":2}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"Mistral 7B","claims":[{"claim_text":"We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and auto","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Mistral 7B because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:36:23.099646+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"103eb511-18f6-49d9-aaaf-8de411df2cd4","orcid":null,"display_name":"Mistral 7B"},{"id":"f070c5b6-0f85-4f52-9b85-94ba37034e53","orcid":null,"display_name":"author ="}]},"error":null,"updated_at":"2026-05-14T18:36:33.300322+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T18:36:23.092655+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":96},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":66},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":55},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":47},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":44},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":41},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":39},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":28},{"title":"Gemma: Open Models Based on Gemini Research and Technology","work_id":"a9ea2870-df28-40b8-a9e0-a7e9a116f793","shared_citers":28},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":28},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":26},{"title":"LoRA: Low-Rank Adaptation of Large Language Models","work_id":"0426219a-789e-4964-adc8-a04538510818","shared_citers":23},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":22},{"title":"Qwen2 Technical Report","work_id":"a1857881-ab9b-4b80-9b5f-9ae4b5c2566d","shared_citers":20},{"title":"Gemma 2: Improving Open Language Models at a Practical Size","work_id":"4dd94e2f-2b27-4cbf-88a0-4910f0772a57","shared_citers":18},{"title":"Mixtral of Experts","work_id":"0de8c352-9daa-4e1e-8c7b-3d0dec69f369","shared_citers":18},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":17},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":17},{"title":"Gemma 3 Technical Report","work_id":"f93e08bf-9e96-409b-8ac6-b8385fd17fd7","shared_citers":17},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":16},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":16},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":16},{"title":"Code Llama: Open Foundation Models for Code","work_id":"e73bffa4-7620-47ac-9327-259a60db52ca","shared_citers":15},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":15}],"time_series":[{"n":2,"year":2023},{"n":19,"year":2024},{"n":3,"year":2025},{"n":209,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T18:36:37.837151+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T18:36:42.123401+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"Mistral 7B","claims":[{"claim_text":"We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and auto","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Mistral 7B because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:36:27.625157+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Mistral 7B","claims":[{"claim_text":"We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and auto","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Mistral 7B because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:36:33.306773+00:00"}},"summary":{"title":"Mistral 7B","claims":[{"claim_text":"We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that surpasses the Llama 2 13B -- Chat model both on human and auto","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Mistral 7B because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":96},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":66},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":55},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":47},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":44},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":41},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":39},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":28},{"title":"Gemma: Open Models Based on Gemini Research and Technology","work_id":"a9ea2870-df28-40b8-a9e0-a7e9a116f793","shared_citers":28},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":28},{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":26},{"title":"LoRA: Low-Rank Adaptation of Large Language Models","work_id":"0426219a-789e-4964-adc8-a04538510818","shared_citers":23},{"title":"Qwen Technical Report","work_id":"bb1fd52f-6b2f-437c-9516-37bdf6eb9be8","shared_citers":22},{"title":"Qwen2 Technical Report","work_id":"a1857881-ab9b-4b80-9b5f-9ae4b5c2566d","shared_citers":20},{"title":"Gemma 2: Improving Open Language Models at a Practical Size","work_id":"4dd94e2f-2b27-4cbf-88a0-4910f0772a57","shared_citers":18},{"title":"Mixtral of Experts","work_id":"0de8c352-9daa-4e1e-8c7b-3d0dec69f369","shared_citers":18},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":17},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":17},{"title":"Gemma 3 Technical Report","work_id":"f93e08bf-9e96-409b-8ac6-b8385fd17fd7","shared_citers":17},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":16},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":16},{"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","shared_citers":16},{"title":"Code Llama: Open Foundation Models for Code","work_id":"e73bffa4-7620-47ac-9327-259a60db52ca","shared_citers":15},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":15}],"time_series":[{"n":2,"year":2023},{"n":19,"year":2024},{"n":3,"year":2025},{"n":209,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"f070c5b6-0f85-4f52-9b85-94ba37034e53","orcid":null,"display_name":"author =","source":"manual","import_confidence":0.72},{"id":"103eb511-18f6-49d9-aaaf-8de411df2cd4","orcid":null,"display_name":"Mistral 7B","source":"manual","import_confidence":0.72}]}}