{"work":{"id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","openalex_id":null,"doi":null,"arxiv_id":"2403.07974","raw_key":null,"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","authors":null,"authors_text":"Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang","year":2024,"venue":"cs.SE","abstract":"Large Language Models (LLMs) applied to code-related applications have emerged as a prominent field, attracting significant interest from both academia and industry. However, as new and improved LLMs are developed, existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient for assessing their capabilities. In this work, we propose LiveCodeBench, a comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our benchmark also focuses on a broader range of code related capabilities, such as self-repair, code execution, and test output prediction, beyond just code generation. Currently, LiveCodeBench hosts four hundred high-quality coding problems that were published between May 2023 and May 2024. We have evaluated 18 base LLMs and 34 instruction-tuned LLMs on LiveCodeBench. We present empirical findings on contamination, holistic performance comparisons, potential overfitting in existing benchmarks as well as individual model comparisons. We will release all prompts and model completions for further community analysis, along with a general toolkit for adding new scenarios and model","external_url":"https://arxiv.org/abs/2403.07974","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T05:55:24.596338+00:00","pith_arxiv_id":"2403.07974","created_at":"2026-05-09T01:29:32.394427+00:00","updated_at":"2026-05-25T05:55:24.596338+00:00","title_quality_ok":true,"display_title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","render_title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code"},"hub":{"state":{"work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":181,"external_cited_by_count":null,"distinct_field_count":10,"first_pith_cited_at":"2024-02-15T02:24:46+00:00","last_pith_cited_at":"2026-05-22T14:09:41+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-01T14:53:29.455820+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"dataset","n":24},{"context_role":"background","n":16},{"context_role":"contradiction","n":1},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"use_dataset","n":23},{"context_polarity":"background","n":17},{"context_polarity":"contest","n":1},{"context_polarity":"unclear","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","claims":[{"claim_text":"Large Language Models (LLMs) applied to code-related applications have emerged as a prominent field, attracting significant interest from both academia and industry. However, as new and improved LLMs are developed, existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient for assessing their capabilities. In this work, we propose LiveCodeBench, a comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our benchma","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T19:56:20.296141+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"e12e7e3c-f437-456e-9e86-8dec2620869b","orcid":null,"display_name":"Naman Jain"},{"id":"14e0d353-3669-4148-907e-d598fb18553b","orcid":null,"display_name":"King Han"},{"id":"048258f2-2798-4076-8794-698b27444f16","orcid":null,"display_name":"Alex Gu"},{"id":"f3d7bde5-d083-4a3d-adc7-b2b3cbcc56fa","orcid":null,"display_name":"Wen-Ding Li"},{"id":"791a4ae3-f85f-446e-be3b-2553a9e365b0","orcid":null,"display_name":"Fanjia Yan"},{"id":"3af63c1b-861c-481d-b7ed-5864bcb60b2a","orcid":null,"display_name":"Tianjun Zhang"}]},"error":null,"updated_at":"2026-05-14T19:56:21.212080+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T06:17:24.878832+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":37},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":36},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":36},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":32},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":29},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":25},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":22},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":21},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":19},{"title":"Qwen2.5-Coder Technical Report","work_id":"09ba463d-6377-4017-9801-444ffb94b056","shared_citers":16},{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","shared_citers":16},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":14},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":14},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":13},{"title":"GPQA: A Graduate-Level Google-Proof Q&A Benchmark","work_id":"9e2a976b-f5ad-4aee-af5c-243fe0fe75d2","shared_citers":13},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":13},{"title":"Instruction-Following Evaluation for Large Language Models","work_id":"3aa06177-125a-4f5a-8f4a-8070c5986c26","shared_citers":13},{"title":"Code Llama: Open Foundation Models for Code","work_id":"e73bffa4-7620-47ac-9327-259a60db52ca","shared_citers":11},{"title":"DeepSeek-Coder: When the Large Language Model Meets Programming -- The Rise of Code Intelligence","work_id":"f22dae5a-27e2-41d0-a061-c4286418dee3","shared_citers":11},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":11},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":11},{"title":"DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models","work_id":"07c85cc5-4086-4abc-823b-6d0f4ff784d0","shared_citers":10},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":10},{"title":"gpt-oss-120b & gpt-oss-20b Model Card","work_id":"178c1f7e-4f19-4392-a45d-45a6dfa88ead","shared_citers":10}],"time_series":[{"n":3,"year":2024},{"n":9,"year":2025},{"n":83,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T06:17:17.180047+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T06:17:04.385666+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","claims":[{"claim_text":"Large Language Models (LLMs) applied to code-related applications have emerged as a prominent field, attracting significant interest from both academia and industry. However, as new and improved LLMs are developed, existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient for assessing their capabilities. In this work, we propose LiveCodeBench, a comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our benchma","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T19:56:21.216205+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","claims":[{"claim_text":"Large Language Models (LLMs) applied to code-related applications have emerged as a prominent field, attracting significant interest from both academia and industry. However, as new and improved LLMs are developed, existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient for assessing their capabilities. In this work, we propose LiveCodeBench, a comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our benchma","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T06:17:17.188902+00:00"}},"summary":{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","claims":[{"claim_text":"Large Language Models (LLMs) applied to code-related applications have emerged as a prominent field, attracting significant interest from both academia and industry. However, as new and improved LLMs are developed, existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient for assessing their capabilities. In this work, we propose LiveCodeBench, a comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our benchma","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":37},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":36},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":36},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":32},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":29},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":25},{"title":"Measuring Mathematical Problem Solving With the MATH Dataset","work_id":"50652ac6-fb7c-4675-a2c2-159c241feb17","shared_citers":22},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":21},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":19},{"title":"Qwen2.5-Coder Technical Report","work_id":"09ba463d-6377-4017-9801-444ffb94b056","shared_citers":16},{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","shared_citers":16},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":14},{"title":"Group Sequence Policy Optimization","work_id":"3a98b53b-9f52-4d95-adf7-89353c0a9a65","shared_citers":14},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":13},{"title":"GPQA: A Graduate-Level Google-Proof Q&A Benchmark","work_id":"9e2a976b-f5ad-4aee-af5c-243fe0fe75d2","shared_citers":13},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":13},{"title":"Instruction-Following Evaluation for Large Language Models","work_id":"3aa06177-125a-4f5a-8f4a-8070c5986c26","shared_citers":13},{"title":"Code Llama: Open Foundation Models for Code","work_id":"e73bffa4-7620-47ac-9327-259a60db52ca","shared_citers":11},{"title":"DeepSeek-Coder: When the Large Language Model Meets Programming -- The Rise of Code Intelligence","work_id":"f22dae5a-27e2-41d0-a061-c4286418dee3","shared_citers":11},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":11},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":11},{"title":"DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models","work_id":"07c85cc5-4086-4abc-823b-6d0f4ff784d0","shared_citers":10},{"title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","work_id":"008df105-2fdd-45d8-857a-8e35868aecb6","shared_citers":10},{"title":"gpt-oss-120b & gpt-oss-20b Model Card","work_id":"178c1f7e-4f19-4392-a45d-45a6dfa88ead","shared_citers":10}],"time_series":[{"n":3,"year":2024},{"n":9,"year":2025},{"n":83,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"048258f2-2798-4076-8794-698b27444f16","orcid":null,"display_name":"Alex Gu","source":"manual","import_confidence":0.72},{"id":"791a4ae3-f85f-446e-be3b-2553a9e365b0","orcid":null,"display_name":"Fanjia Yan","source":"manual","import_confidence":0.72},{"id":"14e0d353-3669-4148-907e-d598fb18553b","orcid":null,"display_name":"King Han","source":"manual","import_confidence":0.72},{"id":"e12e7e3c-f437-456e-9e86-8dec2620869b","orcid":null,"display_name":"Naman Jain","source":"manual","import_confidence":0.72},{"id":"3af63c1b-861c-481d-b7ed-5864bcb60b2a","orcid":null,"display_name":"Tianjun Zhang","source":"manual","import_confidence":0.72},{"id":"f3d7bde5-d083-4a3d-adc7-b2b3cbcc56fa","orcid":null,"display_name":"Wen-Ding Li","source":"manual","import_confidence":0.72}]}}