{"work":{"id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","openalex_id":null,"doi":null,"arxiv_id":"2310.06770","raw_key":null,"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","authors":null,"authors_text":"Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press","year":2023,"venue":"cs.CL","abstract":"Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering problems drawn from real GitHub issues and corresponding pull requests across $12$ popular Python repositories. Given a codebase along with a description of an issue to be resolved, a language model is tasked with editing the codebase to address the issue. Resolving issues in SWE-bench frequently requires understanding and coordinating changes across multiple functions, classes, and even files simultaneously, calling for models to interact with execution environments, process extremely long contexts and perform complex reasoning that goes far beyond traditional code generation tasks. Our evaluations show that both state-of-the-art proprietary models and our fine-tuned model SWE-Llama can resolve only the simplest issues. The best-performing model, Claude 2, is able to solve a mere $1.96$% of the issues. Advances on SWE-bench represent steps towards LMs that are more practical, intelligent, and autonomous.","external_url":"https://arxiv.org/abs/2310.06770","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T07:15:27.742382+00:00","pith_arxiv_id":"2310.06770","created_at":"2026-05-09T06:15:37.605874+00:00","updated_at":"2026-05-25T07:15:27.742382+00:00","title_quality_ok":true,"display_title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","render_title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?"},"hub":{"state":{"work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":257,"external_cited_by_count":null,"distinct_field_count":18,"first_pith_cited_at":"2023-05-02T05:46:48+00:00","last_pith_cited_at":"2026-05-22T01:18:39+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-27T08:17:33.554708+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":56},{"context_role":"dataset","n":10},{"context_role":"baseline","n":3},{"context_role":"method","n":3}],"polarity_counts":[{"context_polarity":"background","n":59},{"context_polarity":"use_dataset","n":6},{"context_polarity":"baseline","n":3},{"context_polarity":"use_method","n":3},{"context_polarity":"support","n":1}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","claims":[{"claim_text":"Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering problems drawn from real GitHub issues and corresponding pull requests across $12$ popular Python repositories. Given a codebase along with a description of an issue to be resolved, a ","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks SWE-bench: Can Language Models Resolve Real-World GitHub Issues? because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T23:33:56.189523+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"af0b1ab2-1ba3-4c41-a2ae-af68600d9140","orcid":null,"display_name":"Carlos E. Jimenez"},{"id":"7f1034d5-7758-4c81-addf-00ad64bdf933","orcid":null,"display_name":"John Yang"},{"id":"ccce1405-14be-4569-a696-5ab6460df1ea","orcid":null,"display_name":"Alexander Wettig"},{"id":"55ffc739-fb29-4c94-90fd-a74183bd62cf","orcid":null,"display_name":"Shunyu Yao"},{"id":"fb7620d1-d79e-4ca8-82cd-583be4738edb","orcid":null,"display_name":"Kexin Pei"},{"id":"f5def1ac-a050-43b1-8ab9-44fb276b2556","orcid":null,"display_name":"Ofir Press"}]},"error":null,"updated_at":"2026-05-13T23:33:57.109238+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-13T23:24:01.814025+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":43},{"title":"WebArena: A Realistic Web Environment for Building Autonomous Agents","work_id":"7058ffd2-a339-4102-89eb-248eeb074652","shared_citers":28},{"title":"AgentBench: Evaluating LLMs as Agents","work_id":"a37549b4-4c94-412d-acc4-4efeb08509be","shared_citers":26},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":23},{"title":"$\\tau$-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains","work_id":"6a8d8dc4-0cc0-4052-8109-abbcdcd4a962","shared_citers":20},{"title":"OpenHands: An Open Platform for AI Software Developers as Generalist Agents","work_id":"f1762ea0-e382-4f38-a28c-adc643789859","shared_citers":20},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":19},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":18},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":16},{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","shared_citers":16},{"title":"ReAct: Synergizing Reasoning and Acting in Language Models","work_id":"407a2351-25f1-497d-b611-f77d0292a8e6","shared_citers":15},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":14},{"title":"Terminal-Bench: Benchmarking Agents on Hard, Realistic Tasks in Command Line Interfaces","work_id":"0624be05-1d97-4fd6-8300-b04b8a3ab04b","shared_citers":13},{"title":"ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs","work_id":"3c555b48-a4d9-42dd-9fdd-0f6018fbe9cb","shared_citers":13},{"title":"Voyager: An Open-Ended Embodied Agent with Large Language Models","work_id":"ffe0d207-86cf-4742-a100-e988ac8b9676","shared_citers":11},{"title":"Agentless: Demystifying LLM-based Software Engineering Agents","work_id":"71c901c4-3c83-4e10-af54-3daef7fff397","shared_citers":10},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":10},{"title":"SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering","work_id":"01826cd9-a652-403c-a2ec-531da9fe2b6a","shared_citers":10},{"title":"SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?","work_id":"a561c78a-4b02-4053-a92a-bc5c7c5f6b9b","shared_citers":10},{"title":"Teaching Large Language Models to Self-Debug","work_id":"cdfb2680-220c-44eb-9edd-867b75fb821d","shared_citers":10},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":9},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":9},{"title":"Identifying the Risks of LM Agents with an LM-Emulated Sandbox","work_id":"3d4c3b66-d749-4939-b1bc-62b10b2ebbb6","shared_citers":9},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":9}],"time_series":[{"n":6,"year":2024},{"n":4,"year":2025},{"n":130,"year":2026}]},"error":null,"updated_at":"2026-05-13T23:34:10.264967+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"fixed":1,"items":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-13T23:24:06.373420+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","claims":[{"claim_text":"Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering problems drawn from real GitHub issues and corresponding pull requests across $12$ popular Python repositories. Given a codebase along with a description of an issue to be resolved, a ","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks SWE-bench: Can Language Models Resolve Real-World GitHub Issues? because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T23:24:06.385054+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","claims":[{"claim_text":"Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering problems drawn from real GitHub issues and corresponding pull requests across $12$ popular Python repositories. Given a codebase along with a description of an issue to be resolved, a ","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks SWE-bench: Can Language Models Resolve Real-World GitHub Issues? because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-13T23:23:58.042181+00:00"}},"summary":{"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","claims":[{"claim_text":"Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering problems drawn from real GitHub issues and corresponding pull requests across $12$ popular Python repositories. Given a codebase along with a description of an issue to be resolved, a ","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks SWE-bench: Can Language Models Resolve Real-World GitHub Issues? because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","shared_citers":43},{"title":"WebArena: A Realistic Web Environment for Building Autonomous Agents","work_id":"7058ffd2-a339-4102-89eb-248eeb074652","shared_citers":28},{"title":"AgentBench: Evaluating LLMs as Agents","work_id":"a37549b4-4c94-412d-acc4-4efeb08509be","shared_citers":26},{"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","shared_citers":23},{"title":"$\\tau$-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains","work_id":"6a8d8dc4-0cc0-4052-8109-abbcdcd4a962","shared_citers":20},{"title":"OpenHands: An Open Platform for AI Software Developers as Generalist Agents","work_id":"f1762ea0-e382-4f38-a28c-adc643789859","shared_citers":20},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":19},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":18},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":16},{"title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code","work_id":"ea9e51ce-1e75-4182-92d8-4d25f70d2ee4","shared_citers":16},{"title":"ReAct: Synergizing Reasoning and Acting in Language Models","work_id":"407a2351-25f1-497d-b611-f77d0292a8e6","shared_citers":15},{"title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models","work_id":"c5006563-f3ec-438a-9e35-b7b484f34828","shared_citers":14},{"title":"Terminal-Bench: Benchmarking Agents on Hard, Realistic Tasks in Command Line Interfaces","work_id":"0624be05-1d97-4fd6-8300-b04b8a3ab04b","shared_citers":13},{"title":"ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs","work_id":"3c555b48-a4d9-42dd-9fdd-0f6018fbe9cb","shared_citers":13},{"title":"Voyager: An Open-Ended Embodied Agent with Large Language Models","work_id":"ffe0d207-86cf-4742-a100-e988ac8b9676","shared_citers":11},{"title":"Agentless: Demystifying LLM-based Software Engineering Agents","work_id":"71c901c4-3c83-4e10-af54-3daef7fff397","shared_citers":10},{"title":"Proximal Policy Optimization Algorithms","work_id":"240c67fe-d14d-4520-91c1-38a4e272ca19","shared_citers":10},{"title":"SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering","work_id":"01826cd9-a652-403c-a2ec-531da9fe2b6a","shared_citers":10},{"title":"SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?","work_id":"a561c78a-4b02-4053-a92a-bc5c7c5f6b9b","shared_citers":10},{"title":"Teaching Large Language Models to Self-Debug","work_id":"cdfb2680-220c-44eb-9edd-867b75fb821d","shared_citers":10},{"title":"DAPO: An Open-Source LLM Reinforcement Learning System at Scale","work_id":"64019d00-0b11-4bbd-b173-b46c8fad0157","shared_citers":9},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":9},{"title":"Identifying the Risks of LM Agents with an LM-Emulated Sandbox","work_id":"3d4c3b66-d749-4939-b1bc-62b10b2ebbb6","shared_citers":9},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":9}],"time_series":[{"n":6,"year":2024},{"n":4,"year":2025},{"n":130,"year":2026}]},"authors":[{"id":"ccce1405-14be-4569-a696-5ab6460df1ea","orcid":null,"display_name":"Alexander Wettig","source":"manual","import_confidence":0.72},{"id":"af0b1ab2-1ba3-4c41-a2ae-af68600d9140","orcid":null,"display_name":"Carlos E. Jimenez","source":"manual","import_confidence":0.72},{"id":"7f1034d5-7758-4c81-addf-00ad64bdf933","orcid":null,"display_name":"John Yang","source":"manual","import_confidence":0.72},{"id":"fb7620d1-d79e-4ca8-82cd-583be4738edb","orcid":null,"display_name":"Kexin Pei","source":"manual","import_confidence":0.72},{"id":"f5def1ac-a050-43b1-8ab9-44fb276b2556","orcid":null,"display_name":"Ofir Press","source":"manual","import_confidence":0.72},{"id":"55ffc739-fb29-4c94-90fd-a74183bd62cf","orcid":null,"display_name":"Shunyu Yao","source":"manual","import_confidence":0.72}]}}