diff --git a/_posts/2024-05-22-swe-bench-lite.md b/_posts/2024-05-22-swe-bench-lite.md
index 9aee4e78c..d6f51e69a 100644
--- a/_posts/2024-05-22-swe-bench-lite.md
+++ b/_posts/2024-05-22-swe-bench-lite.md
@@ -14,7 +14,7 @@ achieving a state-of-the-art result.
 The current top leaderboard entry is 20.3%
 from Amazon Q Developer Agent.
 The best result reported elsewhere seems to be
-[22.3% from AutoCodeRover](https://github.com/nus-apr/auto-code-rover).
+[25% from OpenDevin](https://x.com/gneubig/status/1791498953709752405).
 
 [![SWE Bench Lite results](/assets/swe_bench_lite.svg)](https://aider.chat/assets/swe_bench_lite.svg)
 
@@ -89,7 +89,7 @@ or if the AI starts going down a wrong path.
 Running the benchmark harness
 only using aider with GPT-4o to find plausible solutions
 achieved a score of 25.0%.
-This was itself a state-of-the-art result, before being surpassed by the main
+This itself matched the state-of-the-art, before being surpassed by the main
 result being reported here that used aider with both GPT-4o & Opus.
 
 
@@ -400,8 +400,8 @@ making it faster, easier, and more reliable to run the acceptance tests.
 Below are the references for the SWE-Bench Lite results
 displayed in the graph at the top of this page.
 
+- 25.0% OpenDevin https://x.com/gneubig/status/1791498953709752405
 - 22.3% AutoCodeRover https://github.com/nus-apr/auto-code-rover
-- 21.0% OpenDevin https://github.com/OpenDevin/OpenDevin
 - 20.3% Amazon Q Developer Agent (v20240430-dev) https://www.swebench.com
 - 18.0% SWE-Agent + GPT-4 https://www.swebench.com
 - 11.7% SWE-Agent + Opus https://www.swebench.com
diff --git a/assets/swe_bench_lite.jpg b/assets/swe_bench_lite.jpg
index 40adf98fd..1b7d106f2 100644
Binary files a/assets/swe_bench_lite.jpg and b/assets/swe_bench_lite.jpg differ
diff --git a/assets/swe_bench_lite.svg b/assets/swe_bench_lite.svg
index 0e67796c9..c27d32699 100644
--- a/assets/swe_bench_lite.svg
+++ b/assets/swe_bench_lite.svg
@@ -6,7 +6,7 @@
-      2024-05-23T13:12:59.895266
+      2024-05-25T12:13:05.168797
   image/svg+xml
[… remaining SVG hunks: regenerated chart geometry only (clip-path id p0eec7a4844 renamed to pbc28a2e89c; bar and label paths repositioned for the updated results); no readable text changes …]
clip-path="url(#pbc28a2e89c)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pbc28a2e89c)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pbc28a2e89c)" style="fill: #17965a; opacity: 0.6"/> +" clip-path="url(#pbc28a2e89c)" style="fill: #17965a; opacity: 0.6"/> @@ -1588,18 +1588,8 @@ z - - - - - - - - - - - + @@ -1607,6 +1597,16 @@ z + + + + + + + + + + @@ -1743,7 +1743,7 @@ z - + diff --git a/benchmark/swe-bench-lite.txt b/benchmark/swe-bench-lite.txt index 4dae1d9fe..ea071b69d 100644 --- a/benchmark/swe-bench-lite.txt +++ b/benchmark/swe-bench-lite.txt @@ -1,7 +1,7 @@ 26.3% Aider|GPT-4o|& Opus 25.0% Aider|GPT-4o +25.0% Open|Devin 22.3% AutoCode|Rover -21.0% Open|Devin 20.3% Amazon Q|Developer|Agent 18.0% SWE-|Agent|+ GPT-4 11.7% SWE-|Agent|+ Opus