diff --git a/_posts/2024-05-22-swe-bench-lite.md b/_posts/2024-05-22-swe-bench-lite.md index e3a3048e3..cb89283fa 100644 --- a/_posts/2024-05-22-swe-bench-lite.md +++ b/_posts/2024-05-22-swe-bench-lite.md @@ -400,8 +400,9 @@ Below are the references for the SWE-Bench Lite results displayed in the graph at the top of this page. - [25.0% OpenDevin](https://x.com/gneubig/status/1791498953709752405) -- [22.3% AutoCodeRover](https://github.com/nus-apr/auto-code-rover) +- [19.0% AutoCodeRover](https://github.com/swe-bench/experiments/pull/11) - [20.3% Amazon Q Developer Agent (v20240430-dev)](https://www.swebench.com) - [18.0% SWE-Agent + GPT-4](https://www.swebench.com) - [11.7% SWE-Agent + Opus](https://www.swebench.com) +Note: Graph updated on 5/30/24 to accurately reflect AutoCodeRover's pass@1 results. \ No newline at end of file diff --git a/assets/swe_bench_lite.jpg b/assets/swe_bench_lite.jpg index 1b7d106f2..e695a94e3 100644 Binary files a/assets/swe_bench_lite.jpg and b/assets/swe_bench_lite.jpg differ diff --git a/assets/swe_bench_lite.svg b/assets/swe_bench_lite.svg index c27d32699..ae6934b48 100644 --- a/assets/swe_bench_lite.svg +++ b/assets/swe_bench_lite.svg @@ -6,7 +6,7 @@ - 2024-05-25T12:13:05.168797 + 2024-05-30T09:44:47.592823 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -412,7 +412,7 @@ z - + @@ -528,12 +528,178 @@ z - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - - + - - @@ -762,7 +877,7 @@ z - + @@ -771,125 +886,10 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -928,7 +928,7 @@ z - + @@ -954,7 +954,7 @@ z - + @@ -1039,16 +1039,16 @@ z +" clip-path="url(#pc190475179)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -1085,11 +1085,11 @@ z +" clip-path="url(#pc190475179)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1131,11 +1131,11 @@ z +" clip-path="url(#pc190475179)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1163,11 +1163,11 @@ z +" clip-path="url(#pc190475179)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1182,11 +1182,11 @@ L 690 161.676713 +" clip-path="url(#pc190475179)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1228,11 +1228,11 @@ z +" clip-path="url(#pc190475179)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1407,7 +1407,7 @@ L 163.368917 273.70025 L 163.368917 186.321891 L 96.917045 186.321891 z -" clip-path="url(#pbc28a2e89c)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pc190475179)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pc190475179)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pc190475179)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pc190475179)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pc190475179)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pc190475179)" style="fill: #17965a; opacity: 0.6"/> +" clip-path="url(#pc190475179)" style="fill: #17965a; opacity: 0.6"/> @@ -1542,8 +1542,50 @@ z + + + + + + + + + + + + + - + - - - - - - - - - - @@ -1743,7 +1775,7 @@ z - + diff --git a/benchmark/swe-bench-lite.txt b/benchmark/swe-bench-lite.txt index ea071b69d..b73faad22 100644 --- a/benchmark/swe-bench-lite.txt +++ b/benchmark/swe-bench-lite.txt @@ -1,7 +1,7 @@ 26.3% Aider|GPT-4o|& Opus 25.0% Aider|GPT-4o 25.0% Open|Devin -22.3% AutoCode|Rover 20.3% Amazon Q|Developer|Agent +19.0% AutoCode|Rover 18.0% SWE-|Agent|+ GPT-4 11.7% SWE-|Agent|+ Opus