diff --git a/_posts/2024-05-22-swe-bench-lite.md b/_posts/2024-05-22-swe-bench-lite.md index 44260d3ed..8033bf417 100644 --- a/_posts/2024-05-22-swe-bench-lite.md +++ b/_posts/2024-05-22-swe-bench-lite.md @@ -12,11 +12,17 @@ on the achieving a state-of-the-art result. The current top leaderboard entry is 20.3% from Amazon Q Developer Agent. -The best result reported elsewhere seems to be -[25% from OpenDevin](https://x.com/gneubig/status/1791498953709752405). [![SWE Bench Lite results](/assets/swe_bench_lite.svg)](https://aider.chat/assets/swe_bench_lite.svg) +Please see the [references](#references) +for details on the data presented in this chart. +It was updated 5/30/24 to reflect apples-to-apples comparisons, +using pass@1 results from AutoCodeRover +and results from OpenDevin that don't use hints. +The [official SWE Bench Lite leaderboard](https://www.swebench.com) +only accepts pass@1 results that do not use hints. + ## Interactive, not agentic Aider achieved this result mainly through its existing features that focus on static code analysis, reliable LLM code editing, and pragmatic UX for AI pair programming. @@ -397,14 +403,33 @@ making it faster, easier, and more reliable to run the acceptance tests. ## References Below are the references for the SWE-Bench Lite results -displayed in the graph at the top of this page. +displayed in the graph at the beginning of this article. -- [25.0% OpenDevin](https://x.com/gneubig/status/1791498953709752405) -- [19.0% AutoCodeRover](https://github.com/swe-bench/experiments/pull/11) - [20.3% Amazon Q Developer Agent (v20240430-dev)](https://www.swebench.com) +- [19.0% AutoCodeRover](https://github.com/swe-bench/experiments/pull/11) - [18.0% SWE-Agent + GPT-4](https://www.swebench.com) +- [16.7% OpenDevin](https://github.com/OpenDevin/OpenDevin/issues/2149) - [11.7% SWE-Agent + Opus](https://www.swebench.com) -Note: Graph updated on 5/30/24 to accurately reflect AutoCodeRover's pass@1 results. 
-The previous graph contained their pass@3 result, which is not comparable -to the aider results being reported here. +Note: the graph was updated on 5/30/24 as follows. + +The graph now contains AutoCodeRover's pass@1 results. +Previously, it reported the pass@3 results, which are +not comparable +to the pass@1 aider results being reported here. +The [AutoCodeRover GitHub page](https://github.com/nus-apr/auto-code-rover) +features the pass@3 results +without clearly labeling them as such. + +The graph now contains the best OpenDevin results obtained without using +the `hints_text` to provide hints to the agent. +The previous graph contained their hinted result, +which is not comparable +to the unhinted aider results being reported here. +OpenDevin's [hinted result was reported](https://x.com/gneubig/status/1791498953709752405) +without noting that hints were used. + +The [official SWE Bench Lite leaderboard](https://www.swebench.com) +only accepts pass@1 results that do not use `hints_text`. 
+ + diff --git a/assets/swe_bench_lite.jpg b/assets/swe_bench_lite.jpg index e695a94e3..37cfd3cda 100644 Binary files a/assets/swe_bench_lite.jpg and b/assets/swe_bench_lite.jpg differ diff --git a/assets/swe_bench_lite.svg b/assets/swe_bench_lite.svg index ae6934b48..ff7a2ec84 100644 --- a/assets/swe_bench_lite.svg +++ b/assets/swe_bench_lite.svg @@ -6,7 +6,7 @@ - 2024-05-30T09:44:47.592823 + 2024-05-30T15:26:12.767905 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -412,19 +412,89 @@ z - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + @@ -432,7 +502,7 @@ z - + - + + - + - + - + - + - - + + - + - + - + - + - - + @@ -886,49 +925,10 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -954,7 +954,7 @@ z - + @@ -1039,16 +1039,16 @@ z +" clip-path="url(#pec64ca441b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -1083,18 +1083,18 @@ z - + - + - + - + - + - + - + - + - + @@ -1180,18 +1180,18 @@ L 690 161.676713 - + - + - + - + - + - + @@ -1404,62 +1404,62 @@ L 690 50.4 +" clip-path="url(#pec64ca441b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pec64ca441b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pec64ca441b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pec64ca441b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pec64ca441b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pec64ca441b)" style="fill: #17965a; opacity: 0.9"/> +" clip-path="url(#pec64ca441b)" style="fill: #17965a; opacity: 0.9"/> - + + + + + + + + + + + + + + - + - + - + - + - + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + - + - + - - - - - + + + + + @@ -1775,7 +1963,7 @@ z - + diff --git a/benchmark/swe-bench-lite.txt b/benchmark/swe-bench-lite.txt index b73faad22..c49c25187 100644 --- a/benchmark/swe-bench-lite.txt +++ b/benchmark/swe-bench-lite.txt @@ -1,7 +1,7 @@ 26.3% Aider|GPT-4o|& Opus 25.0% Aider|GPT-4o -25.0% Open|Devin 20.3% Amazon 
Q|Developer|Agent 19.0% AutoCode|Rover 18.0% SWE-|Agent|+ GPT-4 +16.7% Open|Devin 11.7% SWE-|Agent|+ Opus diff --git a/benchmark/swe_bench_lite.py b/benchmark/swe_bench_lite.py index afa76c1d4..a3bd04820 100644 --- a/benchmark/swe_bench_lite.py +++ b/benchmark/swe_bench_lite.py @@ -22,7 +22,13 @@ def plot_swe_bench_lite(data_file): plt.rcParams["hatch.color"] = "#444444" font_color = "#555" - rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10}) + font_params = { + "family": "sans-serif", + "sans-serif": ["Helvetica"], + "size": 10, + "weight": "bold", + } + rc("font", **font_params) plt.rcParams["text.color"] = font_color fig, ax = plt.subplots(figsize=(10, 5)) @@ -34,28 +40,31 @@ def plot_swe_bench_lite(data_file): colors = ["#17965A" if "Aider" in model else "#b3d1e6" for model in models] bars = [] for model, pass_rate, color in zip(models, pass_rates, colors): - alpha = 0.6 if "Aider" in model else 0.3 + alpha = 0.9 if "Aider" in model else 0.3 bar = ax.bar(model, pass_rate, color=color, alpha=alpha, zorder=3) bars.append(bar[0]) for model, bar in zip(models, bars): yval = bar.get_height() - y = yval + 0.75 if "Aider" in model else yval - 1.25 - va = "bottom" if "Aider" in model else "top" - + y = yval - 1.25 + va = "top" + color = "#eee" if "Aider" in model else "#555" + fontfamily = "Helvetica Bold" if "Aider" in model else "Helvetica" ax.text( bar.get_x() + bar.get_width() / 2, y, f"{yval}%", ha="center", va=va, - fontsize=14, + fontsize=16, + color=color, + fontfamily=fontfamily, ) # ax.set_xlabel("Models", fontsize=18) ax.set_ylabel("Instances resolved (%)", fontsize=18, color=font_color) ax.set_title("SWE Bench Lite", fontsize=20) - ax.set_ylim(0, 29.9) + # ax.set_ylim(0, 29.9) plt.xticks( fontsize=16, color=font_color,