diff --git a/_posts/2024-05-31-both-swe-bench.md b/_posts/2024-05-31-both-swe-bench.md index 3cddc6e59..095ddedaa 100644 --- a/_posts/2024-05-31-both-swe-bench.md +++ b/_posts/2024-05-31-both-swe-bench.md @@ -23,7 +23,7 @@ that was reported recently. [![SWE Bench results](/assets/swe_bench.svg)](https://aider.chat/assets/swe_bench.svg) Aider was benchmarked on 570 of the 2294 SWE Bench problems. -These are the same +These were the same [randomly selected 570 problems](https://github.com/CognitionAI/devin-swebench-results/tree/main/output_diffs) that [Devin used in their evaluation](https://www.cognition.ai/post/swe-bench-technical-report). Please see the [references](#references) @@ -251,7 +251,7 @@ In these cases aider with Opus was unable to produce any solutions. ## Computing the benchmark score -Benchmarking produced one proposed solution for each of +The benchmark harness produced one proposed solution for each of the 570 SWE Bench problems. A separate evaluation script was used to diff --git a/assets/swe_bench.jpg b/assets/swe_bench.jpg index 1796f2720..4ce2881c3 100644 Binary files a/assets/swe_bench.jpg and b/assets/swe_bench.jpg differ diff --git a/assets/swe_bench.svg b/assets/swe_bench.svg index 8abdd70a8..ffd8dbe1d 100644 --- a/assets/swe_bench.svg +++ b/assets/swe_bench.svg @@ -6,7 +6,7 @@ - 2024-05-31T11:41:49.017547 + 2024-06-01T07:02:59.687095 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -527,7 +527,7 @@ z - + @@ -707,7 +707,7 @@ z - + @@ -813,7 +813,7 @@ z - + @@ -1017,7 +1017,7 @@ z - + @@ -1120,7 +1120,7 @@ z - + @@ -1154,7 +1154,7 @@ z - + @@ -1247,16 +1247,16 @@ z +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -1281,11 +1281,11 @@ z +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1301,11 +1301,11 @@ L 690 274.534192 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 
0.2; stroke-linecap: square"/> - + @@ -1321,11 +1321,11 @@ L 690 242.032134 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1341,11 +1341,11 @@ L 690 209.530076 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1375,11 +1375,11 @@ z +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1396,11 +1396,11 @@ L 690 144.52596 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1417,11 +1417,11 @@ L 690 112.023902 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1568,7 +1568,7 @@ L 170.425134 307.03625 L 170.425134 170.527606 L 104.863636 170.527606 z -" clip-path="url(#p1ec2c53f8e)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #17965a; opacity: 0.9"/> +" clip-path="url(#p22faac38c8)" style="fill: #17965a; opacity: 0.9"/> @@ -2006,60 +2006,9 @@ z - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + diff --git a/benchmark/swe_bench_lite.py b/benchmark/swe_bench_lite.py index fe9489cb5..2151cf53c 100644 --- a/benchmark/swe_bench_lite.py +++ 
b/benchmark/swe_bench_lite.py @@ -6,7 +6,7 @@ from imgcat import imgcat from matplotlib import rc -def plot_swe_bench_lite(data_file): +def plot_swe_bench(data_file, is_lite): with open(data_file, "r") as file: lines = file.readlines() @@ -45,7 +45,7 @@ def plot_swe_bench_lite(data_file): for model, pass_rate, color in zip(models, pass_rates, colors): alpha = 0.9 if "Aider" in model else 0.3 hatch = "" - # if "lite" not in data_file: + # if not is_lite: # hatch = "///" if "(570)" in model else "" bar = ax.bar(model, pass_rate, color=color, alpha=alpha, zorder=3, hatch=hatch) bars.append(bar[0]) @@ -69,7 +69,7 @@ def plot_swe_bench_lite(data_file): # ax.set_xlabel("Models", fontsize=18) ax.set_ylabel("Instances resolved (%)", fontsize=18, color=font_color) - if "lite" in data_file: + if is_lite: title = "SWE Bench Lite" else: title = "SWE Bench" @@ -80,21 +80,22 @@ def plot_swe_bench_lite(data_file): color=font_color, ) - # Add note at the bottom of the graph - note = ( - "Note: (570) and (2294) refer to the number of SWE Bench instances that were benchmarked." - ) - plt.figtext( - 0.5, - 0.05, - note, - wrap=True, - horizontalalignment="center", - fontsize=12, - color=font_color, - ) + if is_lite: + plt.tight_layout(pad=3.0) + else: + # Add note at the bottom of the graph + note = "(570) and (2294) denote the number of SWE Bench instances benchmarked" + plt.figtext( + 0.5, + 0.05, + note, + wrap=True, + horizontalalignment="center", + fontsize=12, + color=font_color, + ) - plt.tight_layout(pad=3.0, rect=[0, 0.05, 1, 1]) + plt.tight_layout(pad=3.0, rect=[0, 0.05, 1, 1]) out_fname = Path(data_file.replace("-", "_")) plt.savefig(out_fname.with_suffix(".jpg").name) @@ -104,4 +105,6 @@ def plot_swe_bench_lite(data_file): fname = sys.argv[1] -plot_swe_bench_lite(fname) +is_lite = "lite" in fname + +plot_swe_bench(fname, is_lite)