diff --git a/_posts/2024-05-31-both-swe-bench.md b/_posts/2024-05-31-both-swe-bench.md index 29fec1851..7ea6cf371 100644 --- a/_posts/2024-05-31-both-swe-bench.md +++ b/_posts/2024-05-31-both-swe-bench.md @@ -22,10 +22,10 @@ that was reported recently. [![SWE Bench results](/assets/swe_bench.svg)](https://aider.chat/assets/swe_bench.svg) -Aider was benchmarked on 570 of the 2294 SWE Bench problems. -These were the same -[randomly selected 570 problems](https://github.com/CognitionAI/devin-swebench-results/tree/main/output_diffs) that -[Devin used in their evaluation](https://www.cognition.ai/post/swe-bench-technical-report). +Aider was benchmarked on the same +[random 570](https://github.com/CognitionAI/devin-swebench-results/tree/main/output_diffs) +of the 2294 SWE Bench problems that were used in the +[Devin evaluation](https://www.cognition.ai/post/swe-bench-technical-report). Please see the [references](#references) for more details on the data presented in this chart. diff --git a/assets/swe_bench.jpg b/assets/swe_bench.jpg index 4e75c4dd6..cf3994d8a 100644 Binary files a/assets/swe_bench.jpg and b/assets/swe_bench.jpg differ diff --git a/assets/swe_bench.svg b/assets/swe_bench.svg index 78892a2a8..ffa620f2d 100644 --- a/assets/swe_bench.svg +++ b/assets/swe_bench.svg @@ -6,7 +6,7 @@ - 2024-06-01T11:25:56.978629 + 2024-06-01T11:46:22.003048 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -412,7 +412,7 @@ z - + @@ -583,7 +583,7 @@ z - + @@ -699,7 +699,7 @@ z - + @@ -894,7 +894,7 @@ z - + @@ -926,7 +926,7 @@ z - + @@ -1157,7 +1157,7 @@ z - + @@ -1339,16 +1339,16 @@ z +" clip-path="url(#p4aa384bc7b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -1394,11 +1394,11 @@ z +" clip-path="url(#p4aa384bc7b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1467,11 +1467,11 @@ z +" clip-path="url(#p4aa384bc7b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1487,11 +1487,11 @@ L 690 242.500879 +" clip-path="url(#p4aa384bc7b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1523,11 +1523,11 @@ z +" clip-path="url(#p4aa384bc7b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1557,11 +1557,11 @@ z +" clip-path="url(#p4aa384bc7b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1578,11 +1578,11 @@ L 690 144.756199 +" clip-path="url(#p4aa384bc7b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1599,11 +1599,11 @@ L 690 112.174638 +" clip-path="url(#p4aa384bc7b)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1780,7 +1780,7 @@ L 170.425134 307.664 L 170.425134 170.821447 L 104.863636 170.821447 z -" clip-path="url(#pf392d01723)" style="fill: #b3e6a8; opacity: 0.3"/> +" clip-path="url(#p4aa384bc7b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p4aa384bc7b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p4aa384bc7b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p4aa384bc7b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p4aa384bc7b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p4aa384bc7b)" style="fill: #155f91; opacity: 0.9"/> +" clip-path="url(#p4aa384bc7b)" style="fill: #155f91; opacity: 0.9"/> @@ -2356,7 +2356,7 @@ z - + diff --git a/benchmark/swe_bench.py b/benchmark/swe_bench.py index a1aec6c29..e0669937f 100644 --- a/benchmark/swe_bench.py +++ b/benchmark/swe_bench.py @@ -54,7 +54,7 @@ def plot_swe_bench(data_file, is_lite): if is_lite: colors = ["#17965A" if "Aider" in model else "#b3d1e6" for model in models] else: - colors = ["#155F91" if "Aider" in model else "#b3e6a8" for model in models] + colors = ["#155F91" if "Aider" in model else "#b3d1e6" for model in models] bars = [] for model, pass_rate, color in zip(models, pass_rates, colors):