diff --git a/_posts/2024-05-31-both-swe-bench.md b/_posts/2024-05-31-both-swe-bench.md index 095ddedaa..29fec1851 100644 --- a/_posts/2024-05-31-both-swe-bench.md +++ b/_posts/2024-05-31-both-swe-bench.md @@ -32,7 +32,8 @@ for more details on the data presented in this chart. ## Interactive, not agentic Aider achieved this result mainly through its existing features that focus on static -code analysis, reliable LLM code editing, and pragmatic UX for AI pair programming. +code analysis, reliable LLM code editing, and pragmatic UX for automatically +fixing linting and testing errors. Aider intentionally has quite limited and narrow "agentic behavior" to avoid long delays, high token costs and the need for users to repeatedly code review incorrect solutions. @@ -203,20 +204,20 @@ The table below breaks down the benchmark outcome of each problem, showing whether aider with GPT-4o and with Opus produced plausible and/or correct solutions. -|Row|Aider
w/GPT-4o
solution
plausible?|Aider
w/GPT-4o
solution
resolved
issue?|Aider
w/Opus
solution
plausible?|Aider
w/Opus
solution
resolved
issue?|Number of
problems
with this
outcome| -|:--:|--:|--:|--:|--:|--:| -| A | plausible | resolved | n/a | n/a | 73 | -| B | plausible | not resolved | n/a | n/a | 181 | -| C | non-plausible | resolved | plausible | resolved | 1 | -| D | non-plausible | resolved | plausible | not resolved | 2 | -| E | non-plausible | resolved | non-plausible | resolved | 16 | -| F | non-plausible | resolved | non-plausible | not resolved | 5 | -| G | non-plausible | not resolved | non-plausible | resolved | 4 | -| H | non-plausible | not resolved | non-plausible | not resolved | 216 | -| I | non-plausible | not resolved | plausible | resolved | 12 | -| J | non-plausible | not resolved | plausible | not resolved | 53 | -| K | non-plausible | not resolved | n/a | n/a | 7 | -|Total|||||570| +|Row|Aider
w/GPT-4o
solution
plausible?|Aider
w/GPT-4o
solution
resolved
issue?|Aider
w/Opus
solution
plausible?|Aider
w/Opus
solution
resolved
issue?|Number of
problems
with this
outcome|Number of
problems
resolved| +|:--:|--:|--:|--:|--:|--:|--:| +| A | **plausible** | **resolved** | n/a | n/a | 73 | 73 | +| B | **plausible** | not resolved | n/a | n/a | 181 | 0 | +| C | non-plausible | **resolved** | **plausible** | **resolved** | 1 | 1 | +| D | non-plausible | **resolved** | **plausible** | not resolved | 2 | 0 | +| E | non-plausible | **resolved** | non-plausible | **resolved** | 16 | 16 | +| F | non-plausible | **resolved** | non-plausible | not resolved | 5 | 3 | +| G | non-plausible | not resolved | non-plausible | **resolved** | 4 | 2 | +| H | non-plausible | not resolved | non-plausible | not resolved | 216 | 0 | +| I | non-plausible | not resolved | **plausible** | **resolved** | 12 | 12 | +| J | non-plausible | not resolved | **plausible** | not resolved | 53 | 0 | +| K | non-plausible | not resolved | n/a | n/a | 7 | 0 | +|Total|||||570|107| Rows A-B show the cases where aider with GPT-4o found a plausible solution during the first attempt. diff --git a/assets/swe_bench.jpg b/assets/swe_bench.jpg index 73ce02c1a..4e75c4dd6 100644 Binary files a/assets/swe_bench.jpg and b/assets/swe_bench.jpg differ diff --git a/assets/swe_bench.svg b/assets/swe_bench.svg index 1ffb95c8f..78892a2a8 100644 --- a/assets/swe_bench.svg +++ b/assets/swe_bench.svg @@ -6,7 +6,7 @@ - 2024-06-01T07:33:17.926838 + 2024-06-01T11:25:56.978629 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -412,7 +412,7 @@ z - + @@ -583,7 +583,7 @@ z - + @@ -699,7 +699,7 @@ z - + @@ -894,7 +894,7 @@ z - + @@ -926,7 +926,7 @@ z - + @@ -1157,7 +1157,7 @@ z - + @@ -1339,16 +1339,16 @@ z +" clip-path="url(#pf392d01723)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -1394,11 +1394,11 @@ z +" clip-path="url(#pf392d01723)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1467,11 +1467,11 @@ z +" clip-path="url(#pf392d01723)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1487,11 +1487,11 @@ L 690 242.500879 +" clip-path="url(#pf392d01723)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1523,11 +1523,11 @@ z +" clip-path="url(#pf392d01723)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1557,11 +1557,11 @@ z +" clip-path="url(#pf392d01723)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1578,11 +1578,11 @@ L 690 144.756199 +" clip-path="url(#pf392d01723)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1599,11 +1599,11 @@ L 690 112.174638 +" clip-path="url(#pf392d01723)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1780,7 +1780,7 @@ L 170.425134 307.664 L 170.425134 170.821447 L 104.863636 170.821447 z -" clip-path="url(#p73b7253dcf)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pf392d01723)" style="fill: #b3e6a8; opacity: 0.3"/> +" clip-path="url(#pf392d01723)" style="fill: #b3e6a8; opacity: 0.3"/> +" clip-path="url(#pf392d01723)" style="fill: #b3e6a8; opacity: 0.3"/> +" clip-path="url(#pf392d01723)" style="fill: #b3e6a8; opacity: 0.3"/> +" clip-path="url(#pf392d01723)" style="fill: #b3e6a8; opacity: 0.3"/> +" clip-path="url(#pf392d01723)" style="fill: #155f91; opacity: 0.9"/> +" clip-path="url(#pf392d01723)" style="fill: #155f91; opacity: 0.9"/> @@ -2356,7 +2356,7 @@ z - + diff --git a/benchmark/swe_bench.py b/benchmark/swe_bench.py index edc513d3d..a1aec6c29 100644 --- a/benchmark/swe_bench.py +++ b/benchmark/swe_bench.py @@ -51,7 +51,11 @@ def plot_swe_bench(data_file, is_lite): spine.set_edgecolor("#DDDDDD") spine.set_linewidth(0.5) - colors = ["#17965A" if "Aider" in model else "#b3d1e6" for model in models] + if is_lite: + colors = ["#17965A" if "Aider" in model else "#b3d1e6" for model in models] + else: + colors = ["#155F91" if "Aider" in model else "#b3e6a8" for model in models] + bars = [] for model, pass_rate, color in zip(models, pass_rates, colors): alpha = 0.9 if "Aider" in model else 0.3