diff --git a/_posts/2024-05-31-both-swe-bench.md b/_posts/2024-05-31-both-swe-bench.md index 22b28f020..774b38dad 100644 --- a/_posts/2024-05-31-both-swe-bench.md +++ b/_posts/2024-05-31-both-swe-bench.md @@ -239,7 +239,7 @@ We can see that Opus overrides them with plausible-but-incorrect solutions resulting in 0 resolved problems from that row. -Rows G-K we cover the cases where neither model +Rows G-K cover the cases where neither model produced plausible solutions. Which solution was ultimately selected for each problem depends on [details about which solution the harness considered "most plausible"](https://aider.chat/2024/05/22/swe-bench-lite.html#finding-a-plausible-solution). diff --git a/assets/swe_bench.jpg b/assets/swe_bench.jpg index cf3994d8a..85b84b8c9 100644 Binary files a/assets/swe_bench.jpg and b/assets/swe_bench.jpg differ diff --git a/assets/swe_bench.svg b/assets/swe_bench.svg index ffa620f2d..cb02c77e7 100644 --- a/assets/swe_bench.svg +++ b/assets/swe_bench.svg @@ -6,7 +6,7 @@ - 2024-06-01T11:46:22.003048 + 2024-06-01T14:47:44.878771 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -412,7 +412,7 @@ z - + @@ -583,7 +583,7 @@ z - + @@ -699,7 +699,7 @@ z - + @@ -894,7 +894,7 @@ z - + @@ -926,7 +926,7 @@ z - + @@ -1157,7 +1157,7 @@ z - + @@ -1339,16 +1339,16 @@ z +" clip-path="url(#pfeb5048445)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -1394,11 +1394,11 @@ z +" clip-path="url(#pfeb5048445)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1467,11 +1467,11 @@ z +" clip-path="url(#pfeb5048445)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1487,11 +1487,11 @@ L 690 242.500879 +" clip-path="url(#pfeb5048445)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1523,11 +1523,11 @@ z +" clip-path="url(#pfeb5048445)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1557,11 +1557,11 @@ z +" clip-path="url(#pfeb5048445)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1578,11 +1578,11 @@ L 690 144.756199 +" clip-path="url(#pfeb5048445)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1599,11 +1599,11 @@ L 690 112.174638 +" clip-path="url(#pfeb5048445)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1780,7 +1780,7 @@ L 170.425134 307.664 L 170.425134 170.821447 L 104.863636 170.821447 z -" clip-path="url(#p4aa384bc7b)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pfeb5048445)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pfeb5048445)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pfeb5048445)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pfeb5048445)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pfeb5048445)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#pfeb5048445)" style="fill: #1a75c2; opacity: 0.9"/> +" clip-path="url(#pfeb5048445)" style="fill: #1a75c2; opacity: 0.9"/> @@ -2356,7 +2356,7 @@ z - + diff --git a/benchmark/swe_bench.py b/benchmark/swe_bench.py index e0669937f..7e2ac9d81 100644 --- a/benchmark/swe_bench.py +++ b/benchmark/swe_bench.py @@ -54,7 +54,7 @@ def plot_swe_bench(data_file, is_lite): if is_lite: colors = ["#17965A" if "Aider" in model else "#b3d1e6" for model in models] else: - colors = ["#155F91" if "Aider" in model else "#b3d1e6" for model in models] + colors = ["#1A75C2" if "Aider" in model else "#b3d1e6" for model in models] bars = [] for model, pass_rate, color in zip(models, pass_rates, colors):