diff --git a/assets/swe_bench.jpg b/assets/swe_bench.jpg index f5aaf8ebc..73ce02c1a 100644 Binary files a/assets/swe_bench.jpg and b/assets/swe_bench.jpg differ diff --git a/assets/swe_bench.svg b/assets/swe_bench.svg index def4a99e8..1ffb95c8f 100644 --- a/assets/swe_bench.svg +++ b/assets/swe_bench.svg @@ -1,12 +1,12 @@ - + - 2024-06-01T07:22:29.978035 + 2024-06-01T07:33:17.926838 image/svg+xml @@ -21,8 +21,8 @@ - - - - + - + - + - + - - - - - - - - - - - - - - - - - + - + - + - + - - - - - - - - - - + - + - + @@ -727,7 +603,7 @@ z - + + @@ -799,26 +694,17 @@ z - - - - - - - - - - + - + - + - + - - - - - - - - - - + - + - - - - - - - - - - - - - - + - + - + - - - - - - - - - - - - - - - - + - + @@ -1456,7 +1170,7 @@ z - + @@ -1465,7 +1179,7 @@ z - + - - - - - - - - - + - - + - + + - + - + - + + + + + @@ -1685,18 +1465,18 @@ L 690 272.001248 - + - + - + @@ -1705,18 +1485,34 @@ L 690 239.866496 - + - + - + + + + @@ -1725,18 +1521,18 @@ L 690 207.731745 - + - + - + - + - + - + @@ -1780,18 +1576,18 @@ L 690 143.462241 - + - + - + @@ -1801,18 +1597,18 @@ L 690 111.327489 - + - + - + @@ -1822,7 +1618,7 @@ L 690 79.192738 - + + + @@ -1929,18 +1755,18 @@ z - - - @@ -1949,64 +1775,64 @@ L 690 50.4 " style="fill: none; stroke: #dddddd; stroke-width: 0.5; stroke-linejoin: miter; stroke-linecap: square"/> - +" clip-path="url(#p73b7253dcf)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p73b7253dcf)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p73b7253dcf)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p73b7253dcf)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p73b7253dcf)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p73b7253dcf)" style="fill: #17965a; opacity: 0.9"/> - +" clip-path="url(#p73b7253dcf)" style="fill: #17965a; opacity: 0.9"/> - + @@ -2016,7 +1842,7 @@ z - + - + @@ -2068,7 +1894,7 @@ z - + - + + + + @@ -2163,7 +2021,7 @@ z - + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2360,149 +2354,10 @@ z - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + diff --git a/assets/swe_bench_lite.jpg b/assets/swe_bench_lite.jpg index 0286d6134..23032d3b8 100644 Binary files a/assets/swe_bench_lite.jpg and b/assets/swe_bench_lite.jpg differ diff --git a/assets/swe_bench_lite.svg b/assets/swe_bench_lite.svg index b3dc7cdfb..4317ae672 100644 --- a/assets/swe_bench_lite.svg +++ b/assets/swe_bench_lite.svg @@ -1,12 +1,12 @@ - + - 2024-06-01T07:22:45.471733 + 2024-06-01T07:33:14.155906 image/svg+xml @@ -21,8 +21,8 @@ - - - - + - + - + - + - + - + - + - + - + - + @@ -502,7 +502,7 @@ z - + - + - + - + - + - + - + - + @@ -928,12 +928,12 @@ z - + - + - + - + - + @@ -1172,7 +1172,7 @@ z - + @@ -1181,7 +1181,7 @@ z - + - + - - + - + - + - + - + - + - + - + - + - + - + @@ -1482,18 +1482,18 @@ L 690 184.367965 - + - + - + - + - + - + @@ -1547,7 +1547,7 @@ L 690 78.170609 - + - - - @@ -1704,64 +1704,64 @@ L 690 50.4 " style="fill: none; stroke: #dddddd; stroke-width: 0.5; stroke-linejoin: miter; stroke-linecap: square"/> - +" clip-path="url(#p64bcd2b177)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p64bcd2b177)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p64bcd2b177)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p64bcd2b177)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p64bcd2b177)" style="fill: #b3d1e6; opacity: 0.3"/> - +" clip-path="url(#p64bcd2b177)" style="fill: #17965a; opacity: 0.9"/> - +" clip-path="url(#p64bcd2b177)" style="fill: #17965a; opacity: 0.9"/> - + - + - + - + - + - + - + - + + diff --git a/benchmark/swe_bench.py b/benchmark/swe_bench.py index 52b0dcf82..edc513d3d 100644 --- a/benchmark/swe_bench.py +++ b/benchmark/swe_bench.py @@ -14,15 +14,24 @@ def plot_swe_bench(data_file, is_lite): models = [] pass_rates = [] - + instances = [] for line in lines: if line.strip(): pass_rate, model = line.split("%") model = model.strip() + if "(" in model: + pieces = model.split("(") + model = pieces[0] + ins = pieces[1].strip(")") + else: + ins = None + instances.insert(0, ins) model = model.replace("|", "\n") models.insert(0, model.strip()) pass_rates.insert(0, float(pass_rate.strip())) + dump(instances) + plt.rcParams["hatch.linewidth"] = 0.5 plt.rcParams["hatch.color"] = "#444444" @@ -36,7 +45,7 @@ def plot_swe_bench(data_file, is_lite): rc("font", **font_params) plt.rcParams["text.color"] = font_color - fig, ax = plt.subplots(figsize=(10, 6)) + fig, ax = plt.subplots(figsize=(10, 5.5)) ax.grid(axis="y", zorder=0, lw=0.2) for spine in ax.spines.values(): spine.set_edgecolor("#DDDDDD") @@ -73,6 +82,23 @@ def plot_swe_bench(data_file, is_lite): fontfamily=fontfamily, ) + for model, ins, bar in zip(models, instances, bars): + if not ins: + continue + yval = bar.get_height() + y = yval - 2.5 + va = "top" + color = "#eee" if "Aider" in model else "#555" + ax.text( + bar.get_x() + bar.get_width() / 2, + y, + f"of {ins}", + ha="center", + va=va, + fontsize=12, + color=color, + ) + # ax.set_xlabel("Models", fontsize=18) ax.set_ylabel("Instances resolved (%)", fontsize=18, color=font_color) if is_lite: @@ -86,22 +112,7 @@ def plot_swe_bench(data_file, is_lite): color=font_color, ) - if is_lite: - plt.tight_layout(pad=3.0) - else: - # Add note at the bottom of the graph - note = "(570) and (2294) denote the number of SWE Bench instances benchmarked" - plt.figtext( - 0.5, - 0.05, - note, - wrap=True, - horizontalalignment="center", - fontsize=12, - color=font_color, - ) - - plt.tight_layout(pad=3.0, rect=[0, 0.05, 1, 1]) + plt.tight_layout(pad=3.0) out_fname = Path(data_file.replace("-", "_")) plt.savefig(out_fname.with_suffix(".jpg").name)