From 976fc7a83677f18df1af72e85cb5dde13f301bdf Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Mon, 6 Nov 2023 18:26:02 -0800 Subject: [PATCH] update benchmarking script --- benchmark/benchmark.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 72f007c49..d445ac59f 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -43,7 +43,7 @@ def show_stats(dirnames): row = summarize_results(dirname) raw_rows.append(row) - return + # return repeats = [] seen = dict() @@ -54,6 +54,10 @@ def show_stats(dirnames): if row.model == "gpt-3.5-turbo": row.model = "gpt-3.5-turbo-0613" + + if row.model == "gpt-4": + row.model = "gpt-4-0613" + if row.edit_format == "diff-func-string": row.edit_format = "diff-func" @@ -65,10 +69,16 @@ def show_stats(dirnames): # remember this row, so we can update it with the repeat_avg repeat_row = len(rows) - pieces = row.model.split("-") - row.model = "-".join(pieces[:3]) - if pieces[3:]: - row.model += "\n-" + "-".join(pieces[3:]) + gpt35 = "gpt-3.5-turbo" + gpt4 = "gpt-4" + + if row.model.startswith(gpt35): + row.model = gpt35 + "\n" + row.model[len(gpt35) :] + elif row.model.startswith(gpt4): + row.model = gpt4 + "\n" + row.model[len(gpt4) :] + + if row.model == "gpt-4\n-1106-preview": + row.model += "\n(partial run)" if row.completed_tests < 133: print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}") @@ -108,6 +118,8 @@ def show_stats(dirnames): df = pd.DataFrame.from_records(rows) df.sort_values(by=["model", "edit_format"], inplace=True) + dump(df) + tries = [df.groupby(["model", "edit_format"])["pass_rate_2"].mean()] if True: tries += [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()] @@ -171,22 +183,22 @@ def show_stats(dirnames): markeredgewidth=1, ) - ax.set_xticks([p + 1.5 * width for p in pos]) + ax.set_xticks([p + 0.5 * width for p in pos]) ax.set_xticklabels(models) top = 95 ax.annotate( - "First attempt,\nbased on\ninstructions", - xy=(2.9, 51), - xytext=(2.5, top), + "First attempt,\nbased on\nnatural language\ninstructions", + xy=(2.25, 40), + xytext=(2, top), horizontalalignment="center", verticalalignment="top", arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"}, ) ax.annotate( "Second attempt,\nbased on\nunit test errors", - xy=(3.1, 68), - xytext=(4.25, top), + xy=(2.55, 58), + xytext=(3.5, top), horizontalalignment="center", verticalalignment="top", arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},