From f5887a5098d6fee39781568a6711c67d723d76b8 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Sat, 3 Feb 2024 08:25:19 -0800 Subject: [PATCH] tweaking graph labels --- benchmark/benchmark.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index a8f97fa4a..0756411a8 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -79,8 +79,8 @@ def show_stats(dirnames, graphs): if "folk" in row.dir_name: row.edit_format += "folk" - if row.model == "gpt-4-0613": - row.model += "\n(8k context window is\ntoo small for benchmark)" + # if row.model == "gpt-4-0613": + # row.model += "\n(8k context window is\ntoo small for benchmark)" if row.completed_tests < 89: print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}") @@ -126,8 +126,8 @@ def show_stats(dirnames, graphs): # dump(df) if graphs: # plot_timing(df) - # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg) - plot_refactoring(df) + plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg) + # plot_refactoring(df) def plot_timing(df): @@ -249,7 +249,13 @@ def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg): ) ax.set_xticks([p + 0.5 * width for p in pos]) - ax.set_xticklabels(models) + model_labels = [] + for model in models: + pieces = model.split("-") + ml = "-".join(pieces[:2]) + "-\n" + "-".join(pieces[2:]) + model_labels.append(ml) + + ax.set_xticklabels(model_labels) top = 95 ax.annotate(