From f5887a5098d6fee39781568a6711c67d723d76b8 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Sat, 3 Feb 2024 08:25:19 -0800
Subject: [PATCH] tweaking graph labels

---
 benchmark/benchmark.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index a8f97fa4a..0756411a8 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -79,8 +79,8 @@ def show_stats(dirnames, graphs):
         if "folk" in row.dir_name:
             row.edit_format += "folk"
 
-        if row.model == "gpt-4-0613":
-            row.model += "\n(8k context window is\ntoo small for benchmark)"
+        # if row.model == "gpt-4-0613":
+        #    row.model += "\n(8k context window is\ntoo small for benchmark)"
 
         if row.completed_tests < 89:
             print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
@@ -126,8 +126,8 @@ def show_stats(dirnames, graphs):
     # dump(df)
     if graphs:
         # plot_timing(df)
-        # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
-        plot_refactoring(df)
+        plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
+        # plot_refactoring(df)
 
 
 def plot_timing(df):
@@ -249,7 +249,13 @@ def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
         )
 
     ax.set_xticks([p + 0.5 * width for p in pos])
-    ax.set_xticklabels(models)
+    model_labels = []
+    for model in models:
+        pieces = model.split("-")
+        ml = "-".join(pieces[:2]) + "-\n" + "-".join(pieces[2:])
+        model_labels.append(ml)
+
+    ax.set_xticklabels(model_labels)
 
     top = 95
     ax.annotate(