diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index d23065415..972a80480 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -78,7 +78,7 @@ def show_stats(dirnames):
             row.model = gpt4 + "\n" + row.model[len(gpt4) :]
 
         if row.model == "gpt-4\n-1106-preview":
-            row.model += "\n(partial run)"
+            row.model += "\n(preliminary)"
 
         if row.completed_tests < 133:
             print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
@@ -164,7 +164,7 @@ def plot_timing(df):
     ax.set_xticks([p + 0.5 * width for p in pos])
     ax.set_xticklabels(models)
 
-    ax.set_ylabel("Average duration (seconds)")
+    ax.set_ylabel("Average GPT response time\nper exercise (sec)")
     ax.set_title("GPT Code Editing - Timing")
     ax.legend(
         title="Edit Format",