diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 39e8d23a1..d35875a95 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -50,6 +50,8 @@ def show_stats(dirnames):
 
         if row.model == "gpt-3.5-turbo":
             row.model = "gpt-3.5-turbo-0613"
+        if row.edit_format == "diff-func-string":
+            row.edit_format = "diff-func"
 
         if row.completed_tests < 133:
             print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
@@ -69,12 +71,15 @@ def show_stats(dirnames):
     # df_grouped1 = df.groupby(["model", "edit_format"])["pass_rate_1"].mean()
     df_grouped2 = df.groupby(["model", "edit_format"])["pass_rate_2"].mean()
 
-    dump(df_grouped2.unstack())
-
     plt.rcParams["hatch.linewidth"] = 0.5
     plt.rcParams["hatch.color"] = "#444444"
 
-    fig, ax = plt.subplots(figsize=(10, 6))
+    from matplotlib import rc
+
+    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 18})
+
+    fig, ax = plt.subplots(figsize=(12, 8))
+    ax.grid(axis="y", zorder=0, lw=0.2)
     """
     colors = ["#007BFF", "#89CFF0", "#008000", "#32CD32"]
     df_grouped2.unstack().plot(
@@ -85,8 +90,7 @@ def show_stats(dirnames):
     """
     df = df_grouped2.unstack()
     num_models, num_formats = df.shape
-    dump(num_models)
-    dump(num_formats)
+
     pos = np.array(range(num_models))
     width = 0.8 / num_formats
 
@@ -94,20 +98,33 @@ def show_stats(dirnames):
     models = df.index
 
     for i, fmt in enumerate(formats):
-        dump(df[fmt])
         color = "#b3e6a8" if "diff" in fmt else "#b3d1e6"
         hatch = "///" if "func" in fmt else ""
-        ax.bar(pos + i * width, df[fmt], width * 0.95, label=fmt, color=color, hatch=hatch)
+        rects = ax.bar(
+            pos + i * width,
+            df[fmt],
+            width * 0.95,
+            label=fmt,
+            color=color,
+            hatch=hatch,
+            zorder=3,
+        )
+        ax.bar_label(rects, padding=2, labels=[f"{v:.0f}%" for v in df[fmt]], size=12)
 
     ax.set_xticks([p + 1.5 * width for p in pos])
-    ax.set_xticklabels(models)
+    ax.set_xticklabels(models, rotation=45)
 
     ax.set_ylabel("Percent of passed unittests")
     # ax.set_xlabel("Model")
     ax.set_title("Code editing success rate by model & edit format")
-    ax.legend(title="Edit Format")
-
+    ax.legend(
+        title="Edit Format",
+        loc="upper left",
+        # bbox_to_anchor=(0.95, 0.95),
+    )
+    ax.set_ylim(top=100)
     plt.tight_layout()
+    plt.savefig("tmp.svg")
     imgcat(fig)
 
     # df.to_csv("tmp.benchmarks.csv")