refactored plots

2025-05-20 04:14:59 +00:00 · 2024-05-06 10:43:39 -07:00 · 2024-05-06 10:43:39 -07:00 · 5fb7a323ec
commit 5fb7a323ec
parent 3bb237bdc1
2 changed files with 418 additions and 415 deletions
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -16,12 +16,10 @@ from typing import List

 import git
 import lox
-import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
 import prompts
 import typer
-from imgcat import imgcat
+from plots import plot_refactoring
 from rich.console import Console

 from aider import models
@ -130,418 +128,6 @@ def show_stats(dirnames, graphs):
        plot_refactoring(df)


-def plot_timing(df):
-    """plot a graph showing the average duration of each (model, edit_format)"""
-    plt.rcParams["hatch.linewidth"] = 0.5
-    plt.rcParams["hatch.color"] = "#444444"
-
-    from matplotlib import rc
-
-    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
-
-    fig, ax = plt.subplots(figsize=(6, 4))
-    ax.grid(axis="y", zorder=0, lw=0.2)
-
-    zorder = 1
-    grouped = df.groupby(["model", "edit_format"])["avg_duration"].mean().unstack()
-    num_models, num_formats = grouped.shape
-
-    pos = np.array(range(num_models))
-    width = 0.8 / num_formats
-
-    formats = grouped.columns
-    models = grouped.index
-
-    for i, fmt in enumerate(formats):
-        edge = dict(edgecolor="#ffffff", linewidth=1.5)
-        color = "#b3e6a8" if "diff" in fmt else "#b3d1e6"
-        hatch = "////" if "func" in fmt else ""
-        rects = ax.bar(
-            pos + i * width,
-            grouped[fmt],
-            width * 0.95,
-            label=fmt,
-            color=color,
-            hatch=hatch,
-            zorder=zorder + 1,
-            **edge,
-        )
-        ax.bar_label(rects, padding=4, labels=[f"{v:.1f}s" for v in grouped[fmt]], size=6)
-
-    ax.set_xticks([p + 0.5 * width for p in pos])
-    ax.set_xticklabels(models)
-
-    ax.set_ylabel("Average GPT response time\nper exercise (sec)")
-    ax.set_title("GPT Code Editing Speed\n(time per coding task)")
-    ax.legend(
-        title="Edit Format",
-        loc="upper left",
-    )
-    ax.set_ylim(top=max(grouped.max()) * 1.1)  # Set y-axis limit to 10% more than the max value
-
-    plt.tight_layout()
-    plt.savefig("tmp_timing.svg")
-    imgcat(fig)
-
-
-def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
-    tries = [df.groupby(["model", "edit_format"])["pass_rate_2"].mean()]
-    if True:
-        tries += [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()]
-
-    plt.rcParams["hatch.linewidth"] = 0.5
-    plt.rcParams["hatch.color"] = "#444444"
-
-    from matplotlib import rc
-
-    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
-
-    fig, ax = plt.subplots(figsize=(6, 4))
-    ax.grid(axis="y", zorder=0, lw=0.2)
-
-    zorder = 1
-    for grouped in tries:
-        zorder += 1
-        df = grouped.unstack()
-        num_models, num_formats = df.shape
-
-        pos = np.array(range(num_models))
-        width = 0.8 / num_formats
-
-        formats = df.columns
-        models = df.index
-
-        for i, fmt in enumerate(formats):
-            if zorder > 1:
-                edge = dict(
-                    edgecolor="#ffffff",
-                    linewidth=1.5,
-                )
-            else:
-                edge = dict()
-            if zorder == 2:
-                edge["label"] = fmt
-
-            color = "#b3e6a8" if "diff" in fmt else "#b3d1e6"
-            hatch = "////" if "func" in fmt else ""
-            rects = ax.bar(
-                pos + i * width,
-                df[fmt],
-                width * 0.95,
-                color=color,
-                hatch=hatch,
-                zorder=zorder,
-                **edge,
-            )
-            if zorder == 2:
-                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)
-
-    if len(repeats):
-        ax.errorbar(
-            1.4,
-            repeat_avg,
-            yerr=[[repeat_lo], [repeat_hi]],
-            fmt="none",
-            zorder=5,
-            capsize=2.5,
-            elinewidth=1,
-            markeredgewidth=1,
-        )
-
-    ax.set_xticks([p + 0.5 * width for p in pos])
-    model_labels = []
-    for model in models:
-        pieces = model.split("-")
-        ml = "-".join(pieces[:2]) + "-\n" + "-".join(pieces[2:])
-        model_labels.append(ml)
-
-    ax.set_xticklabels(model_labels)
-
-    top = 95
-    ax.annotate(
-        "First attempt,\nbased on\nnatural language\ninstructions",
-        xy=(2.20, 41),
-        xytext=(2, top),
-        horizontalalignment="center",
-        verticalalignment="top",
-        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
-    )
-    ax.annotate(
-        "Second attempt,\nincluding unit test\nerror output",
-        xy=(2.55, 56),
-        xytext=(3.5, top),
-        horizontalalignment="center",
-        verticalalignment="top",
-        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
-    )
-
-    ax.set_ylabel("Percent of exercises completed successfully")
-    # ax.set_xlabel("Model")
-    ax.set_title("GPT Code Editing Skill\n(percent coding tasks correct)")
-    ax.legend(
-        title="Edit Format",
-        loc="upper left",
-        # bbox_to_anchor=(0.95, 0.95),
-    )
-    ax.set_ylim(top=100)
-
-    plt.tight_layout()
-    plt.savefig("tmp.svg")
-    imgcat(fig)
-
-    # df.to_csv("tmp.benchmarks.csv")
-
-
-def plot_outcomes_claude(df):
-    print(df)
-
-    # Fix wrong column label
-    df["model"] = df["model"].replace("gpt-4-0314", "gpt-4-0613")
-
-    tries = [
-        df[["model", "pass_rate_2"]],
-        df[["model", "pass_rate_1"]],
-    ]
-
-    plt.rcParams["hatch.linewidth"] = 0.5
-    plt.rcParams["hatch.color"] = "#444444"
-
-    from matplotlib import rc
-
-    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
-
-    fig, ax = plt.subplots(figsize=(6, 4))
-    ax.grid(axis="y", zorder=0, lw=0.2)
-
-    zorder = 1
-    for df in tries:
-        zorder += 1
-        print(df)
-
-        num_models, _ = df.shape
-        num_formats = 1
-
-        pos = np.array(range(num_models))
-        width = 0.6 / num_formats
-
-        if zorder > 1:
-            edge = dict(
-                edgecolor="#ffffff",
-                linewidth=1.5,
-            )
-        else:
-            edge = dict()
-        if zorder == 2:
-            edge["label"] = "??"
-
-        color = [
-            "#b3e6a8",
-            "#b3e6a8",
-            "#b3e6a8",
-            "#b3d1e6",
-        ]
-        hatch = [  # noqa: F841
-            "",
-            "",
-            "",
-            "",
-            "////",
-            "////",
-            "////",
-            "",
-            "////",
-        ]
-        hatch = [  # noqa: F841
-            "////",
-            "////",
-            "////",
-            "////",
-            "",
-            "",
-            "",
-            "////",
-            "",
-        ]
-        rects = ax.bar(
-            pos + 0.5 * width,
-            df.iloc[:, 1],
-            width * 0.95,
-            color=color,
-            # hatch=hatch,
-            # zorder=zorder,
-            **edge,
-        )
-        if zorder == 2:
-            ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df.iloc[:, 1]], size=6)
-
-    ax.set_xticks([p + 0.5 * width for p in pos])
-
-    models = df.iloc[:, 0]
-    model_map = {
-        "gpt-4-0613": "gpt-4-\n0613",
-        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
-        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
-        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
-    }
-    model_labels = []
-    for model in models:
-        ml = model_map.get(model, model)
-        model_labels.append(ml)
-    ax.set_xticklabels(model_labels, rotation=0)
-
-    top = 95
-    ax.annotate(
-        "First attempt,\nbased on\nnatural language\ninstructions",
-        xy=(1.0, 53),
-        xytext=(0.75, top),
-        horizontalalignment="center",
-        verticalalignment="top",
-        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
-    )
-    ax.annotate(
-        "Second attempt,\nincluding unit test\nerror output",
-        xy=(1.55, 65),
-        xytext=(1.9, top),
-        horizontalalignment="center",
-        verticalalignment="top",
-        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
-    )
-
-    ax.set_ylabel("Percent of exercises completed successfully")
-    # ax.set_xlabel("Model")
-    ax.set_title("Code Editing Skill")
-    # ax.legend(
-    #    title="Model family",
-    #    loc="upper left",
-    # )
-    ax.set_ylim(top=100)
-
-    plt.tight_layout()
-    plt.savefig("tmp.svg")
-    imgcat(fig)
-
-    # df.to_csv("tmp.benchmarks.csv")
-
-
-def plot_refactoring(df):
-    tries = [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()]
-
-    plt.rcParams["hatch.linewidth"] = 0.5
-    plt.rcParams["hatch.color"] = "#444444"
-
-    from matplotlib import rc
-
-    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
-
-    fig, ax = plt.subplots(figsize=(6, 4))
-    ax.grid(axis="y", zorder=0, lw=0.2)
-
-    zorder = 1
-    for grouped in tries:
-        zorder += 1
-        df = grouped.unstack()
-
-        i, j = 0, 1
-        temp = df.iloc[i].copy()
-        df.iloc[i], df.iloc[j] = df.iloc[j], temp
-        dump(df)
-
-        # df.sort_values(by=["model"], ascending=False, inplace=True)
-        num_models, num_formats = df.shape
-
-        pos = np.array(range(num_models))
-        width = 0.8 / num_formats
-
-        formats = df.columns
-        models = df.index
-
-        dump(df)
-        dump(models)
-        dump(formats)
-        for i, fmt in enumerate(formats):
-            hatch = ""
-
-            if fmt == "diff":
-                color = "#b3e6a8"
-                label = "Search/replace blocks"
-            elif fmt == "udiff":
-                color = "#b3d1e6"
-                label = "Unified diffs"
-            elif fmt == "difffolk":
-                label = "Baseline + blind, no hands, $2k tip, etc"
-                color = "#b3e6a8"
-                hatch = "////"
-            elif fmt == "udifffolk":
-                label = "Unified diffs + blind, no hands, $2k tip, etc"
-                color = "#b3d1e6"
-                hatch = "////"
-
-            if zorder > 1:
-                edge = dict(
-                    edgecolor="#ffffff",
-                    linewidth=1.5,
-                )
-            else:
-                edge = dict()
-            if zorder == 2:
-                edge["label"] = label
-
-            color = [
-                "#b3e6a8",
-                "#b3e6a8",
-                "#b3d1e6",
-            ]
-
-            rects = ax.bar(
-                pos + i * width,
-                df[fmt],
-                width * 0.95,
-                color=color,
-                hatch=hatch,
-                zorder=zorder,
-                **edge,
-            )
-
-            if zorder == 2:
-                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)
-
-    ax.set_xticks([p + 0 * width for p in pos])
-
-    model_map = {
-        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
-        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
-        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
-    }
-    model_labels = []
-
-    for model in models:
-        ml = model_map.get(model, model)
-        model_labels.append(ml)
-
-    model_labels = [
-        "gpt-4-\n1106-preview",
-        "gpt-4-\n0125-preview",
-        "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
-    ]
-    ax.set_xticklabels(model_labels, rotation=0)
-
-    ax.set_ylabel("Percent of exercises completed successfully")
-    # ax.set_xlabel("Model")
-    ax.set_title('Refactoring "Laziness" Benchmark')
-    # ax.legend(
-    # title="Edit Format",
-    # loc="upper left",
-    # bbox_to_anchor=(0.95, 0.95),
-    # )
-    ax.set_ylim(top=100)
-
-    plt.tight_layout()
-    plt.savefig("tmp.svg")
-    imgcat(fig)
-
-    # df.to_csv("tmp.benchmarks.csv")
-
-
 def resolve_dirname(dirname, use_single_prior, make_new):
    if len(dirname.parts) > 1:
        return dirname
--- a/benchmark/plots.py
+++ b/benchmark/plots.py
@ -0,0 +1,417 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from imgcat import imgcat
+
+from aider.dump import dump  # noqa: F401
+
+
+def plot_timing(df):
+    """Chart the mean ``avg_duration`` of every (model, edit_format) pair.
+
+    Draws one group of bars per model with one bar per edit format, writes the
+    figure to ``tmp_timing.svg`` and hands it to ``imgcat``.
+
+    Args:
+        df: DataFrame providing ``model``, ``edit_format`` and ``avg_duration`` columns.
+    """
+    from matplotlib import rc
+
+    plt.rcParams["hatch.linewidth"] = 0.5
+    plt.rcParams["hatch.color"] = "#444444"
+    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.grid(axis="y", zorder=0, lw=0.2)
+
+    # Rows are models, columns are edit formats.
+    durations = df.groupby(["model", "edit_format"])["avg_duration"].mean().unstack()
+    model_count, format_count = durations.shape
+
+    group_starts = np.array(range(model_count))
+    bar_width = 0.8 / format_count
+    bar_zorder = 2  # keeps the bars above the grid, which is drawn at zorder 0
+
+    for slot, edit_format in enumerate(durations.columns):
+        column = durations[edit_format]
+        uses_diff = "diff" in edit_format
+        uses_func = "func" in edit_format
+        bars = ax.bar(
+            group_starts + slot * bar_width,
+            column,
+            bar_width * 0.95,
+            label=edit_format,
+            color="#b3e6a8" if uses_diff else "#b3d1e6",
+            hatch="////" if uses_func else "",
+            zorder=bar_zorder,
+            edgecolor="#ffffff",
+            linewidth=1.5,
+        )
+        ax.bar_label(bars, padding=4, labels=[f"{secs:.1f}s" for secs in column], size=6)
+
+    ax.set_xticks([start + 0.5 * bar_width for start in group_starts])
+    ax.set_xticklabels(durations.index)
+
+    ax.set_ylabel("Average GPT response time\nper exercise (sec)")
+    ax.set_title("GPT Code Editing Speed\n(time per coding task)")
+    ax.legend(title="Edit Format", loc="upper left")
+
+    # Leave 10% of headroom above the tallest bar for the value labels.
+    tallest = max(durations.max())
+    ax.set_ylim(top=tallest * 1.1)
+
+    plt.tight_layout()
+    plt.savefig("tmp_timing.svg")
+    imgcat(fig)
+
+
+def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
+    """Plot pass rates per (model, edit_format) as overlaid grouped bars.
+
+    The mean ``pass_rate_2`` is drawn first and labelled with its percentage;
+    the mean ``pass_rate_1`` is then drawn on top of it at a higher zorder.
+    The figure is saved to ``tmp.svg`` and handed to ``imgcat``.
+
+    Args:
+        df: DataFrame providing ``model``, ``edit_format``, ``pass_rate_1`` and
+            ``pass_rate_2`` columns.
+        repeats: Sized collection; the error bar is drawn only when it is non-empty.
+        repeat_hi: Upper error extent, relative to ``repeat_avg``.
+        repeat_lo: Lower error extent, relative to ``repeat_avg``.
+        repeat_avg: Y position of the error bar's centre.
+
+    Note:
+        The error bar's x position and both annotation anchors are hard-coded
+        coordinates, so they only line up for the data they were tuned against.
+    """
+    from matplotlib import rc
+
+    by_model_and_format = df.groupby(["model", "edit_format"])
+    # Draw order matters: pass_rate_2 goes underneath, pass_rate_1 is overlaid on it.
+    tries = [by_model_and_format[column].mean() for column in ("pass_rate_2", "pass_rate_1")]
+
+    plt.rcParams["hatch.linewidth"] = 0.5
+    plt.rcParams["hatch.color"] = "#444444"
+    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.grid(axis="y", zorder=0, lw=0.2)
+
+    for layer, grouped in enumerate(tries):
+        zorder = layer + 2  # the grid sits at zorder 0; each later layer stacks higher
+        is_base_layer = layer == 0
+
+        table = grouped.unstack()  # rows: models, columns: edit formats
+        num_models, num_formats = table.shape
+
+        pos = np.array(range(num_models))
+        width = 0.8 / num_formats
+        models = table.index
+
+        for i, fmt in enumerate(table.columns):
+            style = dict(edgecolor="#ffffff", linewidth=1.5)
+            if is_base_layer:
+                # Only the base layer feeds the legend, so each format appears once.
+                style["label"] = fmt
+
+            rects = ax.bar(
+                pos + i * width,
+                table[fmt],
+                width * 0.95,
+                color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
+                hatch="////" if "func" in fmt else "",
+                zorder=zorder,
+                **style,
+            )
+            if is_base_layer:
+                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in table[fmt]], size=6)
+
+    if len(repeats):
+        ax.errorbar(
+            1.4,
+            repeat_avg,
+            yerr=[[repeat_lo], [repeat_hi]],
+            fmt="none",
+            zorder=5,
+            capsize=2.5,
+            elinewidth=1,
+            markeredgewidth=1,
+        )
+
+    ax.set_xticks([p + 0.5 * width for p in pos])
+
+    model_labels = []
+    for model in models:
+        pieces = model.split("-")
+        if len(pieces) > 2:
+            # Break long names after the second dash-separated piece.
+            model_labels.append("-".join(pieces[:2]) + "-\n" + "-".join(pieces[2:]))
+        else:
+            # Short names have nothing to wrap; avoid a dangling "-\n" suffix.
+            model_labels.append(model)
+    ax.set_xticklabels(model_labels)
+
+    top = 95
+    arrow = {"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"}
+    ax.annotate(
+        "First attempt,\nbased on\nnatural language\ninstructions",
+        xy=(2.20, 41),
+        xytext=(2, top),
+        horizontalalignment="center",
+        verticalalignment="top",
+        arrowprops=arrow,
+    )
+    ax.annotate(
+        "Second attempt,\nincluding unit test\nerror output",
+        xy=(2.55, 56),
+        xytext=(3.5, top),
+        horizontalalignment="center",
+        verticalalignment="top",
+        arrowprops=arrow,
+    )
+
+    ax.set_ylabel("Percent of exercises completed successfully")
+    ax.set_title("GPT Code Editing Skill\n(percent coding tasks correct)")
+    ax.legend(title="Edit Format", loc="upper left")
+    ax.set_ylim(top=100)
+
+    plt.tight_layout()
+    plt.savefig("tmp.svg")
+    imgcat(fig)
+
+
+def plot_outcomes_claude(df):
+    """Plot per-model pass rates as two overlaid bar series, one bar per row.
+
+    ``pass_rate_2`` is drawn first and labelled with its percentage, then
+    ``pass_rate_1`` is drawn over it. The figure is saved to ``tmp.svg`` and
+    handed to ``imgcat``. The incoming frame and each plotted slice are printed.
+
+    Args:
+        df: DataFrame providing ``model``, ``pass_rate_1`` and ``pass_rate_2``
+            columns. It is not modified; relabelling happens on a copy.
+
+    Note:
+        Both annotation anchors are hard-coded coordinates, so they only line
+        up for the data they were tuned against.
+    """
+    from matplotlib import rc
+
+    print(df)
+
+    # Fix wrong column label. `assign` returns a new frame, so the caller's
+    # DataFrame is left untouched.
+    df = df.assign(model=df["model"].replace("gpt-4-0314", "gpt-4-0613"))
+
+    # Draw order matters: pass_rate_2 goes underneath, pass_rate_1 is overlaid on it.
+    tries = [
+        df[["model", "pass_rate_2"]],
+        df[["model", "pass_rate_1"]],
+    ]
+
+    plt.rcParams["hatch.linewidth"] = 0.5
+    plt.rcParams["hatch.color"] = "#444444"
+    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.grid(axis="y", zorder=0, lw=0.2)
+
+    # Bar geometry is the same for both series: one bar per row of the frame.
+    pos = np.array(range(len(df)))
+    width = 0.6
+
+    # Colours are assigned by bar position, not looked up by model name.
+    colors = [
+        "#b3e6a8",
+        "#b3e6a8",
+        "#b3e6a8",
+        "#b3d1e6",
+    ]
+
+    for attempt, frame in enumerate(tries):
+        is_base_layer = attempt == 0
+        print(frame)
+
+        style = dict(edgecolor="#ffffff", linewidth=1.5)
+        if is_base_layer:
+            style["label"] = "??"
+
+        rates = frame.iloc[:, 1]
+        rects = ax.bar(
+            pos + 0.5 * width,
+            rates,
+            width * 0.95,
+            color=colors,
+            **style,
+        )
+        if is_base_layer:
+            ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in rates], size=6)
+
+    ax.set_xticks([p + 0.5 * width for p in pos])
+
+    # Known model names get a manual line break; anything else is shown verbatim.
+    model_map = {
+        "gpt-4-0613": "gpt-4-\n0613",
+        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
+        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
+        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
+    }
+    model_labels = [model_map.get(model, model) for model in df["model"]]
+    ax.set_xticklabels(model_labels, rotation=0)
+
+    top = 95
+    arrow = {"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"}
+    ax.annotate(
+        "First attempt,\nbased on\nnatural language\ninstructions",
+        xy=(1.0, 53),
+        xytext=(0.75, top),
+        horizontalalignment="center",
+        verticalalignment="top",
+        arrowprops=arrow,
+    )
+    ax.annotate(
+        "Second attempt,\nincluding unit test\nerror output",
+        xy=(1.55, 65),
+        xytext=(1.9, top),
+        horizontalalignment="center",
+        verticalalignment="top",
+        arrowprops=arrow,
+    )
+
+    ax.set_ylabel("Percent of exercises completed successfully")
+    ax.set_title("Code Editing Skill")
+    ax.set_ylim(top=100)
+
+    plt.tight_layout()
+    plt.savefig("tmp.svg")
+    imgcat(fig)
+
+
+def plot_refactoring(df):
+    """Plot the mean ``pass_rate_1`` per (model, edit_format) as grouped bars.
+
+    The first two model rows are swapped before plotting, so the model that
+    sorts second is drawn first. Tick labels are derived from the reordered
+    rows, so they always match the bars. The figure is saved to ``tmp.svg``
+    and handed to ``imgcat``.
+
+    Args:
+        df: DataFrame providing ``model``, ``edit_format`` and ``pass_rate_1`` columns.
+    """
+    from matplotlib import rc
+
+    plt.rcParams["hatch.linewidth"] = 0.5
+    plt.rcParams["hatch.color"] = "#444444"
+    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.grid(axis="y", zorder=0, lw=0.2)
+
+    # Rows are models, columns are edit formats.
+    table = df.groupby(["model", "edit_format"])["pass_rate_1"].mean().unstack()
+
+    # Swap the first two models. Reindexing moves the row labels together with
+    # the values, and the guard avoids an IndexError when there is one model.
+    if len(table.index) >= 2:
+        order = list(table.index)
+        order[0], order[1] = order[1], order[0]
+        table = table.reindex(order)
+
+    num_models, num_formats = table.shape
+    pos = np.array(range(num_models))
+    width = 0.8 / num_formats
+    zorder = 2  # keeps the bars above the grid, which is drawn at zorder 0
+
+    # edit_format -> (legend label, hatch); unknown formats fall back to their own name.
+    format_styles = {
+        "diff": ("Search/replace blocks", ""),
+        "udiff": ("Unified diffs", ""),
+        "difffolk": ("Baseline + blind, no hands, $2k tip, etc", "////"),
+        "udifffolk": ("Unified diffs + blind, no hands, $2k tip, etc", "////"),
+    }
+
+    # Colours are assigned by bar position (i.e. per model), not per edit format.
+    colors = [
+        "#b3e6a8",
+        "#b3e6a8",
+        "#b3d1e6",
+    ]
+
+    for i, fmt in enumerate(table.columns):
+        label, hatch = format_styles.get(fmt, (fmt, ""))
+        rects = ax.bar(
+            pos + i * width,
+            table[fmt],
+            width * 0.95,
+            color=colors,
+            hatch=hatch,
+            zorder=zorder,
+            edgecolor="#ffffff",
+            linewidth=1.5,
+            label=label,
+        )
+        ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in table[fmt]], size=6)
+
+    ax.set_xticks(pos)
+
+    # Known model names get a manual line break; anything else is shown verbatim.
+    model_map = {
+        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
+        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
+        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
+    }
+    model_labels = [model_map.get(model, model) for model in table.index]
+    ax.set_xticklabels(model_labels, rotation=0)
+
+    ax.set_ylabel("Percent of exercises completed successfully")
+    ax.set_title('Refactoring "Laziness" Benchmark')
+    ax.set_ylim(top=100)
+
+    plt.tight_layout()
+    plt.savefig("tmp.svg")
+    imgcat(fig)