Added udiff graph

2025-05-29 16:54:59 +00:00 · 2023-12-18 09:53:28 -08:00 · 2023-12-18 09:53:28 -08:00 · 6ab2db192c
commit 6ab2db192c
parent 0de715461a
3 changed files with 1847 additions and 14 deletions
--- a/assets/benchmarks-udiff.svg
+++ b/assets/benchmarks-udiff.svg
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -77,6 +77,9 @@ def show_stats(dirnames, graphs):
        elif row.model.startswith(gpt4):
            row.model = gpt4 + "\n" + row.model[len(gpt4) :]
        if "folk" in row.dir_name:
            row.edit_format = "folk"
        # if row.model == "gpt-4\n-1106-preview":
        #    row.model += "\n(preliminary)"
@ -116,15 +119,16 @@ def show_stats(dirnames, graphs):
        # use the average in the main bar
        rows[repeat_row]["pass_rate_2"] = repeat_avg
    else:
-        repeat_hi = repeat_lo = repeat_avg = None
+        repeat_hi = repeat_lo = repeat_avg = None  # noqa: F841
    df = pd.DataFrame.from_records(rows)
    df.sort_values(by=["model", "edit_format"], inplace=True)
    # dump(df)
    if graphs:
-        plot_timing(df)
+        # plot_timing(df)
-        plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
+        # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
        plot_refactoring(df)
 def plot_timing(df):
@ -283,6 +287,88 @@ def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
    # df.to_csv("tmp.benchmarks.csv")
 def plot_refactoring(df):
    """Draw a grouped bar chart of mean ``pass_rate_1`` per model and edit format.

    ``df`` must have ``model``, ``edit_format`` and ``pass_rate_1`` columns.
    Rows are grouped by (model, edit_format) and averaged; each model becomes
    a group of bars on the x axis, with one bar per edit format.

    The figure is written to ``tmp.svg`` and then passed to ``imgcat``.
    """
    tries = [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()]

    # (color, legend label, hatch) for each known edit format.
    styles = {
        "diff": ("#b3e6a8", "Baseline (search/replace blocks)", ""),
        "udiff": ("#b3d1e6", "Unified diffs", ""),
        "folk": ("#b3e6a8", "Folk remedy prompt (blind, no hands, ...)", "////"),
    }

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    zorder = 1
    for grouped in tries:
        zorder += 1
        # Rows are models, columns are edit formats.
        table = grouped.unstack()

        num_models, num_formats = table.shape

        pos = np.arange(num_models)
        width = 0.8 / num_formats

        models = table.index
        # Center each tick under its whole group of bars. Bars sit at
        # pos + i * width for i in 0..num_formats-1, so the group's midpoint
        # is (num_formats - 1) * width / 2 to the right of pos. This equals
        # the previous 0.5 * width when there are exactly two formats.
        tick_positions = pos + (num_formats - 1) * width / 2

        for i, fmt in enumerate(table.columns):
            # Unknown formats get a neutral color and their own name as the
            # label, instead of raising or reusing the previous bar's style.
            color, label, hatch = styles.get(fmt, ("#cccccc", str(fmt), ""))

            if zorder > 1:
                bar_kwargs = dict(
                    edgecolor="#ffffff",
                    linewidth=1.5,
                )
            else:
                bar_kwargs = dict()
            if zorder == 2:
                # Only the first series contributes legend entries.
                bar_kwargs["label"] = label

            rects = ax.bar(
                pos + i * width,
                table[fmt],
                width * 0.95,
                color=color,
                hatch=hatch,
                zorder=zorder,
                **bar_kwargs,
            )

            if zorder == 2:
                ax.bar_label(
                    rects,
                    padding=4,
                    labels=[f"{v:.0f}%" for v in table[fmt]],
                    size=6,
                )

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(models)

    ax.set_ylabel("Percent of exercises completed successfully")
    # ax.set_xlabel("Model")
    ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
    ax.legend(
        title="Edit Format",
        loc="upper left",
        # bbox_to_anchor=(0.95, 0.95),
    )
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)

    # df.to_csv("tmp.benchmarks.csv")
 def resolve_dirname(dirname, use_single_prior, make_new):
    if len(dirname.parts) > 1:
        return dirname
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@ -1,7 +1,7 @@
 # Fixing GPT-4 Turbo laziness with unified diffs
-![robot flowchart](../assets/udiffs.jpg)
+![robot flowchart](../assets/benchmarks-udiff.svg)
 Aider now asks GPT-4 Turbo to use
@ -15,10 +15,8 @@ Aider also has a new benchmarking suite
 designed to both provoke and quantify lazy coding.
 It consists of
 39 python refactoring tasks,
-which ask GPT to remove a non-trivial method from a class and make it
+which tend to make GPT-4 Turbo very lazy,
-a stand alone function.
+often resulting in comments like
 GPT-4 Turbo is prone to being lazy on this sort of task,
 often leaving comments like
 "...include the original method body...".
 This new laziness benchmark produced the following results with `gpt-4-1106-preview`:
@ -56,8 +54,8 @@ and evaluate their significance using ablation experiments.
 ## Unified diff editing format
 The design and implementation of aider's new unified diff editing format
-helped clarify some general principles, which I think are applicable to any effective
+helped clarify some general principles
-GPT-4 code editing format:
+for GPT-4 code editing:
 - FAMILIAR - Choose an edit format that GPT is already familiar with.
 - SIMPLE - Choose a simple format that avoids escaping, syntactic overhead and brittle specifiers like line numbers or line counts.
@ -68,9 +66,7 @@ A helpful shortcut here is to have empathy for GPT, and imagine you
 are the one being asked to specify code edits.
 Would you want to hand-type a properly escaped JSON data structure
 to invoke surgical insert, delete, replace operations on specific code line numbers?
-How would you feel about
+How would you feel about any mistake causing all your work to be discarded?
 errors firing
 after any typo, off-by-one line number or flubbed escape sequence?
 GPT is quantitatively better at code editing when you reduce the
 burden of formatting edits by using a familiar, simple, high level
@ -93,7 +89,7 @@ default output format of `git diff`:
     return
 ```
-Choosing such a familiar, popular output format means that GPT has
+Choosing such a popular output format means that GPT has
 seen *many* examples in its training data.
 It's been trained to generate
 text that conforms to the unified diff syntax.