Added udiff graph

This commit is contained in:
Paul Gauthier 2023-12-18 09:53:28 -08:00
parent 0de715461a
commit 6ab2db192c
3 changed files with 1847 additions and 14 deletions

1751
assets/benchmarks-udiff.svg Normal file

File diff suppressed because it is too large Load diff

After

Width:  |  Height:  |  Size: 44 KiB

View file

@@ -77,6 +77,9 @@ def show_stats(dirnames, graphs):
elif row.model.startswith(gpt4):
row.model = gpt4 + "\n" + row.model[len(gpt4) :]
if "folk" in row.dir_name:
row.edit_format = "folk"
# if row.model == "gpt-4\n-1106-preview":
# row.model += "\n(preliminary)"
@@ -116,15 +119,16 @@ def show_stats(dirnames, graphs):
# use the average in the main bar
rows[repeat_row]["pass_rate_2"] = repeat_avg
else:
repeat_hi = repeat_lo = repeat_avg = None
repeat_hi = repeat_lo = repeat_avg = None # noqa: F841
df = pd.DataFrame.from_records(rows)
df.sort_values(by=["model", "edit_format"], inplace=True)
# dump(df)
if graphs:
plot_timing(df)
plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
# plot_timing(df)
# plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
plot_refactoring(df)
def plot_timing(df):
@@ -283,6 +287,88 @@ def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
# df.to_csv("tmp.benchmarks.csv")
def plot_refactoring(df):
    """Plot the refactoring ("laziness") benchmark as a grouped bar chart.

    Averages ``pass_rate_1`` over each (model, edit_format) pair and draws
    one bar per edit format for every model.  The figure is written to
    ``tmp.svg`` and also displayed inline via ``imgcat``.

    Args:
        df: benchmark results with at least ``model``, ``edit_format``
            and ``pass_rate_1`` columns.
    """
    tries = [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    zorder = 1
    num_formats = 1  # overwritten below; keeps tick math safe if tries is empty
    for grouped in tries:
        zorder += 1
        df = grouped.unstack()
        num_models, num_formats = df.shape

        pos = np.array(range(num_models))
        width = 0.8 / num_formats

        formats = df.columns
        models = df.index

        for i, fmt in enumerate(formats):
            # Fix: color/label previously leaked from the prior iteration
            # when fmt was not one of the known formats (NameError on the
            # very first unknown format).  Provide a neutral fallback.
            hatch = ""
            if fmt == "diff":
                color = "#b3e6a8"
                label = "Baseline (search/replace blocks)"
            elif fmt == "udiff":
                color = "#b3d1e6"
                label = "Unified diffs"
            elif fmt == "folk":
                label = "Folk remedy prompt (blind, no hands, ...)"
                color = "#b3e6a8"
                hatch = "////"
            else:
                color = "#cccccc"
                label = fmt

            if zorder > 1:
                edge = dict(
                    edgecolor="#ffffff",
                    linewidth=1.5,
                )
            else:
                edge = dict()
            if zorder == 2:
                edge["label"] = label

            rects = ax.bar(
                pos + i * width,
                df[fmt],
                width * 0.95,
                color=color,
                hatch=hatch,
                zorder=zorder,
                **edge,
            )
            if zorder == 2:
                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)

    # Fix: ticks were at pos + 0.5 * width, which centers the label under
    # the bar group only when there are exactly 2 formats.  Center under
    # the whole group regardless of how many formats are plotted.
    ax.set_xticks([p + 0.5 * (num_formats - 1) * width for p in pos])
    ax.set_xticklabels(models)

    ax.set_ylabel("Percent of exercises completed successfully")
    # ax.set_xlabel("Model")
    ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
    ax.legend(
        title="Edit Format",
        loc="upper left",
        # bbox_to_anchor=(0.95, 0.95),
    )
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)

    # df.to_csv("tmp.benchmarks.csv")
def resolve_dirname(dirname, use_single_prior, make_new):
if len(dirname.parts) > 1:
return dirname

View file

@@ -1,7 +1,7 @@
# Fixing GPT-4 Turbo laziness with unified diffs
![robot flowchart](../assets/udiffs.jpg)
![robot flowchart](../assets/benchmarks-udiff.svg)
Aider now asks GPT-4 Turbo to use
@@ -15,10 +15,8 @@ Aider also has a new benchmarking suite
designed to both provoke and quantify lazy coding.
It consists of
39 python refactoring tasks,
which ask GPT to remove a non-trivial method from a class and make it
a stand alone function.
GPT-4 Turbo is prone to being lazy on this sort of task,
often leaving comments like
which tend to make GPT-4 Turbo very lazy,
often resulting in comments like
"...include the original method body...".
This new laziness benchmark produced the following results with `gpt-4-1106-preview`:
@@ -56,8 +54,8 @@ and evaluate their significance using ablation experiments.
## Unified diff editing format
The design and implementation of aider's new unified diff editing format
helped clarify some general principles, which I think are applicable to any effective
GPT-4 code editing format:
helped clarify some general principles
for GPT-4 code editing:
- FAMILIAR - Choose an edit format that GPT is already familiar with.
- SIMPLE - Choose a simple format that avoids escaping, syntactic overhead and brittle specifiers like line numbers or line counts.
@@ -68,9 +66,7 @@ A helpful shortcut here is to have empathy for GPT, and imagine you
are the one being asked to specify code edits.
Would you want to hand type a properly escaped json data structure
to invoke surgical insert, delete, replace operations on specific code line numbers?
How would you feel about
errors firing
after any typo, off-by-one line number or flubbed escape sequence?
How would you feel about any mistake causing all your work to be discarded?
GPT is quantitatively better at code editing when you reduce the
burden of formatting edits by using a familiar, simple, high level
@@ -93,7 +89,7 @@ default output format of `git diff`:
return
```
Choosing such a familiar, popular output format means that GPT has
Choosing such a popular output format means that GPT has
seen *many* examples in its training data.
It's been trained to generate
text that conforms to the unified diff syntax.