diff --git a/assets/benchmarks-udiff.svg b/assets/benchmarks-udiff.svg new file mode 100644 index 000000000..014f04508 --- /dev/null +++ b/assets/benchmarks-udiff.svg @@ -0,0 +1,1751 @@ + + + + + + + + 2023-12-18T09:51:00.014416 + image/svg+xml + + + Matplotlib v3.8.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 9f934dc93..2eca49100 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -77,6 +77,9 @@ def show_stats(dirnames, graphs): elif row.model.startswith(gpt4): row.model = gpt4 + "\n" + row.model[len(gpt4) :] + if "folk" in row.dir_name: + row.edit_format = "folk" + # if row.model == "gpt-4\n-1106-preview": # row.model += "\n(preliminary)" @@ -116,15 +119,16 @@ def show_stats(dirnames, graphs): # use the average in the main bar 
def plot_refactoring(df):
    """Draw the refactoring "laziness" benchmark as a grouped bar chart.

    The rows of ``df`` are grouped by ``model`` and ``edit_format`` and the
    mean of ``pass_rate_1`` is plotted as one bar per (model, edit format)
    pair, with the bars of one model sitting side by side.

    The figure is written to ``tmp.svg`` and then handed to ``imgcat``.

    Args:
        df: DataFrame providing the ``model``, ``edit_format`` and
            ``pass_rate_1`` columns.
    """
    from matplotlib import rc

    # Bar styling per edit format: (fill color, legend label, hatch pattern).
    styles = {
        "diff": ("#b3e6a8", "Baseline (search/replace blocks)", ""),
        "udiff": ("#b3d1e6", "Unified diffs", ""),
        "folk": ("#b3e6a8", "Folk remedy prompt (blind, no hands, ...)", "////"),
    }
    # Used for edit formats missing from `styles`, so an unexpected format is
    # still drawn instead of raising UnboundLocalError or silently reusing the
    # style of the previous bar.
    fallback_color = "#cccccc"

    tries = [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # The first (and currently only) layer is drawn at zorder 2; only that
    # layer contributes legend entries and percentage labels.
    for zorder, grouped in enumerate(tries, start=2):
        # Rows are models, columns are edit formats. A separate name is used
        # so the `df` argument is not rebound.
        table = grouped.unstack()
        num_models, num_formats = table.shape

        pos = np.arange(num_models)
        width = 0.8 / num_formats
        labelled_layer = zorder == 2

        for i, fmt in enumerate(table.columns):
            color, label, hatch = styles.get(fmt, (fallback_color, str(fmt), ""))

            bar_kwargs = dict(edgecolor="#ffffff", linewidth=1.5)
            if labelled_layer:
                bar_kwargs["label"] = label

            rects = ax.bar(
                pos + i * width,
                table[fmt],
                width * 0.95,
                color=color,
                hatch=hatch,
                zorder=zorder,
                **bar_kwargs,
            )

            if labelled_layer:
                # A model with no run for this format is NaN after unstack();
                # leave that bar unlabelled rather than printing "nan%".
                bar_labels = ["" if np.isnan(v) else f"{v:.0f}%" for v in table[fmt]]
                ax.bar_label(rects, padding=4, labels=bar_labels, size=6)

        # Bars are centered at pos + i * width, so the middle of each group is
        # offset by half the span between the first and last bar. This equals
        # 0.5 * width for two formats and stays centered for any other count.
        ax.set_xticks(pos + (num_formats - 1) * width / 2)
        ax.set_xticklabels(table.index)

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
    ax.legend(
        title="Edit Format",
        loc="upper left",
    )
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
@@ -68,9 +66,7 @@ A helpful shortcut here is to have empathy for GPT, and imagine you are the one being asked to specify code edits. Would you want to hand type a properly escaped json data structure to invoke surgical insert, delete, replace operations on specific code line numbers? -How would you feel about -errors firing -after any typo, off-by-one line number or flubbed escape sequence? +How would you feel about any mistake causing all your work to be discarded? GPT is quantitatively better at code editing when you reduce the burden of formatting edits by using a familiar, simple, high level @@ -93,7 +89,7 @@ default output format of `git diff`: return ``` -Choosing such a familiar, popular output format means that GPT has +Choosing such a popular output format means that GPT has seen *many* examples in its training data. It's been trained to generate text that conforms to the unified diff syntax.