Added udiff graph

This commit is contained in:
Paul Gauthier 2023-12-18 09:53:28 -08:00
parent 0de715461a
commit 6ab2db192c
3 changed files with 1847 additions and 14 deletions

1751
assets/benchmarks-udiff.svg Normal file

File diff suppressed because it is too large Load diff

After

Width:  |  Height:  |  Size: 44 KiB

View file

@@ -77,6 +77,9 @@ def show_stats(dirnames, graphs):
elif row.model.startswith(gpt4):
row.model = gpt4 + "\n" + row.model[len(gpt4) :]
if "folk" in row.dir_name:
row.edit_format = "folk"
# if row.model == "gpt-4\n-1106-preview":
# row.model += "\n(preliminary)"
@@ -116,15 +119,16 @@ def show_stats(dirnames, graphs):
# use the average in the main bar
rows[repeat_row]["pass_rate_2"] = repeat_avg
else:
repeat_hi = repeat_lo = repeat_avg = None
repeat_hi = repeat_lo = repeat_avg = None # noqa: F841
df = pd.DataFrame.from_records(rows)
df.sort_values(by=["model", "edit_format"], inplace=True)
# dump(df)
if graphs:
plot_timing(df)
plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
# plot_timing(df)
# plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
plot_refactoring(df)
def plot_timing(df):
@@ -283,6 +287,88 @@ def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
# df.to_csv("tmp.benchmarks.csv")
def plot_refactoring(df):
    """Plot the refactoring ("laziness") benchmark as a grouped bar chart.

    Averages ``pass_rate_1`` over each (model, edit_format) pair and draws
    one bar per edit format for every model.  The figure is written to
    ``tmp.svg`` and also displayed inline via ``imgcat``.

    Args:
        df: benchmark results with at least ``model``, ``edit_format``
            and ``pass_rate_1`` columns.
    """
    tries = [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    zorder = 1
    num_formats = 1  # overwritten below; keeps tick math safe if tries is empty
    for grouped in tries:
        zorder += 1
        df = grouped.unstack()
        num_models, num_formats = df.shape

        pos = np.array(range(num_models))
        width = 0.8 / num_formats

        formats = df.columns
        models = df.index

        for i, fmt in enumerate(formats):
            # Fix: color/label previously leaked from the prior iteration
            # when fmt was not one of the known formats (NameError on the
            # very first unknown format).  Provide a neutral fallback.
            hatch = ""
            if fmt == "diff":
                color = "#b3e6a8"
                label = "Baseline (search/replace blocks)"
            elif fmt == "udiff":
                color = "#b3d1e6"
                label = "Unified diffs"
            elif fmt == "folk":
                label = "Folk remedy prompt (blind, no hands, ...)"
                color = "#b3e6a8"
                hatch = "////"
            else:
                color = "#cccccc"
                label = fmt

            if zorder > 1:
                edge = dict(
                    edgecolor="#ffffff",
                    linewidth=1.5,
                )
            else:
                edge = dict()
            if zorder == 2:
                edge["label"] = label

            rects = ax.bar(
                pos + i * width,
                df[fmt],
                width * 0.95,
                color=color,
                hatch=hatch,
                zorder=zorder,
                **edge,
            )
            if zorder == 2:
                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)

    # Fix: ticks were at pos + 0.5 * width, which centers the label under
    # the bar group only when there are exactly 2 formats.  Center under
    # the whole group regardless of how many formats are plotted.
    ax.set_xticks([p + 0.5 * (num_formats - 1) * width for p in pos])
    ax.set_xticklabels(models)

    ax.set_ylabel("Percent of exercises completed successfully")
    # ax.set_xlabel("Model")
    ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
    ax.legend(
        title="Edit Format",
        loc="upper left",
        # bbox_to_anchor=(0.95, 0.95),
    )
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)

    # df.to_csv("tmp.benchmarks.csv")
def resolve_dirname(dirname, use_single_prior, make_new):
if len(dirname.parts) > 1:
return dirname

View file

@@ -1,7 +1,7 @@
# Fixing GPT-4 Turbo laziness with unified diffs
![robot flowchart](../assets/udiffs.jpg)
![robot flowchart](../assets/benchmarks-udiff.svg)
Aider now asks GPT-4 Turbo to use
@@ -15,10 +15,8 @@ Aider also has a new benchmarking suite
designed to both provoke and quantify lazy coding.
It consists of
39 python refactoring tasks,
which ask GPT to remove a non-trivial method from a class and make it
a stand alone function.
GPT-4 Turbo is prone to being lazy on this sort of task,
often leaving comments like
which tend to make GPT-4 Turbo very lazy,
often resulting in comments like
"...include the original method body...".
This new laziness benchmark produced the following results with `gpt-4-1106-preview`:
@@ -56,8 +54,8 @@ and evaluate their significance using ablation experiments.
## Unified diff editing format
The design and implementation of aider's new unified diff editing format
helped clarify some general principles, which I think are applicable to any effective
GPT-4 code editing format:
helped clarify some general principles
for GPT-4 code editing:
- FAMILIAR - Choose an edit format that GPT is already familiar with.
- SIMPLE - Choose a simple format that avoids escaping, syntactic overhead and brittle specifiers like line numbers or line counts.
@@ -68,9 +66,7 @@ A helpful shortcut here is to have empathy for GPT, and imagine you
are the one being asked to specify code edits.
Would you want to hand type a properly escaped json data structure
to invoke surgical insert, delete, replace operations on specific code line numbers?
How would you feel about
errors firing
after any typo, off-by-one line number or flubbed escape sequence?
How would you feel about any mistake causing all your work to be discarded?
GPT is quantitatively better at code editing when you reduce the
burden of formatting edits by using a familiar, simple, high level
@@ -93,7 +89,7 @@ default output format of `git diff`:
return
```
Choosing such a familiar, popular output format means that GPT has
Choosing such a popular output format means that GPT has
seen *many* examples in its training data.
It's been trained to generate
text that conforms to the unified diff syntax.