mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-29 16:54:59 +00:00
Added udiff graph
This commit is contained in:
parent
0de715461a
commit
6ab2db192c
3 changed files with 1847 additions and 14 deletions
1751
assets/benchmarks-udiff.svg
Normal file
1751
assets/benchmarks-udiff.svg
Normal file
File diff suppressed because it is too large
Load diff
After Width: | Height: | Size: 44 KiB |
|
@ -77,6 +77,9 @@ def show_stats(dirnames, graphs):
|
||||||
elif row.model.startswith(gpt4):
|
elif row.model.startswith(gpt4):
|
||||||
row.model = gpt4 + "\n" + row.model[len(gpt4) :]
|
row.model = gpt4 + "\n" + row.model[len(gpt4) :]
|
||||||
|
|
||||||
|
if "folk" in row.dir_name:
|
||||||
|
row.edit_format = "folk"
|
||||||
|
|
||||||
# if row.model == "gpt-4\n-1106-preview":
|
# if row.model == "gpt-4\n-1106-preview":
|
||||||
# row.model += "\n(preliminary)"
|
# row.model += "\n(preliminary)"
|
||||||
|
|
||||||
|
@ -116,15 +119,16 @@ def show_stats(dirnames, graphs):
|
||||||
# use the average in the main bar
|
# use the average in the main bar
|
||||||
rows[repeat_row]["pass_rate_2"] = repeat_avg
|
rows[repeat_row]["pass_rate_2"] = repeat_avg
|
||||||
else:
|
else:
|
||||||
repeat_hi = repeat_lo = repeat_avg = None
|
repeat_hi = repeat_lo = repeat_avg = None # noqa: F841
|
||||||
|
|
||||||
df = pd.DataFrame.from_records(rows)
|
df = pd.DataFrame.from_records(rows)
|
||||||
df.sort_values(by=["model", "edit_format"], inplace=True)
|
df.sort_values(by=["model", "edit_format"], inplace=True)
|
||||||
|
|
||||||
# dump(df)
|
# dump(df)
|
||||||
if graphs:
|
if graphs:
|
||||||
plot_timing(df)
|
# plot_timing(df)
|
||||||
plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
|
# plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
|
||||||
|
plot_refactoring(df)
|
||||||
|
|
||||||
|
|
||||||
def plot_timing(df):
|
def plot_timing(df):
|
||||||
|
@ -283,6 +287,88 @@ def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
|
||||||
# df.to_csv("tmp.benchmarks.csv")
|
# df.to_csv("tmp.benchmarks.csv")
|
||||||
|
|
||||||
|
|
||||||
|
def plot_refactoring(df):
    """Plot the refactoring ("laziness") benchmark as a grouped bar chart.

    Averages ``pass_rate_1`` for every (model, edit_format) pair found in
    ``df`` and draws one cluster of bars per model, with one bar per edit
    format. The figure is saved to ``tmp.svg`` and then handed to ``imgcat``.

    Args:
        df: DataFrame with at least the ``model``, ``edit_format`` and
            ``pass_rate_1`` columns.
    """
    from matplotlib import rc

    # edit format -> (bar color, legend label, hatch pattern)
    known_styles = {
        "diff": ("#b3e6a8", "Baseline (search/replace blocks)", ""),
        "udiff": ("#b3d1e6", "Unified diffs", ""),
        "folk": ("#b3e6a8", "Folk remedy prompt (blind, no hands, ...)", "////"),
    }
    # Used for any edit format not listed above, so an unexpected format
    # still gets a bar and a legend entry instead of raising on an
    # unassigned color/label.
    fallback_color = "#cccccc"

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Rows are models, columns are edit formats. A separate name is used so
    # the ``df`` argument is not rebound.
    table = df.groupby(["model", "edit_format"])["pass_rate_1"].mean().unstack()
    num_models, num_formats = table.shape

    pos = np.arange(num_models)
    width = 0.8 / num_formats

    for i, fmt in enumerate(table.columns):
        color, label, hatch = known_styles.get(fmt, (fallback_color, str(fmt), ""))
        values = table[fmt]

        rects = ax.bar(
            pos + i * width,
            values,
            width * 0.95,
            color=color,
            hatch=hatch,
            zorder=2,
            edgecolor="#ffffff",
            linewidth=1.5,
            label=label,
        )

        # A model that has no run for this format shows up as NaN after
        # unstack(); leave its label blank rather than printing "nan%".
        bar_labels = ["" if np.isnan(v) else f"{v:.0f}%" for v in values]
        ax.bar_label(rects, padding=4, labels=bar_labels, size=6)

    # Center each tick under its cluster of bars for any number of formats
    # (this equals ``pos + 0.5 * width`` when there are exactly two).
    ax.set_xticks(pos + (num_formats - 1) * width / 2)
    ax.set_xticklabels(table.index)

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
    ax.legend(
        title="Edit Format",
        loc="upper left",
    )
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
|
||||||
|
|
||||||
|
|
||||||
def resolve_dirname(dirname, use_single_prior, make_new):
|
def resolve_dirname(dirname, use_single_prior, make_new):
|
||||||
if len(dirname.parts) > 1:
|
if len(dirname.parts) > 1:
|
||||||
return dirname
|
return dirname
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
|
|
||||||
# Fixing GPT-4 Turbo laziness with unified diffs
|
# Fixing GPT-4 Turbo laziness with unified diffs
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
|
||||||
Aider now asks GPT-4 Turbo to use
|
Aider now asks GPT-4 Turbo to use
|
||||||
|
@ -15,10 +15,8 @@ Aider also has a new benchmarking suite
|
||||||
designed to both provoke and quantify lazy coding.
|
designed to both provoke and quantify lazy coding.
|
||||||
It consists of
|
It consists of
|
||||||
39 python refactoring tasks,
|
39 python refactoring tasks,
|
||||||
which ask GPT to remove a non-trivial method from a class and make it
|
which tend to make GPT-4 Turbo very lazy,
|
||||||
a stand alone function.
|
often resulting in comments like
|
||||||
GPT-4 Turbo is prone to being lazy on this sort of task,
|
|
||||||
often leaving comments like
|
|
||||||
"...include the original method body...".
|
"...include the original method body...".
|
||||||
|
|
||||||
This new laziness benchmark produced the following results with `gpt-4-1106-preview`:
|
This new laziness benchmark produced the following results with `gpt-4-1106-preview`:
|
||||||
|
@ -56,8 +54,8 @@ and evaluate their significance using ablation experiments.
|
||||||
## Unified diff editing format
|
## Unified diff editing format
|
||||||
|
|
||||||
The design and implementation of aider's new unified diff editing format
|
The design and implementation of aider's new unified diff editing format
|
||||||
helped clarify some general principles, which I think are applicable to any effective
|
helped clarify some general principles
|
||||||
GPT-4 code editing format:
|
for GPT-4 code editing:
|
||||||
|
|
||||||
- FAMILIAR - Choose an edit format that GPT is already familiar with.
|
- FAMILIAR - Choose an edit format that GPT is already familiar with.
|
||||||
- SIMPLE - Choose a simple format that avoids escaping, syntactic overhead and brittle specifiers like line numbers or line counts.
|
- SIMPLE - Choose a simple format that avoids escaping, syntactic overhead and brittle specifiers like line numbers or line counts.
|
||||||
|
@ -68,9 +66,7 @@ A helpful shortcut here is to have empathy for GPT, and imagine you
|
||||||
are the one being asked to specify code edits.
|
are the one being asked to specify code edits.
|
||||||
Would you want to hand type a properly escaped json data structure
|
Would you want to hand type a properly escaped json data structure
|
||||||
to invoke surgical insert, delete, replace operations on specific code line numbers?
|
to invoke surgical insert, delete, replace operations on specific code line numbers?
|
||||||
How would you feel about
|
How would you feel about any mistake causing all your work to be discarded?
|
||||||
errors firing
|
|
||||||
after any typo, off-by-one line number or flubbed escape sequence?
|
|
||||||
|
|
||||||
GPT is quantitatively better at code editing when you reduce the
|
GPT is quantitatively better at code editing when you reduce the
|
||||||
burden of formatting edits by using a familiar, simple, high level
|
burden of formatting edits by using a familiar, simple, high level
|
||||||
|
@ -93,7 +89,7 @@ default output format of `git diff`:
|
||||||
return
|
return
|
||||||
```
|
```
|
||||||
|
|
||||||
Choosing such a familiar, popular output format means that GPT has
|
Choosing such a popular output format means that GPT has
|
||||||
seen *many* examples in its training data.
|
seen *many* examples in its training data.
|
||||||
It's been trained to generate
|
It's been trained to generate
|
||||||
text that conforms to the unified diff syntax.
|
text that conforms to the unified diff syntax.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue