aider/benchmark/plots.py
2024-05-06 10:44:34 -07:00

417 lines
11 KiB
Python

import matplotlib.pyplot as plt
import numpy as np
from imgcat import imgcat
from aider.dump import dump # noqa: F401
def plot_timing(df):
    """Plot a graph showing the average duration of each (model, edit_format).

    Draws one group of bars per model with one bar per edit format, labels
    every bar with its value in seconds, saves the figure to
    ``tmp_timing.svg`` and passes it to ``imgcat``.

    Args:
        df: DataFrame with ``model``, ``edit_format`` and ``avg_duration``
            columns. ``avg_duration`` is averaged per (model, edit_format).
    """
    from matplotlib import rc

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # After unstack(): one row per model, one column per edit format.
    grouped = df.groupby(["model", "edit_format"])["avg_duration"].mean().unstack()
    num_models, num_formats = grouped.shape

    pos = np.arange(num_models)
    width = 0.8 / num_formats
    # Bars are drawn above the grid (which uses zorder=0).
    bar_style = dict(edgecolor="#ffffff", linewidth=1.5, zorder=2)

    for i, fmt in enumerate(grouped.columns):
        durations = grouped[fmt]
        rects = ax.bar(
            pos + i * width,
            durations,
            width * 0.95,
            label=fmt,
            color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
            hatch="////" if "func" in fmt else "",
            **bar_style,
        )
        ax.bar_label(rects, padding=4, labels=[f"{v:.1f}s" for v in durations], size=6)

    # Bars sit at pos + i * width for i in 0..num_formats-1, so the middle of
    # each group is offset by (num_formats - 1) * width / 2. The previous
    # fixed 0.5 * width offset was only centred for exactly two formats.
    ax.set_xticks(pos + (num_formats - 1) * width / 2)
    ax.set_xticklabels(grouped.index)

    ax.set_ylabel("Average GPT response time\nper exercise (sec)")
    ax.set_title("GPT Code Editing Speed\n(time per coding task)")
    ax.legend(title="Edit Format", loc="upper left")

    # Leave 10% headroom above the tallest bar. Use the pandas reductions
    # (which skip NaN) rather than the builtin max() over a Series.
    ax.set_ylim(top=grouped.max().max() * 1.1)

    plt.tight_layout()
    plt.savefig("tmp_timing.svg")
    imgcat(fig)
def _wrap_model_label(model):
    """Break a model name onto two lines after its second "-"-separated piece."""
    pieces = model.split("-")
    if len(pieces) <= 2:
        # Nothing to wrap; avoid emitting a dangling "-\n" suffix.
        return model
    return "-".join(pieces[:2]) + "-\n" + "-".join(pieces[2:])


def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
    """Plot first- and second-attempt pass rates per (model, edit_format).

    The mean ``pass_rate_2`` bars are drawn first (zorder 2) and carry the
    legend entries and percentage labels; the mean ``pass_rate_1`` bars are
    drawn on top of them (zorder 3). The figure is saved to ``tmp.svg`` and
    passed to ``imgcat``.

    Args:
        df: DataFrame with ``model``, ``edit_format``, ``pass_rate_1`` and
            ``pass_rate_2`` columns.
        repeats: Sized collection; a single error bar is drawn only when it
            is non-empty.
        repeat_hi: Upper extent of the error bar (second row of ``yerr``).
        repeat_lo: Lower extent of the error bar (first row of ``yerr``).
        repeat_avg: y position of the error bar.
    """
    from matplotlib import rc

    keys = ["model", "edit_format"]
    # Order matters: the first series is the labelled base layer.
    tries = [df.groupby(keys)[column].mean() for column in ("pass_rate_2", "pass_rate_1")]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    for zorder, grouped in enumerate(tries, start=2):
        # One row per model, one column per edit format.
        table = grouped.unstack()
        num_models, num_formats = table.shape
        is_base_layer = zorder == 2

        pos = np.arange(num_models)
        width = 0.8 / num_formats
        models = table.index

        for i, fmt in enumerate(table.columns):
            bar_kwargs = dict(edgecolor="#ffffff", linewidth=1.5)
            if is_base_layer:
                # Only one layer is labelled so each format appears once in the legend.
                bar_kwargs["label"] = fmt

            rects = ax.bar(
                pos + i * width,
                table[fmt],
                width * 0.95,
                color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
                hatch="////" if "func" in fmt else "",
                zorder=zorder,
                **bar_kwargs,
            )
            if is_base_layer:
                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in table[fmt]], size=6)

    if len(repeats):
        # The x position is hard-coded for one specific bar in the chart layout.
        ax.errorbar(
            1.4,
            repeat_avg,
            yerr=[[repeat_lo], [repeat_hi]],
            fmt="none",
            zorder=5,
            capsize=2.5,
            elinewidth=1,
            markeredgewidth=1,
        )

    # Centre each tick under its group of bars: bars sit at pos + i * width for
    # i in 0..num_formats-1, so the group midpoint is (num_formats - 1) * width / 2.
    ax.set_xticks(pos + (num_formats - 1) * width / 2)
    ax.set_xticklabels([_wrap_model_label(model) for model in models])

    # Annotation targets are hard-coded data coordinates for one specific chart.
    top = 95
    ax.annotate(
        "First attempt,\nbased on\nnatural language\ninstructions",
        xy=(2.20, 41),
        xytext=(2, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )
    ax.annotate(
        "Second attempt,\nincluding unit test\nerror output",
        xy=(2.55, 56),
        xytext=(3.5, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title("GPT Code Editing Skill\n(percent coding tasks correct)")
    ax.legend(title="Edit Format", loc="upper left")
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
def plot_outcomes_claude(df):
    """Plot per-model first- and second-attempt pass rates, one bar group per row.

    The ``pass_rate_2`` bars are drawn first and carry the percentage labels;
    the ``pass_rate_1`` bars are drawn afterwards at the same x positions.
    The figure is saved to ``tmp.svg`` and passed to ``imgcat``.

    Args:
        df: DataFrame with ``model``, ``pass_rate_1`` and ``pass_rate_2``
            columns. Its ``model`` column is relabelled in place
            ("gpt-4-0314" becomes "gpt-4-0613").
    """
    from matplotlib import rc

    print(df)

    # Fix wrong column label
    df["model"] = df["model"].replace("gpt-4-0314", "gpt-4-0613")

    # Order matters: the first frame is the labelled base layer.
    tries = [
        df[["model", "pass_rate_2"]],
        df[["model", "pass_rate_1"]],
    ]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Both frames have one row per row of df, so the layout is shared.
    pos = np.arange(len(df))
    width = 0.6
    # One colour per bar, in row order.
    color = [
        "#b3e6a8",
        "#b3e6a8",
        "#b3e6a8",
        "#b3d1e6",
    ]

    for layer, frame in enumerate(tries):
        print(frame)
        is_base_layer = layer == 0

        bar_kwargs = dict(edgecolor="#ffffff", linewidth=1.5)
        if is_base_layer:
            bar_kwargs["label"] = "??"

        # No zorder is passed here, so layering follows draw order.
        rects = ax.bar(
            pos + 0.5 * width,
            frame.iloc[:, 1],
            width * 0.95,
            color=color,
            **bar_kwargs,
        )
        if is_base_layer:
            ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in frame.iloc[:, 1]], size=6)

    ax.set_xticks(pos + 0.5 * width)

    # Known model names get a manual line break; anything else is shown as-is.
    model_map = {
        "gpt-4-0613": "gpt-4-\n0613",
        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
    }
    model_labels = [model_map.get(model, model) for model in df["model"]]
    ax.set_xticklabels(model_labels, rotation=0)

    # Annotation targets are hard-coded data coordinates for one specific chart.
    top = 95
    ax.annotate(
        "First attempt,\nbased on\nnatural language\ninstructions",
        xy=(1.0, 53),
        xytext=(0.75, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )
    ax.annotate(
        "Second attempt,\nincluding unit test\nerror output",
        xy=(1.55, 65),
        xytext=(1.9, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title("Code Editing Skill")
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
def plot_refactoring(df):
    """Plot the mean ``pass_rate_1`` per (model, edit_format) for the refactoring benchmark.

    Draws one bar per (model, edit format) with a percentage label, saves the
    figure to ``tmp.svg`` and passes it to ``imgcat``.

    Args:
        df: DataFrame with ``model``, ``edit_format`` and ``pass_rate_1``
            columns. At least two models are required because the first two
            rows of the grouped table are swapped.
    """
    from matplotlib import rc

    # Legend label and hatch per known edit format.
    format_styles = {
        "diff": ("Search/replace blocks", ""),
        "udiff": ("Unified diffs", ""),
        "difffolk": ("Baseline + blind, no hands, $2k tip, etc", "////"),
        "udifffolk": ("Unified diffs + blind, no hands, $2k tip, etc", "////"),
    }
    # One colour per bar within a format, i.e. per model row.
    color = [
        "#b3e6a8",
        "#b3e6a8",
        "#b3d1e6",
    ]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # One row per model, one column per edit format.
    table = df.groupby(["model", "edit_format"])["pass_rate_1"].mean().unstack()

    # Swap the values of the first two rows. This moves the data only; the
    # index labels stay where they were, which is why the tick labels below
    # are spelled out by hand instead of being derived from the index.
    first_row = table.iloc[0].copy()
    table.iloc[0] = table.iloc[1]
    table.iloc[1] = first_row
    dump(table)

    num_models, num_formats = table.shape
    pos = np.arange(num_models)
    width = 0.8 / num_formats

    formats = table.columns
    models = table.index

    dump(table)
    dump(models)
    dump(formats)

    for i, fmt in enumerate(formats):
        # Unknown formats fall back to their own name and no hatch, rather than
        # raising NameError or reusing the previous format's label.
        label, hatch = format_styles.get(fmt, (fmt, ""))

        rects = ax.bar(
            pos + i * width,
            table[fmt],
            width * 0.95,
            color=color,
            hatch=hatch,
            zorder=2,
            edgecolor="#ffffff",
            linewidth=1.5,
            label=label,
        )
        ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in table[fmt]], size=6)

    # Ticks sit on the first bar of each group.
    ax.set_xticks(pos)

    # NOTE(review): these labels are hard-coded in the post-swap row order and
    # assume exactly these three models — confirm before reusing with other data.
    model_labels = [
        "gpt-4-\n1106-preview",
        "gpt-4-\n0125-preview",
        "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
    ]
    ax.set_xticklabels(model_labels, rotation=0)

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title('Refactoring "Laziness" Benchmark')
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)