refactored plots

This commit is contained in:
Paul Gauthier 2024-05-06 10:43:39 -07:00
parent 3bb237bdc1
commit 5fb7a323ec
2 changed files with 418 additions and 415 deletions

View file

@ -16,12 +16,10 @@ from typing import List
import git
import lox
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import prompts
import typer
from imgcat import imgcat
from plots import plot_refactoring
from rich.console import Console
from aider import models
@ -130,418 +128,6 @@ def show_stats(dirnames, graphs):
plot_refactoring(df)
def plot_timing(df):
    """Plot the average duration of each (model, edit_format) as grouped bars.

    The mean of ``avg_duration`` is computed for every (``model``,
    ``edit_format``) pair found in ``df``.  Each model gets one group of bars,
    one bar per edit format.  The figure is saved to ``tmp_timing.svg`` and
    then passed to ``imgcat``.

    Args:
        df: DataFrame providing ``model``, ``edit_format`` and
            ``avg_duration`` columns.
    """
    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Rows are models, columns are edit formats.
    grouped = df.groupby(["model", "edit_format"])["avg_duration"].mean().unstack()
    num_models, num_formats = grouped.shape

    pos = np.arange(num_models)
    width = 0.8 / num_formats

    for i, fmt in enumerate(grouped.columns):
        rects = ax.bar(
            pos + i * width,
            grouped[fmt],
            width * 0.95,
            label=fmt,
            color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
            hatch="////" if "func" in fmt else "",
            zorder=2,  # above the grid, which is drawn at zorder 0
            edgecolor="#ffffff",
            linewidth=1.5,
        )
        ax.bar_label(rects, padding=4, labels=[f"{v:.1f}s" for v in grouped[fmt]], size=6)

    # Centre each tick under its whole group of bars.  For two formats this
    # is pos + 0.5 * width; the general form also works for other counts.
    ax.set_xticks(pos + (num_formats - 1) * width / 2)
    ax.set_xticklabels(grouped.index)

    ax.set_ylabel("Average GPT response time\nper exercise (sec)")
    ax.set_title("GPT Code Editing Speed\n(time per coding task)")
    ax.legend(
        title="Edit Format",
        loc="upper left",
    )
    # Leave 10% headroom above the tallest bar.  The pandas reductions skip
    # NaN, unlike the builtin max() applied to a Series.
    ax.set_ylim(top=grouped.max().max() * 1.1)

    plt.tight_layout()
    plt.savefig("tmp_timing.svg")
    imgcat(fig)
def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
    """Plot first- and second-attempt pass rates per (model, edit_format).

    The mean ``pass_rate_2`` of each (``model``, ``edit_format``) group is
    drawn first, with legend entries and percentage labels.  The mean
    ``pass_rate_1`` is then drawn over it at a higher zorder.  The figure is
    saved to ``tmp.svg`` and passed to ``imgcat``.

    Args:
        df: DataFrame providing ``model``, ``edit_format``, ``pass_rate_1``
            and ``pass_rate_2`` columns.
        repeats: Sized collection; a single error bar is drawn only when it
            is non-empty.
        repeat_hi: Value used as the upper ``yerr`` entry of the error bar.
        repeat_lo: Value used as the lower ``yerr`` entry of the error bar.
        repeat_avg: Y coordinate at which the error bar is drawn.
    """
    group_keys = ["model", "edit_format"]
    # Back layer first (second attempt), then the layer drawn on top of it.
    layers = [
        df.groupby(group_keys)[column].mean().unstack()
        for column in ("pass_rate_2", "pass_rate_1")
    ]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    bar_edge = {"edgecolor": "#ffffff", "linewidth": 1.5}

    for zorder, rates in enumerate(layers, start=2):
        # Only the first (back) layer contributes legend entries and labels.
        is_back_layer = zorder == 2

        num_models, num_formats = rates.shape
        pos = np.arange(num_models)
        width = 0.8 / num_formats
        models = rates.index

        for i, fmt in enumerate(rates.columns):
            bar_kwargs = dict(bar_edge)
            if is_back_layer:
                bar_kwargs["label"] = fmt

            rects = ax.bar(
                pos + i * width,
                rates[fmt],
                width * 0.95,
                color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
                hatch="////" if "func" in fmt else "",
                zorder=zorder,
                **bar_kwargs,
            )
            if is_back_layer:
                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in rates[fmt]], size=6)

    if len(repeats):
        ax.errorbar(
            1.4,
            repeat_avg,
            yerr=[[repeat_lo], [repeat_hi]],
            fmt="none",
            zorder=5,
            capsize=2.5,
            elinewidth=1,
            markeredgewidth=1,
        )

    # Centre each tick under its whole group of bars.  For two formats this
    # is pos + 0.5 * width; the general form also works for other counts.
    ax.set_xticks(pos + (num_formats - 1) * width / 2)

    model_labels = []
    for model in models:
        pieces = model.split("-")
        # Break long names after the second dash-separated piece.  Shorter
        # names are left alone so they do not end in a dangling "-\n".
        if len(pieces) > 2:
            model = "-".join(pieces[:2]) + "-\n" + "-".join(pieces[2:])
        model_labels.append(model)
    ax.set_xticklabels(model_labels)

    # NOTE(review): the arrow targets below are fixed data coordinates and
    # presumably match one particular result set -- confirm before reuse.
    top = 95
    ax.annotate(
        "First attempt,\nbased on\nnatural language\ninstructions",
        xy=(2.20, 41),
        xytext=(2, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )
    ax.annotate(
        "Second attempt,\nincluding unit test\nerror output",
        xy=(2.55, 56),
        xytext=(3.5, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title("GPT Code Editing Skill\n(percent coding tasks correct)")
    ax.legend(
        title="Edit Format",
        loc="upper left",
    )
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
def plot_outcomes_claude(df):
    """Plot per-model first- and second-attempt pass rates as layered bars.

    One bar per row of ``df`` is drawn for ``pass_rate_2`` (with percentage
    labels), then one bar per row for ``pass_rate_1`` is drawn over it.  The
    frame and each plotted sub-frame are printed, and the figure is saved to
    ``tmp.svg`` and passed to ``imgcat``.

    Args:
        df: DataFrame providing ``model``, ``pass_rate_1`` and
            ``pass_rate_2`` columns.  It is not modified.
    """
    print(df)

    # Fix wrong column label.  Done on a copy so the caller's frame is left
    # untouched.
    frame = df.assign(model=df["model"].replace("gpt-4-0314", "gpt-4-0613"))

    layers = [
        frame[["model", "pass_rate_2"]],
        frame[["model", "pass_rate_1"]],
    ]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Per-bar colours, by row position.
    bar_colors = [
        "#b3e6a8",
        "#b3e6a8",
        "#b3e6a8",
        "#b3d1e6",
    ]
    bar_edge = {"edgecolor": "#ffffff", "linewidth": 1.5}
    width = 0.6

    for layer_index, layer in enumerate(layers):
        is_first_layer = layer_index == 0
        print(layer)

        pos = np.arange(len(layer))

        bar_kwargs = dict(bar_edge)
        if is_first_layer:
            # Placeholder legend label; this function draws no legend.
            bar_kwargs["label"] = "??"

        rects = ax.bar(
            pos + 0.5 * width,
            layer.iloc[:, 1],
            width * 0.95,
            color=bar_colors,
            **bar_kwargs,
        )
        if is_first_layer:
            ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in layer.iloc[:, 1]], size=6)

    ax.set_xticks(pos + 0.5 * width)

    # Names with a known multi-line form are wrapped; others are used as-is.
    model_map = {
        "gpt-4-0613": "gpt-4-\n0613",
        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
    }
    model_labels = [model_map.get(model, model) for model in layer.iloc[:, 0]]
    ax.set_xticklabels(model_labels, rotation=0)

    # NOTE(review): the arrow targets below are fixed data coordinates and
    # presumably match one particular result set -- confirm before reuse.
    top = 95
    ax.annotate(
        "First attempt,\nbased on\nnatural language\ninstructions",
        xy=(1.0, 53),
        xytext=(0.75, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )
    ax.annotate(
        "Second attempt,\nincluding unit test\nerror output",
        xy=(1.55, 65),
        xytext=(1.9, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title("Code Editing Skill")
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
def plot_refactoring(df):
    """Plot the mean first-attempt pass rate per (model, edit_format).

    The mean ``pass_rate_1`` of each (``model``, ``edit_format``) group is
    drawn as bars with percentage labels.  The first two models are shown in
    swapped order.  The figure is saved to ``tmp.svg`` and passed to
    ``imgcat``; intermediate frames are passed to ``dump``.

    Args:
        df: DataFrame providing ``model``, ``edit_format`` and
            ``pass_rate_1`` columns.
    """
    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Rows are models, columns are edit formats.
    rates = df.groupby(["model", "edit_format"])["pass_rate_1"].mean().unstack()

    # Show the second model before the first.  Whole rows are reordered so
    # each index label stays attached to its own data.
    if len(rates.index) > 1:
        order = list(range(len(rates.index)))
        order[0], order[1] = order[1], order[0]
        rates = rates.iloc[order]
    dump(rates)

    num_models, num_formats = rates.shape
    pos = np.arange(num_models)
    width = 0.8 / num_formats

    formats = rates.columns
    models = rates.index

    dump(rates)
    dump(models)
    dump(formats)

    # edit format -> (legend label, hatch pattern)
    format_styles = {
        "diff": ("Search/replace blocks", ""),
        "udiff": ("Unified diffs", ""),
        "difffolk": ("Baseline + blind, no hands, $2k tip, etc", "////"),
        "udifffolk": ("Unified diffs + blind, no hands, $2k tip, etc", "////"),
    }
    # Per-bar colours, by model position.
    bar_colors = [
        "#b3e6a8",
        "#b3e6a8",
        "#b3d1e6",
    ]

    for i, fmt in enumerate(formats):
        # Unknown formats fall back to their own name and no hatch.
        label, hatch = format_styles.get(fmt, (fmt, ""))
        rects = ax.bar(
            pos + i * width,
            rates[fmt],
            width * 0.95,
            color=bar_colors,
            hatch=hatch,
            zorder=2,  # above the grid, which is drawn at zorder 0
            edgecolor="#ffffff",
            linewidth=1.5,
            label=label,
        )
        ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in rates[fmt]], size=6)

    # NOTE(review): ticks sit on the first format's bar, not the group
    # centre -- presumably intended for the data this was written for; confirm.
    ax.set_xticks(pos)

    # Names with a known multi-line form are wrapped; others are used as-is.
    model_map = {
        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
    }
    model_labels = [model_map.get(model, model) for model in models]
    ax.set_xticklabels(model_labels, rotation=0)

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title('Refactoring "Laziness" Benchmark')
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
def resolve_dirname(dirname, use_single_prior, make_new):
if len(dirname.parts) > 1:
return dirname

417
benchmark/plots.py Normal file
View file

@ -0,0 +1,417 @@
import matplotlib.pyplot as plt
import numpy as np
from imgcat import imgcat
from aider.dump import dump # noqa: F401
def plot_timing(df):
    """Plot the average duration of each (model, edit_format) as grouped bars.

    The mean of ``avg_duration`` is computed for every (``model``,
    ``edit_format``) pair found in ``df``.  Each model gets one group of bars,
    one bar per edit format.  The figure is saved to ``tmp_timing.svg`` and
    then passed to ``imgcat``.

    Args:
        df: DataFrame providing ``model``, ``edit_format`` and
            ``avg_duration`` columns.
    """
    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Rows are models, columns are edit formats.
    grouped = df.groupby(["model", "edit_format"])["avg_duration"].mean().unstack()
    num_models, num_formats = grouped.shape

    pos = np.arange(num_models)
    width = 0.8 / num_formats

    for i, fmt in enumerate(grouped.columns):
        rects = ax.bar(
            pos + i * width,
            grouped[fmt],
            width * 0.95,
            label=fmt,
            color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
            hatch="////" if "func" in fmt else "",
            zorder=2,  # above the grid, which is drawn at zorder 0
            edgecolor="#ffffff",
            linewidth=1.5,
        )
        ax.bar_label(rects, padding=4, labels=[f"{v:.1f}s" for v in grouped[fmt]], size=6)

    # Centre each tick under its whole group of bars.  For two formats this
    # is pos + 0.5 * width; the general form also works for other counts.
    ax.set_xticks(pos + (num_formats - 1) * width / 2)
    ax.set_xticklabels(grouped.index)

    ax.set_ylabel("Average GPT response time\nper exercise (sec)")
    ax.set_title("GPT Code Editing Speed\n(time per coding task)")
    ax.legend(
        title="Edit Format",
        loc="upper left",
    )
    # Leave 10% headroom above the tallest bar.  The pandas reductions skip
    # NaN, unlike the builtin max() applied to a Series.
    ax.set_ylim(top=grouped.max().max() * 1.1)

    plt.tight_layout()
    plt.savefig("tmp_timing.svg")
    imgcat(fig)
def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
    """Plot first- and second-attempt pass rates per (model, edit_format).

    The mean ``pass_rate_2`` of each (``model``, ``edit_format``) group is
    drawn first, with legend entries and percentage labels.  The mean
    ``pass_rate_1`` is then drawn over it at a higher zorder.  The figure is
    saved to ``tmp.svg`` and passed to ``imgcat``.

    Args:
        df: DataFrame providing ``model``, ``edit_format``, ``pass_rate_1``
            and ``pass_rate_2`` columns.
        repeats: Sized collection; a single error bar is drawn only when it
            is non-empty.
        repeat_hi: Value used as the upper ``yerr`` entry of the error bar.
        repeat_lo: Value used as the lower ``yerr`` entry of the error bar.
        repeat_avg: Y coordinate at which the error bar is drawn.
    """
    group_keys = ["model", "edit_format"]
    # Back layer first (second attempt), then the layer drawn on top of it.
    layers = [
        df.groupby(group_keys)[column].mean().unstack()
        for column in ("pass_rate_2", "pass_rate_1")
    ]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    bar_edge = {"edgecolor": "#ffffff", "linewidth": 1.5}

    for zorder, rates in enumerate(layers, start=2):
        # Only the first (back) layer contributes legend entries and labels.
        is_back_layer = zorder == 2

        num_models, num_formats = rates.shape
        pos = np.arange(num_models)
        width = 0.8 / num_formats
        models = rates.index

        for i, fmt in enumerate(rates.columns):
            bar_kwargs = dict(bar_edge)
            if is_back_layer:
                bar_kwargs["label"] = fmt

            rects = ax.bar(
                pos + i * width,
                rates[fmt],
                width * 0.95,
                color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
                hatch="////" if "func" in fmt else "",
                zorder=zorder,
                **bar_kwargs,
            )
            if is_back_layer:
                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in rates[fmt]], size=6)

    if len(repeats):
        ax.errorbar(
            1.4,
            repeat_avg,
            yerr=[[repeat_lo], [repeat_hi]],
            fmt="none",
            zorder=5,
            capsize=2.5,
            elinewidth=1,
            markeredgewidth=1,
        )

    # Centre each tick under its whole group of bars.  For two formats this
    # is pos + 0.5 * width; the general form also works for other counts.
    ax.set_xticks(pos + (num_formats - 1) * width / 2)

    model_labels = []
    for model in models:
        pieces = model.split("-")
        # Break long names after the second dash-separated piece.  Shorter
        # names are left alone so they do not end in a dangling "-\n".
        if len(pieces) > 2:
            model = "-".join(pieces[:2]) + "-\n" + "-".join(pieces[2:])
        model_labels.append(model)
    ax.set_xticklabels(model_labels)

    # NOTE(review): the arrow targets below are fixed data coordinates and
    # presumably match one particular result set -- confirm before reuse.
    top = 95
    ax.annotate(
        "First attempt,\nbased on\nnatural language\ninstructions",
        xy=(2.20, 41),
        xytext=(2, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )
    ax.annotate(
        "Second attempt,\nincluding unit test\nerror output",
        xy=(2.55, 56),
        xytext=(3.5, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title("GPT Code Editing Skill\n(percent coding tasks correct)")
    ax.legend(
        title="Edit Format",
        loc="upper left",
    )
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
def plot_outcomes_claude(df):
    """Plot per-model first- and second-attempt pass rates as layered bars.

    One bar per row of ``df`` is drawn for ``pass_rate_2`` (with percentage
    labels), then one bar per row for ``pass_rate_1`` is drawn over it.  The
    frame and each plotted sub-frame are printed, and the figure is saved to
    ``tmp.svg`` and passed to ``imgcat``.

    Args:
        df: DataFrame providing ``model``, ``pass_rate_1`` and
            ``pass_rate_2`` columns.  It is not modified.
    """
    print(df)

    # Fix wrong column label.  Done on a copy so the caller's frame is left
    # untouched.
    frame = df.assign(model=df["model"].replace("gpt-4-0314", "gpt-4-0613"))

    layers = [
        frame[["model", "pass_rate_2"]],
        frame[["model", "pass_rate_1"]],
    ]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Per-bar colours, by row position.
    bar_colors = [
        "#b3e6a8",
        "#b3e6a8",
        "#b3e6a8",
        "#b3d1e6",
    ]
    bar_edge = {"edgecolor": "#ffffff", "linewidth": 1.5}
    width = 0.6

    for layer_index, layer in enumerate(layers):
        is_first_layer = layer_index == 0
        print(layer)

        pos = np.arange(len(layer))

        bar_kwargs = dict(bar_edge)
        if is_first_layer:
            # Placeholder legend label; this function draws no legend.
            bar_kwargs["label"] = "??"

        rects = ax.bar(
            pos + 0.5 * width,
            layer.iloc[:, 1],
            width * 0.95,
            color=bar_colors,
            **bar_kwargs,
        )
        if is_first_layer:
            ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in layer.iloc[:, 1]], size=6)

    ax.set_xticks(pos + 0.5 * width)

    # Names with a known multi-line form are wrapped; others are used as-is.
    model_map = {
        "gpt-4-0613": "gpt-4-\n0613",
        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
    }
    model_labels = [model_map.get(model, model) for model in layer.iloc[:, 0]]
    ax.set_xticklabels(model_labels, rotation=0)

    # NOTE(review): the arrow targets below are fixed data coordinates and
    # presumably match one particular result set -- confirm before reuse.
    top = 95
    ax.annotate(
        "First attempt,\nbased on\nnatural language\ninstructions",
        xy=(1.0, 53),
        xytext=(0.75, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )
    ax.annotate(
        "Second attempt,\nincluding unit test\nerror output",
        xy=(1.55, 65),
        xytext=(1.9, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title("Code Editing Skill")
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)
def plot_refactoring(df):
    """Plot the mean first-attempt pass rate per (model, edit_format).

    The mean ``pass_rate_1`` of each (``model``, ``edit_format``) group is
    drawn as bars with percentage labels.  The first two models are shown in
    swapped order.  The figure is saved to ``tmp.svg`` and passed to
    ``imgcat``; intermediate frames are passed to ``dump``.

    Args:
        df: DataFrame providing ``model``, ``edit_format`` and
            ``pass_rate_1`` columns.
    """
    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    from matplotlib import rc

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Rows are models, columns are edit formats.
    rates = df.groupby(["model", "edit_format"])["pass_rate_1"].mean().unstack()

    # Show the second model before the first.  Whole rows are reordered so
    # each index label stays attached to its own data.
    if len(rates.index) > 1:
        order = list(range(len(rates.index)))
        order[0], order[1] = order[1], order[0]
        rates = rates.iloc[order]
    dump(rates)

    num_models, num_formats = rates.shape
    pos = np.arange(num_models)
    width = 0.8 / num_formats

    formats = rates.columns
    models = rates.index

    dump(rates)
    dump(models)
    dump(formats)

    # edit format -> (legend label, hatch pattern)
    format_styles = {
        "diff": ("Search/replace blocks", ""),
        "udiff": ("Unified diffs", ""),
        "difffolk": ("Baseline + blind, no hands, $2k tip, etc", "////"),
        "udifffolk": ("Unified diffs + blind, no hands, $2k tip, etc", "////"),
    }
    # Per-bar colours, by model position.
    bar_colors = [
        "#b3e6a8",
        "#b3e6a8",
        "#b3d1e6",
    ]

    for i, fmt in enumerate(formats):
        # Unknown formats fall back to their own name and no hatch.
        label, hatch = format_styles.get(fmt, (fmt, ""))
        rects = ax.bar(
            pos + i * width,
            rates[fmt],
            width * 0.95,
            color=bar_colors,
            hatch=hatch,
            zorder=2,  # above the grid, which is drawn at zorder 0
            edgecolor="#ffffff",
            linewidth=1.5,
            label=label,
        )
        ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in rates[fmt]], size=6)

    # NOTE(review): ticks sit on the first format's bar, not the group
    # centre -- presumably intended for the data this was written for; confirm.
    ax.set_xticks(pos)

    # Names with a known multi-line form are wrapped; others are used as-is.
    model_map = {
        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
    }
    model_labels = [model_map.get(model, model) for model in models]
    ax.set_xticklabels(model_labels, rotation=0)

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title('Refactoring "Laziness" Benchmark')
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)