graph layout

This commit is contained in:
Paul Gauthier 2023-07-01 17:18:14 -07:00
parent 7db384fc63
commit 8ef166478a
3 changed files with 331 additions and 313 deletions

File diff suppressed because it is too large Load diff

Before

Width:  |  Height:  |  Size: 75 KiB

After

Width:  |  Height:  |  Size: 74 KiB

Before After
Before After

View file

@@ -54,6 +54,11 @@ def show_stats(dirnames):
if row.edit_format == "diff-func-string":
row.edit_format = "diff-func"
pieces = row.model.split("-")
row.model = "-".join(pieces[:3])
if pieces[3:]:
row.model += "\n-" + "-".join(pieces[3:])
if row.completed_tests < 133:
print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
@@ -83,7 +88,7 @@ def show_stats(dirnames):
rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 22})
fig, ax = plt.subplots(figsize=(12, 10))
fig, ax = plt.subplots(figsize=(12, 8))
ax.grid(axis="y", zorder=0, lw=0.2)
zorder = 1
@@ -121,7 +126,7 @@ def show_stats(dirnames):
**edge,
)
if zorder == 2:
ax.bar_label(rects, padding=8, labels=[f"{v:.0f}%" for v in df[fmt]], size=11)
ax.bar_label(rects, padding=8, labels=[f"{v:.0f}%" for v in df[fmt]], size=14)
if repeats:
repeats = pd.DataFrame.from_records(repeats)
@@ -133,12 +138,12 @@ def show_stats(dirnames):
ax.errorbar(1.4, 44, yerr=[[lo], [hi]], fmt="none", zorder=5, capsize=5)
ax.set_xticks([p + 1.5 * width for p in pos])
ax.set_xticklabels(models, rotation=45)
ax.set_xticklabels(models)
ax.annotate(
"First coding\nattempt",
xy=(2.9, 50),
xytext=(2.25, 88),
xy=(2.9, 51),
xytext=(2.25, 85),
horizontalalignment="center",
arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
)

View file

@@ -235,8 +235,10 @@ The benchmark results have me fairly convinced that the new
`gpt-3.5-turbo-0613` and `gpt-3.5-turbo-16k-0613` models
are a bit worse at code editing than
the older `gpt-3.5-turbo-0301` model.
This is especially visible if you look at just the "first coding attempt"
This is especially visible in the "first coding attempt"
portion of each result, before GPT gets a second chance to edit the code.
Look at the horizontal white line in the middle of the first three blue bars.
Performance with the `whole` edit format was 46% for the
February model and only 39% for the June models.
I saw other signs of this degraded performance