better graph

This commit is contained in:
Paul Gauthier 2023-12-18 10:02:44 -08:00
parent 7824faed64
commit 16534e914b
2 changed files with 580 additions and 590 deletions

File diff suppressed because it is too large. Load diff

Before

Width:  |  Height:  |  Size: 44 KiB

After

Width:  |  Height:  |  Size: 45 KiB

Before After
Before After

View file

@@ -69,19 +69,18 @@ def show_stats(dirnames, graphs):
# remember this row, so we can update it with the repeat_avg # remember this row, so we can update it with the repeat_avg
repeat_row = len(rows) repeat_row = len(rows)
gpt35 = "gpt-3.5-turbo" # gpt35 = "gpt-3.5-turbo"
gpt4 = "gpt-4" # gpt4 = "gpt-4"
# if row.model.startswith(gpt35):
if row.model.startswith(gpt35): # row.model = gpt35 + "\n" + row.model[len(gpt35) :]
row.model = gpt35 + "\n" + row.model[len(gpt35) :] # elif row.model.startswith(gpt4):
elif row.model.startswith(gpt4): # row.model = gpt4 + "\n" + row.model[len(gpt4) :]
row.model = gpt4 + "\n" + row.model[len(gpt4) :]
if "folk" in row.dir_name: if "folk" in row.dir_name:
row.edit_format = "folk" row.edit_format = "folk"
# if row.model == "gpt-4\n-1106-preview": if row.model == "gpt-4-0613":
# row.model += "\n(preliminary)" row.model += "\n(8k context window is\ntoo small for benchmark)"
if row.completed_tests < 133: if row.completed_tests < 133:
print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}") print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
@@ -322,7 +321,7 @@ def plot_refactoring(df):
color = "#b3d1e6" color = "#b3d1e6"
label = "Unified diffs" label = "Unified diffs"
elif fmt == "folk": elif fmt == "folk":
label = "Folk remedy prompt (blind, no hands, ...)" label = "Folk remedy prompt (blind, no hands, etc)"
color = "#b3e6a8" color = "#b3e6a8"
hatch = "////" hatch = "////"
@@ -349,14 +348,14 @@ def plot_refactoring(df):
if zorder == 2: if zorder == 2:
ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6) ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)
ax.set_xticks([p + 0.5 * width for p in pos]) ax.set_xticks([p + 1.0 * width for p in pos])
ax.set_xticklabels(models) ax.set_xticklabels(models)
ax.set_ylabel("Percent of exercises completed successfully") ax.set_ylabel("Percent of exercises completed successfully")
# ax.set_xlabel("Model") # ax.set_xlabel("Model")
ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)') ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
ax.legend( ax.legend(
title="Edit Format", # title="Edit Format",
loc="upper left", loc="upper left",
# bbox_to_anchor=(0.95, 0.95), # bbox_to_anchor=(0.95, 0.95),
) )