diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 80a511b63..d32433b16 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -54,6 +54,14 @@ def show_stats(dirnames):
         if row.edit_format == "diff-func-string":
             row.edit_format = "diff-func"
 
+        if (
+            row.model == "gpt-3.5-turbo-0613"
+            and row.edit_format == "whole"
+            and "repeat" not in row.dir_name
+        ):
+            # remember this row, so we can update it with the repeat_avg
+            repeat_row = len(rows)
+
         pieces = row.model.split("-")
         row.model = "-".join(pieces[:3])
         if pieces[3:]:
@@ -71,9 +79,29 @@ def show_stats(dirnames):
             dump(row.dir_name)
             dump(seen[kind])
             return
+
         seen[kind] = row.dir_name
         rows.append(vars(row))
 
+    if repeats:
+        extra = rows[repeat_row]
+        dump(extra)
+        repeats.append(extra)
+        repeats = pd.DataFrame.from_records(repeats)
+        repeat_max = repeats["pass_rate_2"].max()
+        repeat_min = repeats["pass_rate_2"].min()
+        repeat_avg = repeats["pass_rate_2"].mean()
+
+        repeat_lo = repeat_avg - repeat_min
+        repeat_hi = repeat_max - repeat_avg
+
+        dump(repeat_max)
+        dump(repeat_min)
+        dump(repeat_avg)
+
+        # use the average in the main bar
+        rows[repeat_row]["pass_rate_2"] = repeat_avg
+
     df = pd.DataFrame.from_records(rows)
     df.sort_values(by=["model", "edit_format"], inplace=True)
 
@@ -128,14 +156,10 @@ def show_stats(dirnames):
             if zorder == 2:
                 ax.bar_label(rects, padding=8, labels=[f"{v:.0f}%" for v in df[fmt]], size=14)
 
-    if repeats:
-        repeats = pd.DataFrame.from_records(repeats)
-        repeat_max = repeats["pass_rate_2"].max()
-        repeat_min = repeats["pass_rate_2"].min()
-
-        lo = 44 - repeat_min
-        hi = repeat_max - 44
-        ax.errorbar(1.4, 44, yerr=[[lo], [hi]], fmt="none", zorder=5, capsize=5)
+    if len(repeats):
+        ax.errorbar(
+            1.4, repeat_avg, yerr=[[repeat_lo], [repeat_hi]], fmt="none", zorder=5, capsize=5
+        )
 
     ax.set_xticks([p + 1.5 * width for p in pos])
     ax.set_xticklabels(models)
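
Illustrative sketch (not part of the patch): the diff replaces a hard-coded pass rate of 44 with the mean of the repeated runs, and keeps asymmetric error bars spanning from the worst to the best repeat. The snippet below shows that errorbar pattern in isolation with invented pass_rate_2 values; only the column name and the yerr=[[lo], [hi]] form come from the patch itself.

import matplotlib.pyplot as plt
import pandas as pd

# Pretend results from several repeated runs of the same benchmark config
# (values are made up for illustration).
repeats = pd.DataFrame.from_records([{"pass_rate_2": v} for v in (42.0, 44.5, 47.0)])

repeat_avg = repeats["pass_rate_2"].mean()
repeat_lo = repeat_avg - repeats["pass_rate_2"].min()  # distance down to the worst run
repeat_hi = repeats["pass_rate_2"].max() - repeat_avg  # distance up to the best run

fig, ax = plt.subplots()
ax.bar([0], [repeat_avg], width=0.5)
# yerr takes [[lower], [upper]] for asymmetric error bars, as in the patch.
ax.errorbar(0, repeat_avg, yerr=[[repeat_lo], [repeat_hi]], fmt="none", zorder=5, capsize=5)
plt.show()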