graph layout

This commit is contained in:
Paul Gauthier 2023-07-01 17:18:14 -07:00
parent 7db384fc63
commit 8ef166478a
3 changed files with 331 additions and 313 deletions

File diff suppressed because it is too large Load diff

Before

Width:  |  Height:  |  Size: 75 KiB

After

Width:  |  Height:  |  Size: 74 KiB

Before After
Before After

View file

@@ -54,6 +54,11 @@ def show_stats(dirnames):
if row.edit_format == "diff-func-string":
row.edit_format = "diff-func"
pieces = row.model.split("-")
row.model = "-".join(pieces[:3])
if pieces[3:]:
row.model += "\n-" + "-".join(pieces[3:])
if row.completed_tests < 133:
print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
@@ -83,7 +88,7 @@ def show_stats(dirnames):
rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 22})
fig, ax = plt.subplots(figsize=(12, 10))
fig, ax = plt.subplots(figsize=(12, 8))
ax.grid(axis="y", zorder=0, lw=0.2)
zorder = 1
@@ -121,7 +126,7 @@ def show_stats(dirnames):
**edge,
)
if zorder == 2:
ax.bar_label(rects, padding=8, labels=[f"{v:.0f}%" for v in df[fmt]], size=11)
ax.bar_label(rects, padding=8, labels=[f"{v:.0f}%" for v in df[fmt]], size=14)
if repeats:
repeats = pd.DataFrame.from_records(repeats)
@@ -133,12 +138,12 @@ def show_stats(dirnames):
ax.errorbar(1.4, 44, yerr=[[lo], [hi]], fmt="none", zorder=5, capsize=5)
ax.set_xticks([p + 1.5 * width for p in pos])
ax.set_xticklabels(models, rotation=45)
ax.set_xticklabels(models)
ax.annotate(
"First coding\nattempt",
xy=(2.9, 50),
xytext=(2.25, 88),
xy=(2.9, 51),
xytext=(2.25, 85),
horizontalalignment="center",
arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
)

View file

@@ -235,8 +235,10 @@ The benchmark results have me fairly convinced that the new
`gpt-3.5-turbo-0613` and `gpt-3.5-turbo-16k-0613` models
are a bit worse at code editing than
the older `gpt-3.5-turbo-0301` model.
This is especially visible if you look at just the "first coding attempt"
This is especially visible in the "first coding attempt"
portion of each result, before GPT gets a second chance to edit the code.
Look at the horizontal white line in the middle of the first three blue bars.
Performance with the `whole` edit format was 46% for the
February model and only 39% for the June models.
I saw other signs of this degraded performance