format output as yaml

2025-05-28 16:25:00 +00:00 · 2024-05-06 11:15:19 -07:00 · 2024-05-06 11:15:19 -07:00 · a7b08c7354
commit a7b08c7354
parent 3162d42262
1 changed files with 59 additions and 35 deletions
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -34,6 +34,9 @@ EXERCISES_DIR_DEFAULT = "exercism-python"
 app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)


+NUM_TESTS = (89, 133)
+
+
 def show_stats(dirnames, graphs):
    raw_rows = []
    for dirname in dirnames:
@ -48,7 +51,7 @@ def show_stats(dirnames, graphs):
        if not row:
            continue

-        if row.completed_tests not in (89, 133):
+        if row.completed_tests not in NUM_TESTS:
            print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")

        kind = (row.model, row.edit_format)
@ -356,7 +359,25 @@ def summarize_results(dirname):
    console = Console(highlight=False)
    console.rule(title=str(dirname))

-    console.print(f"test-cases: {res.completed_tests}")
+    commit_hashes = variants["commit_hash"]
+    versions = get_versions(commit_hashes)
+    date = dirname.name[:10]
+
+    def show(stat, red="red"):
+        val = getattr(res, stat)
+        style = red if val else None
+        console.print(f"  {stat}: {val}", style=style)
+
+    percents = dict()
+    for i in range(tries):
+        pass_rate = 100 * passed_tests[i] / res.completed_tests
+        percents[i] = pass_rate
+        # console.print(f"{pass_rate:.1f}% correct after try {i+1}")
+        setattr(res, f"pass_rate_{i+1}", f"{pass_rate:.1f}")
+
+    print(f"- dirname: {dirname.name}")
+    style = None if res.completed_tests in NUM_TESTS else "red"
+    console.print(f"  test_cases: {res.completed_tests}", style=style)
    for key, val in variants.items():
        if len(val) > 1:
            style = "red"
@ -364,42 +385,41 @@ def summarize_results(dirname):
            style = None
        val = ", ".join(map(str, val))
        setattr(res, key, val)
-        console.print(f"{key}: {val}", style=style)
+        console.print(f"  {key}: {val}", style=style)

-    def show(stat):
-        val = getattr(res, stat)
-        style = "red" if val else None
-        console.print(f"{stat}: {val}", style=style)
+    for i in range(tries):
+        print(f"  pass_rate_{i+1}: {percents[i]:.1f}")
+
+    pct_well_formed = 1.0 - res.num_malformed_responses / res.completed_tests
+    print(f"  percent_cases_well_formed: {pct_well_formed*100:.1f}")

-    console.print()
    show("error_outputs")
+    show("num_malformed_responses")
    show("user_asks")
    show("lazy_comments")
-    show("num_malformed_responses")
    show("syntax_errors")
    show("indentation_errors")
-    console.print()
    show("exhausted_context_windows")
    show("test_timeouts")

-    console.print()
-    percents = dict()
-    for i in range(tries):
-        pass_rate = 100 * passed_tests[i] / res.completed_tests
-        percents[i] = pass_rate
-        console.print(f"{pass_rate:.1f}% correct after try {i}")
-        setattr(res, f"pass_rate_{i+1}", pass_rate)
+    a_model = set(variants["model"]).pop()
+    command = f"aider --model {a_model}"
+    print(f"  command: {command}")
+
+    print(f"  date: {date}")
+    print("  versions:", ",".join(versions))

-    console.print()
    res.avg_duration = res.duration / res.completed_tests
+    print(f"  seconds_per_case: {res.avg_duration:.1f}")

-    console.print(f"duration: {res.avg_duration:.1f} sec/test-case")
+    print(f"  total_cost: {res.cost:.4f}")

    res.avg_cost = res.cost / res.completed_tests

    projected_cost = res.avg_cost * res.total_tests

-    console.print(
+    print()
+    print(
        f"costs: ${res.avg_cost:.4f}/test-case, ${res.cost:.2f} total,"
        f" ${projected_cost:.2f} projected"
    )
@ -413,21 +433,8 @@ def summarize_results(dirname):
    csv.append(f"{first:.1f}")

    csv.append(" ".join(variants["edit_format"]))
-    model = variants["model"].pop()
-    csv.append(f"aider --model {model}")
-    versions = set()
-    for hsh in variants["commit_hash"]:
-        if not hsh:
-            continue
-        hsh = hsh.split("-")[0]
-        try:
-            version = subprocess.check_output(
-                ["git", "show", f"{hsh}:aider/__init__.py"], universal_newlines=True
-            )
-            version = re.search(r'__version__ = "(.*)"', version).group(1)
-            versions.add(version)
-        except subprocess.CalledProcessError:
-            pass
+    csv.append(command)
+
    csv.append(" ".join(sorted(versions)))
    commit_hashes = variants.get("commit_hash", [])
    if all(commit_hashes):
@ -445,6 +452,23 @@ def summarize_results(dirname):
    return res


+def get_versions(commit_hashes):
+    versions = set()
+    for hsh in commit_hashes:
+        if not hsh:
+            continue
+        hsh = hsh.split("-")[0]
+        try:
+            version = subprocess.check_output(
+                ["git", "show", f"{hsh}:aider/__init__.py"], universal_newlines=True
+            )
+            version = re.search(r'__version__ = "(.*)"', version).group(1)
+            versions.add(version)
+        except subprocess.CalledProcessError:
+            pass
+    return versions
+
+
 def get_replayed_content(replay_dname, test_dname):
    replay_dname = Path(replay_dname)
    test_dname = Path(test_dname)