format output as yaml

2025-06-01 02:05:00 +00:00 · 2024-05-06 11:15:19 -07:00 · 2024-05-06 11:15:19 -07:00 · a7b08c7354
commit a7b08c7354
parent 3162d42262
1 changed files with 59 additions and 35 deletions
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -34,6 +34,9 @@ EXERCISES_DIR_DEFAULT = "exercism-python"
 app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
 NUM_TESTS = (89, 133)
 def show_stats(dirnames, graphs):
    raw_rows = []
    for dirname in dirnames:
@ -48,7 +51,7 @@ def show_stats(dirnames, graphs):
        if not row:
            continue
-        if row.completed_tests not in (89, 133):
+        if row.completed_tests not in NUM_TESTS:
            print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
        kind = (row.model, row.edit_format)
@ -356,7 +359,25 @@ def summarize_results(dirname):
    console = Console(highlight=False)
    console.rule(title=str(dirname))
-    console.print(f"test-cases: {res.completed_tests}")
+    commit_hashes = variants["commit_hash"]
    versions = get_versions(commit_hashes)
    date = dirname.name[:10]
    def show(stat, red="red"):
        val = getattr(res, stat)
        style = red if val else None
        console.print(f"  {stat}: {val}", style=style)
    percents = dict()
    for i in range(tries):
        pass_rate = 100 * passed_tests[i] / res.completed_tests
        percents[i] = pass_rate
        # console.print(f"{pass_rate:.1f}% correct after try {i+1}")
        setattr(res, f"pass_rate_{i+1}", f"{pass_rate:.1f}")
    print(f"- dirname: {dirname.name}")
    style = None if res.completed_tests in NUM_TESTS else "red"
    console.print(f"  test_cases: {res.completed_tests}", style=style)
    for key, val in variants.items():
        if len(val) > 1:
            style = "red"
@ -364,42 +385,41 @@ def summarize_results(dirname):
            style = None
        val = ", ".join(map(str, val))
        setattr(res, key, val)
-        console.print(f"{key}: {val}", style=style)
+        console.print(f"  {key}: {val}", style=style)
-    def show(stat):
+    for i in range(tries):
-        val = getattr(res, stat)
+        print(f"  pass_rate_{i+1}: {percents[i]:.1f}")
-        style = "red" if val else None
+
-        console.print(f"{stat}: {val}", style=style)
+    pct_well_formed = 1.0 - res.num_malformed_responses / res.completed_tests
    print(f"  percent_cases_well_formed: {pct_well_formed*100:.1f}")
    console.print()
    show("error_outputs")
    show("num_malformed_responses")
    show("user_asks")
    show("lazy_comments")
    show("num_malformed_responses")
    show("syntax_errors")
    show("indentation_errors")
    console.print()
    show("exhausted_context_windows")
    show("test_timeouts")
-    console.print()
+    a_model = set(variants["model"]).pop()
-    percents = dict()
+    command = f"aider --model {a_model}"
-    for i in range(tries):
+    print(f"  command: {command}")
-        pass_rate = 100 * passed_tests[i] / res.completed_tests
+
-        percents[i] = pass_rate
+    print(f"  date: {date}")
-        console.print(f"{pass_rate:.1f}% correct after try {i}")
+    print("  versions:", ",".join(versions))
        setattr(res, f"pass_rate_{i+1}", pass_rate)
    console.print()
    res.avg_duration = res.duration / res.completed_tests
    print(f"  seconds_per_case: {res.avg_duration:.1f}")
-    console.print(f"duration: {res.avg_duration:.1f} sec/test-case")
+    print(f"  total_cost: {res.cost:.4f}")
    res.avg_cost = res.cost / res.completed_tests
    projected_cost = res.avg_cost * res.total_tests
-    console.print(
+    print()
    print(
        f"costs: ${res.avg_cost:.4f}/test-case, ${res.cost:.2f} total,"
        f" ${projected_cost:.2f} projected"
    )
@ -413,21 +433,8 @@ def summarize_results(dirname):
    csv.append(f"{first:.1f}")
    csv.append(" ".join(variants["edit_format"]))
-    model = variants["model"].pop()
+    csv.append(command)
-    csv.append(f"aider --model {model}")
+
    versions = set()
    for hsh in variants["commit_hash"]:
        if not hsh:
            continue
        hsh = hsh.split("-")[0]
        try:
            version = subprocess.check_output(
                ["git", "show", f"{hsh}:aider/__init__.py"], universal_newlines=True
            )
            version = re.search(r'__version__ = "(.*)"', version).group(1)
            versions.add(version)
        except subprocess.CalledProcessError:
            pass
    csv.append(" ".join(sorted(versions)))
    commit_hashes = variants.get("commit_hash", [])
    if all(commit_hashes):
@ -445,6 +452,23 @@ def summarize_results(dirname):
    return res
 def get_versions(commit_hashes):
    versions = set()
    for hsh in commit_hashes:
        if not hsh:
            continue
        hsh = hsh.split("-")[0]
        try:
            version = subprocess.check_output(
                ["git", "show", f"{hsh}:aider/__init__.py"], universal_newlines=True
            )
            version = re.search(r'__version__ = "(.*)"', version).group(1)
            versions.add(version)
        except subprocess.CalledProcessError:
            pass
    return versions
 def get_replayed_content(replay_dname, test_dname):
    replay_dname = Path(replay_dname)
    test_dname = Path(test_dname)