diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index d541fb991..4d111dbde 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -34,6 +34,9 @@ EXERCISES_DIR_DEFAULT = "exercism-python" app = typer.Typer(add_completion=False, pretty_exceptions_enable=False) +NUM_TESTS = (89, 133) + + def show_stats(dirnames, graphs): raw_rows = [] for dirname in dirnames: @@ -48,7 +51,7 @@ def show_stats(dirnames, graphs): if not row: continue - if row.completed_tests not in (89, 133): + if row.completed_tests not in NUM_TESTS: print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}") kind = (row.model, row.edit_format) @@ -356,7 +359,25 @@ def summarize_results(dirname): console = Console(highlight=False) console.rule(title=str(dirname)) - console.print(f"test-cases: {res.completed_tests}") + commit_hashes = variants["commit_hash"] + versions = get_versions(commit_hashes) + date = dirname.name[:10] + + def show(stat, red="red"): + val = getattr(res, stat) + style = red if val else None + console.print(f" {stat}: {val}", style=style) + + percents = dict() + for i in range(tries): + pass_rate = 100 * passed_tests[i] / res.completed_tests + percents[i] = pass_rate + # console.print(f"{pass_rate:.1f}% correct after try {i+1}") + setattr(res, f"pass_rate_{i+1}", f"{pass_rate:.1f}") + + print(f"- dirname: {dirname.name}") + style = None if res.completed_tests in NUM_TESTS else "red" + console.print(f" test_cases: {res.completed_tests}", style=style) for key, val in variants.items(): if len(val) > 1: style = "red" @@ -364,42 +385,41 @@ def summarize_results(dirname): style = None val = ", ".join(map(str, val)) setattr(res, key, val) - console.print(f"{key}: {val}", style=style) + console.print(f" {key}: {val}", style=style) - def show(stat): - val = getattr(res, stat) - style = "red" if val else None - console.print(f"{stat}: {val}", style=style) + for i in range(tries): + print(f" pass_rate_{i+1}: {percents[i]:.1f}") + + pct_well_formed = 1.0 - 
res.num_malformed_responses / res.completed_tests + print(f" percent_cases_well_formed: {pct_well_formed*100:.1f}") - console.print() show("error_outputs") + show("num_malformed_responses") show("user_asks") show("lazy_comments") - show("num_malformed_responses") show("syntax_errors") show("indentation_errors") - console.print() show("exhausted_context_windows") show("test_timeouts") - console.print() - percents = dict() - for i in range(tries): - pass_rate = 100 * passed_tests[i] / res.completed_tests - percents[i] = pass_rate - console.print(f"{pass_rate:.1f}% correct after try {i}") - setattr(res, f"pass_rate_{i+1}", pass_rate) + a_model = set(variants["model"]).pop() + command = f"aider --model {a_model}" + print(f" command: {command}") + + print(f" date: {date}") + print(" versions:", ",".join(versions)) - console.print() res.avg_duration = res.duration / res.completed_tests + print(f" seconds_per_case: {res.avg_duration:.1f}") - console.print(f"duration: {res.avg_duration:.1f} sec/test-case") + print(f" total_cost: {res.cost:.4f}") res.avg_cost = res.cost / res.completed_tests projected_cost = res.avg_cost * res.total_tests - console.print( + print() + print( f"costs: ${res.avg_cost:.4f}/test-case, ${res.cost:.2f} total," f" ${projected_cost:.2f} projected" ) @@ -413,21 +433,8 @@ def summarize_results(dirname): csv.append(f"{first:.1f}") csv.append(" ".join(variants["edit_format"])) - model = variants["model"].pop() - csv.append(f"aider --model {model}") - versions = set() - for hsh in variants["commit_hash"]: - if not hsh: - continue - hsh = hsh.split("-")[0] - try: - version = subprocess.check_output( - ["git", "show", f"{hsh}:aider/__init__.py"], universal_newlines=True - ) - version = re.search(r'__version__ = "(.*)"', version).group(1) - versions.add(version) - except subprocess.CalledProcessError: - pass + csv.append(command) + csv.append(" ".join(sorted(versions))) commit_hashes = variants.get("commit_hash", []) if all(commit_hashes): @@ -445,6 +452,23 
def get_versions(commit_hashes):
    """Return the set of aider versions found at the given commits.

    For each non-empty entry in *commit_hashes*, the contents of
    ``aider/__init__.py`` at that revision are read with ``git show`` and
    the value of its ``__version__ = "..."`` assignment is collected.

    The lookup is best-effort: entries that are empty, revisions git cannot
    show, a git executable that cannot be launched, and files without a
    ``__version__`` assignment are all skipped instead of raising.

    Args:
        commit_hashes: iterable of commit-hash strings; falsy entries are
            ignored.

    Returns:
        A set of version strings (possibly empty).
    """
    versions = set()
    for hsh in commit_hashes:
        if not hsh:
            continue

        # Only the part before the first "-" is used as the git revision.
        revision = hsh.split("-")[0]

        try:
            init_py = subprocess.check_output(
                ["git", "show", f"{revision}:aider/__init__.py"],
                universal_newlines=True,
            )
        except (subprocess.CalledProcessError, OSError):
            # Unknown revision / missing file, or git itself is unavailable.
            continue

        # re.search returns None when there is no match; guard before .group().
        match = re.search(r'__version__ = "(.*)"', init_py)
        if match:
            versions.add(match.group(1))

    return versions