diff --git a/scripts/benchmark.py b/scripts/benchmark.py index a1dddd205..22ece63ca 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -10,6 +10,7 @@ from collections import defaultdict from json.decoder import JSONDecodeError from pathlib import Path +import git import lox from rich.console import Console @@ -26,6 +27,11 @@ assert ORIGINAL_DNAME.exists() and ORIGINAL_DNAME.is_dir() def main(): + repo = git.Repo(search_parent_directories=True) + commit_hash = repo.head.object.hexsha[:7] + if repo.is_dirty(): + commit_hash += "-dirty" + parser = argparse.ArgumentParser(description="Aider Benchmark") parser.add_argument("dirname", type=str, help="Directory name") parser.add_argument("--model", "-m", type=str, help="Model name", default="gpt-3.5-turbo") @@ -117,6 +123,7 @@ def main(): args.no_test, args.verbose, args.stats_only, + commit_hash, ) all_results.append(results) @@ -133,6 +140,7 @@ def main(): args.no_test, args.verbose, args.stats_only, + commit_hash, ) all_results = run_test_threaded.gather(tqdm=True) @@ -172,9 +180,9 @@ def summarize_results(dirname, all_results, total_tests=None): total_cost += results["cost"] duration += results["duration"] - for key in "model edit_format".split(): - if key in results: - variants[key].add(results[key]) + for key in "model edit_format commit_hash".split(): + val = results.get(key) + variants[key].add(val) dump(completed_tests) if not completed_tests: @@ -189,7 +197,7 @@ def summarize_results(dirname, all_results, total_tests=None): style = "red" else: style = None - val = ", ".join(val) + val = ", ".join(map(str, val)) console.print(f"{key}: {val}", style=style) console.print() @@ -213,7 +221,7 @@ def summarize_results(dirname, all_results, total_tests=None): console.rule() -def run_test(testdir, model_name, edit_format, retries, no_test, verbose, stats_only): +def run_test(testdir, model_name, edit_format, retries, no_test, verbose, stats_only, commit_hash): if not stats_only: dump(testdir) @@ -319,6 +327,7 @@ def run_test(testdir, model_name, edit_format, retries, no_test, verbose, stats_ tests_outcomes=test_outcomes, cost=coder.total_cost, duration=dur, + commit_hash=commit_hash, ) dump(results)