diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 746ff4a49..e3bd27e24 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -7,6 +7,7 @@ import random import re import shutil import subprocess +import sys import time from collections import defaultdict from json.decoder import JSONDecodeError @@ -43,6 +44,7 @@ def main( ), make_new: bool = typer.Option(False, "--new", "-n", help="Make a new dated testdir"), no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"), + no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"), verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"), stats_only: bool = typer.Option( False, "--stats", "-s", help="Do not run tests, just collect stats on completed tests" @@ -125,6 +127,7 @@ def main( edit_format, tries, no_unit_tests, + no_aider, verbose, commit_hash, ) @@ -140,6 +143,7 @@ def main( edit_format, tries, no_unit_tests, + no_aider, verbose, commit_hash, ) @@ -234,7 +238,9 @@ def summarize_results(dirname): console.rule() -def run_test(testdir, model_name, edit_format, tries, no_unit_tests, verbose, commit_hash): +def run_test( + testdir, model_name, edit_format, tries, no_unit_tests, no_aider, verbose, commit_hash +): if not os.path.isdir(testdir): print("Not a dir:", testdir) return @@ -301,7 +307,8 @@ def run_test(testdir, model_name, edit_format, tries, no_unit_tests, verbose, co test_outcomes = [] for i in range(tries): start = time.time() - coder.run(with_message=instructions) + if not no_aider: + coder.run(with_message=instructions) dur += time.time() - start if coder.num_control_c: @@ -433,7 +440,9 @@ def check_docker(): "/bin/true", ] result = subprocess.run(command, stdout=subprocess.PIPE, text=True) - assert not result.returncode, "Can't run: " + " ".join(command) + if result.returncode: + print("Can't run: " + " ".join(command)) + sys.exit(-1) if __name__ == "__main__":