diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index b4147a9cb..dbf62aacc 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -747,7 +747,6 @@ def run_test_real( indentation_errors += sum(1 for line in errors if line.startswith("IndentationError")) print(errors[-1]) - errors = errors[:50] errors = "\n".join(errors) instructions = errors instructions += prompts.test_failures.format(file_list=file_list) @@ -788,33 +787,26 @@ def run_test_real( def run_unit_tests(testdir, history_fname): - command = [ - "python", - "-m", - "unittest", - "discover", - "-s", - str(testdir), - "-t", - str(testdir), - "-p", - "*_test.py", - ] - print(" ".join(command)) timeout = 60 + command = ["pytest"] + + print(" ".join(command)) + result = subprocess.run( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=timeout, + cwd=testdir, ) success = result.returncode == 0 res = result.stdout res = cleanup_test_output(res, testdir) + dump(res) with history_fname.open("a") as fh: fh.write(f"```\n{res}\n```") @@ -827,23 +819,7 @@ def run_unit_tests(testdir, history_fname): def cleanup_test_output(output, testdir): # remove timing info, to avoid randomizing the response to GPT res = re.sub( - r"^Ran \d+ tests in \d+\.\d+s$", - "", - output, - flags=re.MULTILINE, - ) - res = re.sub( - r"^====*$", - "====", - res, - flags=re.MULTILINE, - ) - res = re.sub( - r"^----*$", - "----", - res, - flags=re.MULTILINE, - ) + r"\bin \d+\.\d+s\b", res = res.replace(str(testdir), str(testdir.name)) return res