Allow N tries with test error output

2025-05-22 21:34:59 +00:00 · 2023-06-23 19:30:15 -07:00 · 2023-06-23 19:30:15 -07:00 · 88e39b9fdc
commit 88e39b9fdc
parent 9e506cccb0
1 changed files with 26 additions and 12 deletions
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@ -70,7 +70,7 @@ def main():
        if results:
            completed_tests += 1
-            passed = results["tests_passed"]
+            passed = results["tests_outcomes"][-1]
            if passed:
                passed_tests += 1
@ -148,20 +148,33 @@ def run_test(testdir, model_name, edit_format):
        stream=False,
    )
-    start = time.time()
+    dur = 0
-    coder.run(with_message=instructions)
+    test_outcomes = []
-    dur = time.time() - start
+    for i in range(3):
        start = time.time()
        coder.run(with_message=instructions)
        dur += time.time() - start
-    if coder.num_control_c:
+        if coder.num_control_c:
-        raise KeyboardInterrupt
+            raise KeyboardInterrupt
-    passed = run_tests(history_fname)
+        errors = run_tests(history_fname)
        if errors:
            test_outcomes.append(False)
        else:
            test_outcomes.append(True)
            break
        instructions = errors
        filelist = " ".join(fnames)
        instructions += f"\n\nFix the code in {filelist} to resolve the errors above."
    results = dict(
        testdir=str(testdir),
        model=main_model.name,
        edit_format=edit_format,
-        tests_passed=passed,
+        tests_outcomes=test_outcomes,
        cost=coder.total_cost,
        duration=dur,
    )
@ -178,7 +191,7 @@ def run_tests(history_fname):
    assert len(test_files)
    all_tests_passed = True
-
+    timeout = 60
    for test_file in test_files:
        dump(test_file)
        try:
@ -187,7 +200,7 @@ def run_tests(history_fname):
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
-                timeout=60,
+                timeout=timeout,
            )
            if result.returncode != 0:
                all_tests_passed = False
@ -197,13 +210,14 @@ def run_tests(history_fname):
        except subprocess.TimeoutExpired:
            all_tests_passed = False
-            res = f"Test {test_file} timed out"
+            res = f"Test {test_file} timed out after {timeout} seconds."
        print(res)
        with history_fname.open("a") as fh:
            fh.write(f"```\n{res}\n```")
-    return all_tests_passed
+        if not all_tests_passed:
            return res
 if __name__ == "__main__":