diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index e3bd27e24..6514bb3a2 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -171,6 +171,7 @@ def summarize_results(dirname):
     total_cost = 0
     total_error_outputs = 0
     total_user_asks = 0
+    total_timeouts = 0
 
     variants = defaultdict(set)
 
@@ -186,6 +187,8 @@ def summarize_results(dirname):
 
         total_cost += results["cost"]
         duration += results["duration"]
+        if results["timeout"]:
+            total_timeouts += 1
 
         total_error_outputs += results.get("num_error_outputs", 0)
         total_user_asks += results.get("num_user_asks", 0)
@@ -210,6 +213,7 @@ def summarize_results(dirname):
         console.print(f"{key}: {val}", style=style)
     print("num_error_outputs:", total_error_outputs)
     print("num_user_asks:", total_user_asks)
+    print("test_timeouts:", total_timeouts)
 
     console.print()
     for i in range(tries):
@@ -314,10 +318,16 @@ def run_test(
         if coder.num_control_c:
             raise KeyboardInterrupt
 
-        if no_unit_tests:
-            return
+        timeout = False
 
-        errors = run_unit_tests(testdir, history_fname)
+        if no_unit_tests:
+            break
+
+        try:
+            errors = run_unit_tests(testdir, history_fname)
+        except subprocess.TimeoutExpired:
+            errors = f"Tests in {testdir} timed out!"
+            timeout = True
 
         if errors:
             test_outcomes.append(False)
@@ -340,6 +350,7 @@ def run_test(
         tests_outcomes=test_outcomes,
         cost=coder.total_cost,
         duration=dur,
+        timeout=timeout,
         commit_hash=commit_hash,
         num_error_outputs=io.num_error_outputs,
         num_user_asks=io.num_user_asks,
@@ -361,7 +372,6 @@ def run_unit_tests(testdir, history_fname):
     test_files = [file for file in testdir.glob("*") if file.name.endswith("_test.py")]
     assert len(test_files)
 
-    all_tests_passed = True
     timeout = 60
     for test_file in test_files:
         dump(test_file)
@@ -381,28 +391,21 @@ def run_unit_tests(testdir, history_fname):
         ]
         print(" ".join(command))
 
-        try:
-            result = subprocess.run(
-                command,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-                timeout=timeout,
-            )
-            if result.returncode != 0:
-                all_tests_passed = False
-                print(f"Test {test_file} failed")
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            timeout=timeout,
+        )
 
-            res = cleanup_test_output(result.stdout)
-
-        except subprocess.TimeoutExpired:
-            all_tests_passed = False
-            res = f"Test {test_file} timed out after {timeout} seconds."
+        res = cleanup_test_output(result.stdout)
 
         with history_fname.open("a") as fh:
             fh.write(f"```\n{res}\n```")
 
-        if not all_tests_passed:
+        if result.returncode != 0:
+            print(f"Test {test_file} failed")
             return res