diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 0cedeb3e3..30cff9c55 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -25,10 +25,8 @@ from aider.io import InputOutput
 DOCKER_IMAGE = "aider-pytest"
 
 BENCHMARK_DNAME = Path("tmp.benchmark/.")
-assert BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir()
 
 ORIGINAL_DNAME = BENCHMARK_DNAME / "practice/."
-assert ORIGINAL_DNAME.exists() and ORIGINAL_DNAME.is_dir()
 
 app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
 
@@ -54,6 +52,9 @@ def main(
     threads: int = typer.Option(1, "--threads", "-t", help="Number of threads to run in parallel"),
     num_tests: int = typer.Option(-1, "--num-tests", "-n", help="Number of tests to run"),
 ):
+    assert BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir()
+    assert ORIGINAL_DNAME.exists() and ORIGINAL_DNAME.is_dir()
+
     repo = git.Repo(search_parent_directories=True)
     commit_hash = repo.head.object.hexsha[:7]
     if repo.is_dirty():
@@ -419,6 +420,18 @@ def cleanup_test_output(output):
         output,
         flags=re.MULTILINE,
     )
+    res = re.sub(
+        r"^====*$",
+        "====",
+        res,
+        flags=re.MULTILINE,
+    )
+    res = re.sub(
+        r"^----*$",
+        "----",
+        res,
+        flags=re.MULTILINE,
+    )
     return res
 
 
diff --git a/benchmark/test_benchmark.py b/benchmark/test_benchmark.py
new file mode 100644
index 000000000..fba5aa3e3
--- /dev/null
+++ b/benchmark/test_benchmark.py
@@ -0,0 +1,47 @@
+# flake8: noqa: E501
+
+import unittest
+
+from benchmark import cleanup_test_output
+
+
+class TestCleanupTestOutput(unittest.TestCase):
+    def test_cleanup_test_output(self):
+        # Test case with timing info
+        output = "Ran 5 tests in 0.003s\nOK"
+        expected = "\nOK"
+        self.assertEqual(cleanup_test_output(output), expected)
+
+        # Test case without timing info
+        output = "OK"
+        expected = "OK"
+        self.assertEqual(cleanup_test_output(output), expected)
+
+    def test_cleanup_test_output_lines(self):
+        # Test case with long ==== and ---- separator lines
+        output = """F
+======================================================================
+FAIL: test_cleanup_test_output (test_benchmark.TestCleanupTestOutput.test_cleanup_test_output)
+----------------------------------------------------------------------
+Traceback (most recent call last):
+  File "/Users/gauthier/Projects/aider/benchmark/test_benchmark.py", line 14, in test_cleanup_test_output
+    self.assertEqual(cleanup_test_output(output), expected)
+AssertionError: 'OK' != 'OKx'
+- OK
++ OKx
+?   +
+"""
+
+        expected = """F
+====
+FAIL: test_cleanup_test_output (test_benchmark.TestCleanupTestOutput.test_cleanup_test_output)
+----
+Traceback (most recent call last):
+  File "/Users/gauthier/Projects/aider/benchmark/test_benchmark.py", line 14, in test_cleanup_test_output
+    self.assertEqual(cleanup_test_output(output), expected)
+AssertionError: 'OK' != 'OKx'
+- OK
++ OKx
+?   +
+"""
+        self.assertEqual(cleanup_test_output(output), expected)
diff --git a/benchmark/tests/test_benchmark.py b/benchmark/tests/test_benchmark.py
deleted file mode 100644
index 52b48cdd6..000000000
--- a/benchmark/tests/test_benchmark.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import unittest
-from benchmark.benchmark import cleanup_test_output
-
-class TestCleanupTestOutput(unittest.TestCase):
-    def test_cleanup_test_output(self):
-        # Test case with timing info
-        output = "Ran 5 tests in 0.003s\nOK"
-        expected = "\nOK"
-        self.assertEqual(cleanup_test_output(output), expected)
-
-        # Test case without timing info
-        output = "OK"
-        expected = "OK"
-        self.assertEqual(cleanup_test_output(output), expected)