diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 30cff9c55..509958030 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -22,8 +22,6 @@ from aider.coders import Coder
 from aider.dump import dump  # noqa: F401
 from aider.io import InputOutput
 
-DOCKER_IMAGE = "aider-pytest"
-
 BENCHMARK_DNAME = Path("tmp.benchmark/.")
 
 ORIGINAL_DNAME = BENCHMARK_DNAME / "practice/."
@@ -88,8 +86,6 @@ def main(
         summarize_results(dirname)
         return
 
-    build_docker()
-
     if clean and dirname.exists():
         print("Cleaning up and replacing", dirname)
         dir_files = set(fn.name for fn in dirname.glob("*"))
@@ -380,10 +376,10 @@ def run_unit_tests(testdir, history_fname):
             "--interactive=false",
             "-v",
             f"{test_file.parent.absolute()}:/app",
-            DOCKER_IMAGE,
+            "python:3.8-slim",
             "bash",
             "-c",
-            f"python -m unittest {test_file.name}",
+            f"cd /app && python -m unittest {test_file.name}",
         ]
         print(" ".join(command))
 
@@ -413,6 +409,7 @@ def run_unit_tests(testdir, history_fname):
 
 
 def cleanup_test_output(output):
+    dump(output)
     # remove timing info, to avoid randomizing the response to GPT
     res = re.sub(
         r"^Ran \d+ tests in \d+\.\d+s$",
@@ -435,44 +432,5 @@ def cleanup_test_output(output):
     return res
 
 
-def build_docker():
-    check_command = ["docker", "images", "-q", DOCKER_IMAGE]
-    check_result = subprocess.run(check_command, stdout=subprocess.PIPE, text=True)
-    if check_result.stdout.strip():
-        print(f"Docker image '{DOCKER_IMAGE}' already exists, skipping build.")
-        return
-
-    command = [
-        "docker",
-        "build",
-        "--quiet",
-        "-t",
-        DOCKER_IMAGE,
-        "-f",
-        "benchmark/Dockerfile",
-        "/dev/null",
-    ]
-    print(" ".join(command))
-
-    try:
-        result = subprocess.run(
-            command,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            text=True,
-        )
-        res = result.stdout
-        print(res)
-
-    except subprocess.CalledProcessError as e:
-        res = f"Failed to build Docker image: {e.output}"
-        raise e
-
-    if result.returncode != 0:
-        raise Exception("Unable to build docker image")
-
-    return res
-
-
 if __name__ == "__main__":
     app()