diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 180a43228..3c7184457 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -68,8 +68,6 @@ def main():
     if not dirname.exists():
         shutil.copytree(ORIGINAL_DNAME, dirname)
 
-    cwd = os.getcwd()
-
     test_dnames = sorted(os.listdir(dirname))
 
     all_results = []
@@ -86,7 +84,6 @@ def main():
             args.no_test,
             args.verbose,
         )
-        os.chdir(cwd)
 
         all_results.append(results)
         summarize_results(all_results)
@@ -147,32 +144,25 @@ def run_test(testdir, model_name, edit_format, retries, no_test, verbose):
         print("Not a dir:", testdir)
         return
 
-    os.chdir(testdir)
+    testdir = Path(testdir)
 
-    history_fname = Path(".aider.chat.history.md")
+    history_fname = testdir / ".aider.chat.history.md"
 
-    results_fname = Path(".aider.results.json")
+    results_fname = testdir / ".aider.results.json"
     if results_fname.exists():
         try:
             return json.loads(results_fname.read_text())
         except JSONDecodeError:
-            print(f"{testdir}/{results_fname} failed to parse, skipping")
+            print(f"{results_fname} failed to parse, skipping")
             return
 
-    started_fname = Path(".aider.started")
-    if started_fname.exists():
-        # print(f"{testdir}/{started_fname} exists, skipping")
-        # return
-        pass
-    started_fname.touch()
-
     fnames = []
-    for fname in os.listdir("."):
-        if "test" not in fname and os.path.isfile(fname) and fname[0] != ".":
+    for fname in testdir.glob("*"):
+        if "test" not in fname.name and fname.is_file() and fname.name[0] != ".":
             fnames.append(fname)
 
-    file_list = " ".join(fnames)
-    instructions = Path(".docs/instructions.md").read_text()
+    file_list = " ".join(fname.name for fname in fnames)
+    instructions = (testdir / ".docs/instructions.md").read_text()
     instructions += (
         "\n\n=====\n\nModify these files according to the above instructions. Only use standard"
         " python libraries, don't suggest installing any packages.\n"
@@ -190,6 +180,7 @@ def run_test(testdir, model_name, edit_format, retries, no_test, verbose):
 
     dump(main_model)
     dump(edit_format)
+    dump(fnames)
 
     coder = Coder.create(
         main_model,
@@ -216,7 +207,7 @@ def run_test(testdir, model_name, edit_format, retries, no_test, verbose):
         if no_test:
             return
 
-        errors = run_tests(history_fname)
+        errors = run_pytests(testdir, history_fname)
 
         if errors:
             test_outcomes.append(False)
@@ -245,13 +236,12 @@ def run_test(testdir, model_name, edit_format, retries, no_test, verbose):
     dump(results)
 
     results_fname.write_text(json.dumps(results, indent=4))
-    started_fname.unlink()
 
     return results
 
 
-def run_tests(history_fname):
-    test_files = [file for file in os.listdir() if file.endswith("_test.py")]
+def run_pytests(testdir, history_fname):
+    test_files = [file for file in testdir.glob("*") if file.name.endswith("_test.py")]
     assert len(test_files)
 
     all_tests_passed = True