From 5eaa50fe6c65bc07a9e16244eded5603b0466d77 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Sun, 25 Jun 2023 08:30:14 -0700
Subject: [PATCH] added --num-tests

---
 scripts/benchmark.py | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 15d08e9ef..04350594b 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -10,6 +10,7 @@ from json.decoder import JSONDecodeError
 from pathlib import Path
 
 import lox
+from rich.console import Console
 
 from aider import models
 from aider.coders import Coder
@@ -20,6 +21,9 @@ ORIGINAL_DNAME = Path("tmp.benchmark/practice")
 assert ORIGINAL_DNAME.exists()
 
 
+console = Console(style="green", highlight=False)
+
+
 def main():
     parser = argparse.ArgumentParser(description="Aider Benchmark")
     parser.add_argument("dirname", type=str, help="Directory name")
@@ -57,6 +61,13 @@ def main():
         help="Number of threads to run in parallel",
         default=1,
     )
+    parser.add_argument(
+        "--num-tests",
+        "-n",
+        type=int,
+        help="Number of tests to run",
+        default=-1,
+    )
 
     args = parser.parse_args()
 
@@ -79,9 +90,15 @@ def main():
         shutil.copytree(ORIGINAL_DNAME, dirname)
 
     test_dnames = sorted(os.listdir(dirname))
+    total_tests = len(test_dnames)
+
     if args.keyword:
         test_dnames = [dn for dn in test_dnames if args.keyword in dn]
 
+    random.shuffle(test_dnames)
+    if args.num_tests > 0:
+        test_dnames = test_dnames[: args.num_tests]
+
     if args.threads == 1:
         all_results = []
         for testname in test_dnames:
@@ -95,9 +112,8 @@ def main():
             )
             all_results.append(results)
 
-        summarize_results(all_results)
+        summarize_results(all_results, total_tests)
     else:
-        random.shuffle(test_dnames)
         run_test_threaded = lox.thread(args.threads)(run_test)
         for testname in test_dnames:
             run_test_threaded.scatter(
@@ -113,7 +129,7 @@ def main():
     print()
     print()
     print()
-    summarize_results(all_results)
+    summarize_results(all_results, total_tests)
 
 
 def summarize_results(all_results, total_tests=None):
@@ -140,23 +156,26 @@ def summarize_results(all_results, total_tests=None):
             total_cost += results["cost"]
             duration += results["duration"]
 
-    print()
-    print(f"{completed_tests} test-cases")
+    console.rule()
+
+    console.print(f"{completed_tests} test-cases")
     for i in range(retries):
         pass_rate = 100 * passed_tests[i] / completed_tests
-        print(f"{pass_rate:.1f}% correct after try {i}")
+        console.print(f"{pass_rate:.1f}% correct after try {i}")
 
     avg_duration = duration / completed_tests
-    print(f"{avg_duration:.1f} sec/test-case")
+    console.print(f"{avg_duration:.1f} sec/test-case")
 
     avg_cost = total_cost / completed_tests
     projected_cost = avg_cost * total_tests
-    print(
+    console.print(
         f"Cost: ${avg_cost:.4f} average, ${total_cost:.2f} total, ${projected_cost:.2f} projected"
     )
 
+    console.rule()
+
 
 def run_test(testdir, model_name, edit_format, retries, no_test, verbose):
     dump(testdir)
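Aside, not part of the patch: the subset selection added above is just a shuffle followed by an optional slice, and the cost line extrapolates the per-test average back to the full suite. A minimal standalone sketch of that logic, using hypothetical helper names (pick_subset, project_cost) that do not exist in scripts/benchmark.py:

    import random

    def pick_subset(test_dnames, num_tests=-1):
        # Shuffle first so a truncated run samples exercises uniformly,
        # mirroring the random.shuffle() + slice added to main() above.
        test_dnames = list(test_dnames)
        random.shuffle(test_dnames)
        if num_tests > 0:
            test_dnames = test_dnames[:num_tests]
        return test_dnames

    def project_cost(total_cost, completed_tests, total_tests):
        # Scale the observed average cost up to the whole benchmark,
        # as summarize_results() now does via its total_tests argument.
        avg_cost = total_cost / completed_tests
        return avg_cost * total_tests

So a run with --num-tests 10 executes ten randomly chosen exercises, while the projected figure still estimates the cost of the full suite; the default of -1 leaves the test list untouched.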