Added --num-tests (-n) option to limit the number of benchmark tests run

This commit is contained in:
Paul Gauthier 2023-06-25 08:30:14 -07:00
parent e0e680e781
commit 5eaa50fe6c

View file

@ -10,6 +10,7 @@ from json.decoder import JSONDecodeError
from pathlib import Path from pathlib import Path
import lox import lox
from rich.console import Console
from aider import models from aider import models
from aider.coders import Coder from aider.coders import Coder
@ -20,6 +21,9 @@ ORIGINAL_DNAME = Path("tmp.benchmark/practice")
assert ORIGINAL_DNAME.exists() assert ORIGINAL_DNAME.exists()
console = Console(style="green", highlight=False)
def main(): def main():
parser = argparse.ArgumentParser(description="Aider Benchmark") parser = argparse.ArgumentParser(description="Aider Benchmark")
parser.add_argument("dirname", type=str, help="Directory name") parser.add_argument("dirname", type=str, help="Directory name")
@ -57,6 +61,13 @@ def main():
help="Number of threads to run in parallel", help="Number of threads to run in parallel",
default=1, default=1,
) )
parser.add_argument(
"--num-tests",
"-n",
type=int,
help="Number of tests to run",
default=-1,
)
args = parser.parse_args() args = parser.parse_args()
@ -79,9 +90,15 @@ def main():
shutil.copytree(ORIGINAL_DNAME, dirname) shutil.copytree(ORIGINAL_DNAME, dirname)
test_dnames = sorted(os.listdir(dirname)) test_dnames = sorted(os.listdir(dirname))
total_tests = len(test_dnames)
if args.keyword: if args.keyword:
test_dnames = [dn for dn in test_dnames if args.keyword in dn] test_dnames = [dn for dn in test_dnames if args.keyword in dn]
random.shuffle(test_dnames)
if args.num_tests > 0:
test_dnames = test_dnames[: args.num_tests]
if args.threads == 1: if args.threads == 1:
all_results = [] all_results = []
for testname in test_dnames: for testname in test_dnames:
@ -95,9 +112,8 @@ def main():
) )
all_results.append(results) all_results.append(results)
summarize_results(all_results) summarize_results(all_results, total_tests)
else: else:
random.shuffle(test_dnames)
run_test_threaded = lox.thread(args.threads)(run_test) run_test_threaded = lox.thread(args.threads)(run_test)
for testname in test_dnames: for testname in test_dnames:
run_test_threaded.scatter( run_test_threaded.scatter(
@ -113,7 +129,7 @@ def main():
print() print()
print() print()
print() print()
summarize_results(all_results) summarize_results(all_results, total_tests)
def summarize_results(all_results, total_tests=None): def summarize_results(all_results, total_tests=None):
@ -140,23 +156,26 @@ def summarize_results(all_results, total_tests=None):
total_cost += results["cost"] total_cost += results["cost"]
duration += results["duration"] duration += results["duration"]
print() console.rule()
print(f"{completed_tests} test-cases")
console.print(f"{completed_tests} test-cases")
for i in range(retries): for i in range(retries):
pass_rate = 100 * passed_tests[i] / completed_tests pass_rate = 100 * passed_tests[i] / completed_tests
print(f"{pass_rate:.1f}% correct after try {i}") console.print(f"{pass_rate:.1f}% correct after try {i}")
avg_duration = duration / completed_tests avg_duration = duration / completed_tests
print(f"{avg_duration:.1f} sec/test-case") console.print(f"{avg_duration:.1f} sec/test-case")
avg_cost = total_cost / completed_tests avg_cost = total_cost / completed_tests
projected_cost = avg_cost * total_tests projected_cost = avg_cost * total_tests
print( console.print(
f"Cost: ${avg_cost:.4f} average, ${total_cost:.2f} total, ${projected_cost:.2f} projected" f"Cost: ${avg_cost:.4f} average, ${total_cost:.2f} total, ${projected_cost:.2f} projected"
) )
console.rule()
def run_test(testdir, model_name, edit_format, retries, no_test, verbose): def run_test(testdir, model_name, edit_format, retries, no_test, verbose):
dump(testdir) dump(testdir)