Mirror of https://github.com/Aider-AI/aider.git (synced 2025-05-22 21:34:59 +00:00)
added --num-tests

commit 5eaa50fe6c (parent e0e680e781)
1 changed file with 27 additions and 8 deletions
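The diff below wires a new --num-tests / -n option into the benchmark driver: the test directories are shuffled and, when the flag is positive, truncated to that many entries before the run starts, while the original count is still passed to summarize_results so the cost projection covers the full suite. A minimal standalone sketch of that selection step, using a hypothetical select_tests helper and made-up exercise names in place of the directories the real script lists:

import argparse
import random


def select_tests(test_dnames, num_tests):
    # Hypothetical helper (not in the diff): shuffle so a limited run
    # samples exercises at random, then keep the first num_tests entries.
    # num_tests <= 0 (the default of -1) means "run everything".
    test_dnames = list(test_dnames)
    random.shuffle(test_dnames)
    if num_tests > 0:
        test_dnames = test_dnames[:num_tests]
    return test_dnames


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Aider Benchmark")
    parser.add_argument("--num-tests", "-n", type=int, default=-1, help="Number of tests to run")
    args = parser.parse_args()

    # Made-up exercise names; the real script lists the copied benchmark directory.
    all_dnames = ["anagram", "bob", "hamming", "pangram", "two-fer"]
    print(select_tests(all_dnames, args.num_tests))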
@@ -10,6 +10,7 @@ from json.decoder import JSONDecodeError
 from pathlib import Path
 
 import lox
+from rich.console import Console
 
 from aider import models
 from aider.coders import Coder
@@ -20,6 +21,9 @@ ORIGINAL_DNAME = Path("tmp.benchmark/practice")
 assert ORIGINAL_DNAME.exists()
 
 
+console = Console(style="green", highlight=False)
+
+
 def main():
     parser = argparse.ArgumentParser(description="Aider Benchmark")
     parser.add_argument("dirname", type=str, help="Directory name")
@@ -57,6 +61,13 @@ def main():
         help="Number of threads to run in parallel",
         default=1,
     )
+    parser.add_argument(
+        "--num-tests",
+        "-n",
+        type=int,
+        help="Number of tests to run",
+        default=-1,
+    )
 
     args = parser.parse_args()
 
@@ -79,9 +90,15 @@ def main():
         shutil.copytree(ORIGINAL_DNAME, dirname)
 
     test_dnames = sorted(os.listdir(dirname))
+    total_tests = len(test_dnames)
+
     if args.keyword:
         test_dnames = [dn for dn in test_dnames if args.keyword in dn]
 
+    random.shuffle(test_dnames)
+    if args.num_tests > 0:
+        test_dnames = test_dnames[: args.num_tests]
+
     if args.threads == 1:
         all_results = []
         for testname in test_dnames:
@@ -95,9 +112,8 @@ def main():
             )
 
             all_results.append(results)
-            summarize_results(all_results)
+            summarize_results(all_results, total_tests)
     else:
-        random.shuffle(test_dnames)
         run_test_threaded = lox.thread(args.threads)(run_test)
         for testname in test_dnames:
             run_test_threaded.scatter(
@@ -113,7 +129,7 @@ def main():
     print()
     print()
     print()
-    summarize_results(all_results)
+    summarize_results(all_results, total_tests)
 
 
 def summarize_results(all_results, total_tests=None):
@@ -140,23 +156,26 @@ def summarize_results(all_results, total_tests=None):
         total_cost += results["cost"]
         duration += results["duration"]
 
-    print()
-    print(f"{completed_tests} test-cases")
+    console.rule()
+
+    console.print(f"{completed_tests} test-cases")
     for i in range(retries):
         pass_rate = 100 * passed_tests[i] / completed_tests
-        print(f"{pass_rate:.1f}% correct after try {i}")
+        console.print(f"{pass_rate:.1f}% correct after try {i}")
 
     avg_duration = duration / completed_tests
-    print(f"{avg_duration:.1f} sec/test-case")
+    console.print(f"{avg_duration:.1f} sec/test-case")
 
     avg_cost = total_cost / completed_tests
 
     projected_cost = avg_cost * total_tests
 
-    print(
+    console.print(
         f"Cost: ${avg_cost:.4f} average, ${total_cost:.2f} total, ${projected_cost:.2f} projected"
     )
 
+    console.rule()
+
 
 def run_test(testdir, model_name, edit_format, retries, no_test, verbose):
     dump(testdir)
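The same commit also routes the end-of-run summary through a rich Console instead of bare print() calls. A short sketch of that reporting pattern, with placeholder totals standing in for the values the benchmark accumulates from per-test results:

from rich.console import Console

# Same console configuration the diff introduces.
console = Console(style="green", highlight=False)

# Placeholder totals; the real script derives these from the test results.
completed_tests = 5
total_tests = 20
total_cost = 0.42
duration = 310.0

avg_duration = duration / completed_tests
avg_cost = total_cost / completed_tests
projected_cost = avg_cost * total_tests

console.rule()
console.print(f"{completed_tests} test-cases")
console.print(f"{avg_duration:.1f} sec/test-case")
console.print(
    f"Cost: ${avg_cost:.4f} average, ${total_cost:.2f} total, ${projected_cost:.2f} projected"
)
console.rule()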