pull summary data into a df

This commit is contained in:
Paul Gauthier 2023-06-29 18:48:20 -07:00
parent 6706b01eb8
commit fe0aed2420
2 changed files with 55 additions and 33 deletions

View file

@ -2,6 +2,6 @@ FROM python:3.8-slim
RUN apt-get update && apt-get install -y less git RUN apt-get update && apt-get install -y less git
COPY requirements.txt /aider/requirements.txt COPY requirements.txt /aider/requirements.txt
RUN pip install --upgrade pip && pip install -r /aider/requirements.txt RUN pip install --upgrade pip && pip install -r /aider/requirements.txt
RUN pip install lox typer RUN pip install lox typer pandas matplotlib imgcat
WORKDIR /aider WORKDIR /aider

View file

@ -11,10 +11,12 @@ import time
from collections import defaultdict from collections import defaultdict
from json.decoder import JSONDecodeError from json.decoder import JSONDecodeError
from pathlib import Path from pathlib import Path
from types import SimpleNamespace
from typing import List from typing import List
import git import git
import lox import lox
import pandas as pd
import prompts import prompts
import typer import typer
from rich.console import Console from rich.console import Console
@ -31,6 +33,19 @@ ORIGINAL_DNAME = BENCHMARK_DNAME / "exercism-python"
app = typer.Typer(add_completion=False, pretty_exceptions_enable=False) app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
def show_stats(dirnames):
    """Summarize each benchmark directory and collect the results in a DataFrame.

    Calls ``summarize_results`` on every entry of *dirnames*, turns each
    returned namespace into a record, prints the resulting DataFrame and
    writes it to ``tmp.benchmark.csv`` in the current working directory.

    Args:
        dirnames: Iterable of benchmark result directories.

    Returns:
        None.
    """
    rows = []
    for dirname in dirnames:
        row = summarize_results(dirname)
        # summarize_results() returns None for a directory with no completed
        # tests; vars(None) would raise TypeError and abort the whole report,
        # so such directories are simply left out of the table.
        if row is None:
            continue
        rows.append(vars(row))

    df = pd.DataFrame.from_records(rows)

    print(df)

    df.to_csv("tmp.benchmark.csv")
def resolve_dirname(dirname, use_single_prior, make_new): def resolve_dirname(dirname, use_single_prior, make_new):
if len(dirname.parts) > 1: if len(dirname.parts) > 1:
return dirname return dirname
@ -97,9 +112,7 @@ def main(
updated_dirnames.append(dirname) updated_dirnames.append(dirname)
if stats_only: if stats_only:
for dirname in updated_dirnames: return show_stats(updated_dirnames)
summarize_results(dirname)
return
assert len(updated_dirnames) == 1, updated_dirnames assert len(updated_dirnames) == 1, updated_dirnames
dirname = updated_dirnames[0] dirname = updated_dirnames[0]
@ -176,23 +189,27 @@ def main(
def summarize_results(dirname): def summarize_results(dirname):
res = SimpleNamespace()
dirname = Path(dirname) dirname = Path(dirname)
total_tests = len(list(dirname.glob("*"))) res.total_tests = len(list(dirname.glob("*")))
all_results = [json.loads(fname.read_text()) for fname in dirname.glob("*/.aider.results.json")] all_results = [json.loads(fname.read_text()) for fname in dirname.glob("*/.aider.results.json")]
completed_tests = 0
try: try:
tries = max(len(results["tests_outcomes"]) for results in all_results if results) tries = max(len(results["tests_outcomes"]) for results in all_results if results)
except ValueError: except ValueError:
tries = 0 tries = 0
res.dir_name = str(dirname)
passed_tests = [0] * tries passed_tests = [0] * tries
duration = 0
total_cost = 0 res.completed_tests = 0
total_error_outputs = 0 res.duration = 0
total_user_asks = 0 res.cost = 0
total_test_timeouts = 0 res.error_outputs = 0
num_exhausted_context_windows = 0 res.user_asks = 0
res.test_timeouts = 0
res.exhausted_context_windows = 0
variants = defaultdict(set) variants = defaultdict(set)
@ -200,68 +217,73 @@ def summarize_results(dirname):
if not results: if not results:
continue continue
completed_tests += 1 res.completed_tests += 1
passed = results["tests_outcomes"][-1] passed = results["tests_outcomes"][-1]
if passed: if passed:
for i in range(len(results["tests_outcomes"]) - 1, tries): for i in range(len(results["tests_outcomes"]) - 1, tries):
passed_tests[i] += 1 passed_tests[i] += 1
total_cost += results["cost"] res.cost += results["cost"]
duration += results["duration"] res.duration += results["duration"]
total_test_timeouts += results.get("test_timeouts", 0) res.test_timeouts += results.get("test_timeouts", 0)
total_error_outputs += results.get("num_error_outputs", 0) res.error_outputs += results.get("num_error_outputs", 0)
total_user_asks += results.get("num_user_asks", 0) res.user_asks += results.get("num_user_asks", 0)
num_exhausted_context_windows += results.get("num_exhausted_context_windows", 0) res.exhausted_context_windows += results.get("num_exhausted_context_windows", 0)
for key in "model edit_format commit_hash".split(): for key in "model edit_format commit_hash".split():
val = results.get(key) val = results.get(key)
variants[key].add(val) variants[key].add(val)
if not completed_tests: if not res.completed_tests:
return return
console = Console(highlight=False) console = Console(highlight=False)
console.rule(title=str(dirname)) console.rule(title=str(dirname))
console.print(f"test-cases: {completed_tests}") console.print(f"test-cases: {res.completed_tests}")
for key, val in variants.items(): for key, val in variants.items():
if len(val) > 1: if len(val) > 1:
style = "red" style = "red"
else: else:
style = None style = None
val = ", ".join(map(str, val)) val = ", ".join(map(str, val))
setattr(res, key, val)
console.print(f"{key}: {val}", style=style) console.print(f"{key}: {val}", style=style)
print("num_error_outputs:", total_error_outputs) print("num_error_outputs:", res.error_outputs)
print("num_user_asks:", total_user_asks) print("num_user_asks:", res.user_asks)
style = "red" if num_exhausted_context_windows else None style = "red" if res.exhausted_context_windows else None
console.print("num_exhausted_context_windows", num_exhausted_context_windows, style=style) console.print("num_exhausted_context_windows", res.exhausted_context_windows, style=style)
style = "red" if total_test_timeouts else None style = "red" if res.test_timeouts else None
console.print("test_timeouts:", total_test_timeouts, style=style) console.print("test_timeouts:", res.test_timeouts, style=style)
console.print() console.print()
for i in range(tries): for i in range(tries):
pass_rate = 100 * passed_tests[i] / completed_tests pass_rate = 100 * passed_tests[i] / res.completed_tests
console.print(f"{pass_rate:.1f}% correct after try {i}") console.print(f"{pass_rate:.1f}% correct after try {i}")
setattr(res, f"pass_rate_{i+1}", pass_rate)
console.print() console.print()
avg_duration = duration / completed_tests res.avg_duration = res.duration / res.completed_tests
console.print(f"duration: {avg_duration:.1f} sec/test-case") console.print(f"duration: {res.avg_duration:.1f} sec/test-case")
avg_cost = total_cost / completed_tests res.avg_cost = res.cost / res.completed_tests
projected_cost = avg_cost * total_tests projected_cost = res.avg_cost * res.total_tests
console.print( console.print(
f"costs: ${avg_cost:.4f}/test-case, ${total_cost:.2f} total," f"costs: ${res.avg_cost:.4f}/test-case, ${res.cost:.2f} total,"
f" ${projected_cost:.2f} projected" f" ${projected_cost:.2f} projected"
) )
console.rule() console.rule()
# print(json.dumps(vars(res), indent=4, sort_keys=True))
return res
def run_test( def run_test(
testdir, model_name, edit_format, tries, no_unit_tests, no_aider, verbose, commit_hash testdir, model_name, edit_format, tries, no_unit_tests, no_aider, verbose, commit_hash