mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-30 17:24:59 +00:00
pull summary data into a df
This commit is contained in:
parent
6706b01eb8
commit
fe0aed2420
2 changed files with 55 additions and 33 deletions
|
@ -2,6 +2,6 @@ FROM python:3.8-slim
|
||||||
RUN apt-get update && apt-get install -y less git
|
RUN apt-get update && apt-get install -y less git
|
||||||
COPY requirements.txt /aider/requirements.txt
|
COPY requirements.txt /aider/requirements.txt
|
||||||
RUN pip install --upgrade pip && pip install -r /aider/requirements.txt
|
RUN pip install --upgrade pip && pip install -r /aider/requirements.txt
|
||||||
RUN pip install lox typer
|
RUN pip install lox typer pandas matplotlib imgcat
|
||||||
WORKDIR /aider
|
WORKDIR /aider
|
||||||
|
|
||||||
|
|
|
@ -11,10 +11,12 @@ import time
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from json.decoder import JSONDecodeError
|
from json.decoder import JSONDecodeError
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import git
|
import git
|
||||||
import lox
|
import lox
|
||||||
|
import pandas as pd
|
||||||
import prompts
|
import prompts
|
||||||
import typer
|
import typer
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
@ -31,6 +33,19 @@ ORIGINAL_DNAME = BENCHMARK_DNAME / "exercism-python"
|
||||||
app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
|
app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
|
||||||
|
|
||||||
|
|
||||||
|
def show_stats(dirnames):
    """Summarize benchmark directories and collect the results in a DataFrame.

    Calls ``summarize_results`` on every entry of ``dirnames``, turns each
    returned summary object into one record, prints the resulting DataFrame
    and writes it to ``tmp.benchmark.csv`` in the current working directory.

    Directories for which ``summarize_results`` returns ``None`` (it does so
    when no test case has completed) are skipped, so one empty directory
    does not abort the whole report.

    Args:
        dirnames: iterable of benchmark result directories to summarize.
    """
    rows = []
    for dirname in dirnames:
        row = summarize_results(dirname)
        if row is None:
            # Nothing completed in this directory; vars(None) would raise
            # TypeError, so there is simply no record to add.
            continue
        rows.append(vars(row))

    df = pd.DataFrame.from_records(rows)

    print(df)

    df.to_csv("tmp.benchmark.csv")
|
||||||
|
|
||||||
|
|
||||||
def resolve_dirname(dirname, use_single_prior, make_new):
|
def resolve_dirname(dirname, use_single_prior, make_new):
|
||||||
if len(dirname.parts) > 1:
|
if len(dirname.parts) > 1:
|
||||||
return dirname
|
return dirname
|
||||||
|
@ -97,9 +112,7 @@ def main(
|
||||||
updated_dirnames.append(dirname)
|
updated_dirnames.append(dirname)
|
||||||
|
|
||||||
if stats_only:
|
if stats_only:
|
||||||
for dirname in updated_dirnames:
|
return show_stats(updated_dirnames)
|
||||||
summarize_results(dirname)
|
|
||||||
return
|
|
||||||
|
|
||||||
assert len(updated_dirnames) == 1, updated_dirnames
|
assert len(updated_dirnames) == 1, updated_dirnames
|
||||||
dirname = updated_dirnames[0]
|
dirname = updated_dirnames[0]
|
||||||
|
@ -176,23 +189,27 @@ def main(
|
||||||
|
|
||||||
|
|
||||||
def summarize_results(dirname):
|
def summarize_results(dirname):
|
||||||
|
res = SimpleNamespace()
|
||||||
dirname = Path(dirname)
|
dirname = Path(dirname)
|
||||||
total_tests = len(list(dirname.glob("*")))
|
res.total_tests = len(list(dirname.glob("*")))
|
||||||
all_results = [json.loads(fname.read_text()) for fname in dirname.glob("*/.aider.results.json")]
|
all_results = [json.loads(fname.read_text()) for fname in dirname.glob("*/.aider.results.json")]
|
||||||
|
|
||||||
completed_tests = 0
|
|
||||||
try:
|
try:
|
||||||
tries = max(len(results["tests_outcomes"]) for results in all_results if results)
|
tries = max(len(results["tests_outcomes"]) for results in all_results if results)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
tries = 0
|
tries = 0
|
||||||
|
|
||||||
|
res.dir_name = str(dirname)
|
||||||
|
|
||||||
passed_tests = [0] * tries
|
passed_tests = [0] * tries
|
||||||
duration = 0
|
|
||||||
total_cost = 0
|
res.completed_tests = 0
|
||||||
total_error_outputs = 0
|
res.duration = 0
|
||||||
total_user_asks = 0
|
res.cost = 0
|
||||||
total_test_timeouts = 0
|
res.error_outputs = 0
|
||||||
num_exhausted_context_windows = 0
|
res.user_asks = 0
|
||||||
|
res.test_timeouts = 0
|
||||||
|
res.exhausted_context_windows = 0
|
||||||
|
|
||||||
variants = defaultdict(set)
|
variants = defaultdict(set)
|
||||||
|
|
||||||
|
@ -200,68 +217,73 @@ def summarize_results(dirname):
|
||||||
if not results:
|
if not results:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
completed_tests += 1
|
res.completed_tests += 1
|
||||||
passed = results["tests_outcomes"][-1]
|
passed = results["tests_outcomes"][-1]
|
||||||
if passed:
|
if passed:
|
||||||
for i in range(len(results["tests_outcomes"]) - 1, tries):
|
for i in range(len(results["tests_outcomes"]) - 1, tries):
|
||||||
passed_tests[i] += 1
|
passed_tests[i] += 1
|
||||||
|
|
||||||
total_cost += results["cost"]
|
res.cost += results["cost"]
|
||||||
duration += results["duration"]
|
res.duration += results["duration"]
|
||||||
total_test_timeouts += results.get("test_timeouts", 0)
|
res.test_timeouts += results.get("test_timeouts", 0)
|
||||||
|
|
||||||
total_error_outputs += results.get("num_error_outputs", 0)
|
res.error_outputs += results.get("num_error_outputs", 0)
|
||||||
total_user_asks += results.get("num_user_asks", 0)
|
res.user_asks += results.get("num_user_asks", 0)
|
||||||
num_exhausted_context_windows += results.get("num_exhausted_context_windows", 0)
|
res.exhausted_context_windows += results.get("num_exhausted_context_windows", 0)
|
||||||
|
|
||||||
for key in "model edit_format commit_hash".split():
|
for key in "model edit_format commit_hash".split():
|
||||||
val = results.get(key)
|
val = results.get(key)
|
||||||
variants[key].add(val)
|
variants[key].add(val)
|
||||||
|
|
||||||
if not completed_tests:
|
if not res.completed_tests:
|
||||||
return
|
return
|
||||||
|
|
||||||
console = Console(highlight=False)
|
console = Console(highlight=False)
|
||||||
console.rule(title=str(dirname))
|
console.rule(title=str(dirname))
|
||||||
|
|
||||||
console.print(f"test-cases: {completed_tests}")
|
console.print(f"test-cases: {res.completed_tests}")
|
||||||
for key, val in variants.items():
|
for key, val in variants.items():
|
||||||
if len(val) > 1:
|
if len(val) > 1:
|
||||||
style = "red"
|
style = "red"
|
||||||
else:
|
else:
|
||||||
style = None
|
style = None
|
||||||
val = ", ".join(map(str, val))
|
val = ", ".join(map(str, val))
|
||||||
|
setattr(res, key, val)
|
||||||
console.print(f"{key}: {val}", style=style)
|
console.print(f"{key}: {val}", style=style)
|
||||||
print("num_error_outputs:", total_error_outputs)
|
print("num_error_outputs:", res.error_outputs)
|
||||||
print("num_user_asks:", total_user_asks)
|
print("num_user_asks:", res.user_asks)
|
||||||
|
|
||||||
style = "red" if num_exhausted_context_windows else None
|
style = "red" if res.exhausted_context_windows else None
|
||||||
console.print("num_exhausted_context_windows", num_exhausted_context_windows, style=style)
|
console.print("num_exhausted_context_windows", res.exhausted_context_windows, style=style)
|
||||||
|
|
||||||
style = "red" if total_test_timeouts else None
|
style = "red" if res.test_timeouts else None
|
||||||
console.print("test_timeouts:", total_test_timeouts, style=style)
|
console.print("test_timeouts:", res.test_timeouts, style=style)
|
||||||
|
|
||||||
console.print()
|
console.print()
|
||||||
for i in range(tries):
|
for i in range(tries):
|
||||||
pass_rate = 100 * passed_tests[i] / completed_tests
|
pass_rate = 100 * passed_tests[i] / res.completed_tests
|
||||||
console.print(f"{pass_rate:.1f}% correct after try {i}")
|
console.print(f"{pass_rate:.1f}% correct after try {i}")
|
||||||
|
setattr(res, f"pass_rate_{i+1}", pass_rate)
|
||||||
|
|
||||||
console.print()
|
console.print()
|
||||||
avg_duration = duration / completed_tests
|
res.avg_duration = res.duration / res.completed_tests
|
||||||
|
|
||||||
console.print(f"duration: {avg_duration:.1f} sec/test-case")
|
console.print(f"duration: {res.avg_duration:.1f} sec/test-case")
|
||||||
|
|
||||||
avg_cost = total_cost / completed_tests
|
res.avg_cost = res.cost / res.completed_tests
|
||||||
|
|
||||||
projected_cost = avg_cost * total_tests
|
projected_cost = res.avg_cost * res.total_tests
|
||||||
|
|
||||||
console.print(
|
console.print(
|
||||||
f"costs: ${avg_cost:.4f}/test-case, ${total_cost:.2f} total,"
|
f"costs: ${res.avg_cost:.4f}/test-case, ${res.cost:.2f} total,"
|
||||||
f" ${projected_cost:.2f} projected"
|
f" ${projected_cost:.2f} projected"
|
||||||
)
|
)
|
||||||
|
|
||||||
console.rule()
|
console.rule()
|
||||||
|
|
||||||
|
# print(json.dumps(vars(res), indent=4, sort_keys=True))
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
def run_test(
|
def run_test(
|
||||||
testdir, model_name, edit_format, tries, no_unit_tests, no_aider, verbose, commit_hash
|
testdir, model_name, edit_format, tries, no_unit_tests, no_aider, verbose, commit_hash
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue