From c43d7e9a69b804892d635b666d9e23c848613200 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 8 Aug 2023 17:40:32 -0300 Subject: [PATCH] initial --diffs implementation --- benchmark/benchmark.py | 48 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 991d2a9f1..eba05ff51 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -253,6 +253,7 @@ def main( stats_only: bool = typer.Option( False, "--stats", "-s", help="Do not run tests, just collect stats on completed tests" ), + diffs_only: bool = typer.Option(False, "--diffs", help="Just diff the provided stats dirs"), tries: int = typer.Option(2, "--tries", "-r", help="Number of tries for running tests"), threads: int = typer.Option(1, "--threads", "-t", help="Number of threads to run in parallel"), num_tests: int = typer.Option(-1, "--num-tests", "-n", help="Number of tests to run"), @@ -262,8 +263,8 @@ def main( if repo.is_dirty(): commit_hash += "-dirty" - if len(dirnames) > 1 and not stats_only: - print("Only provide 1 dirname unless running with --stats") + if len(dirnames) > 1 and not (stats_only or diffs_only): + print("Only provide 1 dirname unless running with --stats or --diffs") return 1 updated_dirnames = [] @@ -277,6 +278,9 @@ def main( if stats_only: return show_stats(updated_dirnames) + if diffs_only: + return show_diffs(updated_dirnames) + assert len(updated_dirnames) == 1, updated_dirnames dirname = updated_dirnames[0] @@ -353,11 +357,45 @@ def main( return 0 -def summarize_results(dirname): - res = SimpleNamespace() +def show_diffs(dirnames): + dirnames = sorted(dirnames) + + all_results = dict((dirname, load_results(dirname)) for dirname in dirnames) + testcases = set() + for results in all_results.values(): + testcases.update(result["testcase"] for result in results) + + testcases = sorted(testcases) + + for testcase in testcases: + all_outcomes = [] + for dirname in dirnames: + results = all_results[dirname] + result = [r for r in results if r["testcase"] == testcase][0] + + outcomes = tuple(result["tests_outcomes"]) + all_outcomes.append(outcomes) + + if len(set(all_outcomes)) == 1: + continue + + print() + print(testcase) + for outcome, dirname in zip(all_outcomes, dirnames): + print(outcome, f"{dirname}/{testcase}/.aider.chat.history.md") + + +def load_results(dirname): dirname = Path(dirname) - res.total_tests = len(list(dirname.glob("*"))) all_results = [json.loads(fname.read_text()) for fname in dirname.glob("*/.aider.results.json")] + return all_results + + +def summarize_results(dirname): + all_results = load_results(dirname) + + res = SimpleNamespace() + res.total_tests = len(list(Path(dirname).glob("*"))) try: tries = max(len(results["tests_outcomes"]) for results in all_results if results)