From 3bb237bdc1d0d4a68aefb168c2c26eac2bd08998 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Sun, 5 May 2024 08:24:45 -0700
Subject: [PATCH] handle tasks with exceptions in the stats output

---
 _data/refactor_leaderboard.csv |  2 +-
 benchmark/benchmark.py         | 25 ++++++++++++++++---------
 docs/leaderboards/index.md     |  2 +-
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/_data/refactor_leaderboard.csv b/_data/refactor_leaderboard.csv
index a67bb7b03..6ae17acd8 100644
--- a/_data/refactor_leaderboard.csv
+++ b/_data/refactor_leaderboard.csv
@@ -3,4 +3,4 @@ gpt-4-turbo-2024-04-09,0,34.1,udiff,aider --gpt-4-turbo,0.27.1-dev,b75fdb9,4/9/2
 gpt-4-0125-preview,0,43.8,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,0fbd702,1/25/24
 gpt-4-1106-preview,0,57.3,udiff,aider --model gpt-4-1106-preview,0.22.1-dev,a75e7c8,1/25/24
 claude-3-opus-20240229,0,67.4,diff,aider --opus,0.31.2-dev,b02320b-dirty,5/4/24
-gemini/gemini-1.5-pro-latest,0.0,50.6,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,3e4fca2-dirty 1b35ca2-dirty 425cb29,5/4/24
+gemini/gemini-1.5-pro-latest,0.0,49.4,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,425cb29 1b35ca2-dirty a0649ba-dirty 3e4fca2-dirty,2024-05-04
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 389b0a46c..3ed589b09 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -759,12 +759,13 @@ def load_results(dirname):
 
 def summarize_results(dirname):
     all_results = load_results(dirname)
+    dump(len(all_results))
 
     res = SimpleNamespace()
     res.total_tests = len(list(Path(dirname).glob("*")))
 
     try:
-        tries = max(len(results["tests_outcomes"]) for results in all_results if results)
+        tries = max(len(results.get("tests_outcomes", [])) for results in all_results if results)
     except ValueError:
         tries = 0
 
@@ -791,13 +792,14 @@ def summarize_results(dirname):
             continue
 
         res.completed_tests += 1
-        passed = results["tests_outcomes"][-1]
+        tests_outcomes = results.get("tests_outcomes", [])
+        passed = tests_outcomes and tests_outcomes[-1]
         if passed:
-            for i in range(len(results["tests_outcomes"]) - 1, tries):
+            for i in range(len(tests_outcomes) - 1, tries):
                 passed_tests[i] += 1
 
-        res.cost += results["cost"]
-        res.duration += results["duration"]
+        res.cost += results.get("cost", 0)
+        res.duration += results.get("duration", 0)
         res.test_timeouts += results.get("test_timeouts", 0)
 
         res.error_outputs += results.get("num_error_outputs", 0)
@@ -811,7 +813,8 @@ def summarize_results(dirname):
 
         for key in "model edit_format commit_hash".split():
             val = results.get(key)
-            variants[key].add(val)
+            if val:
+                variants[key].add(val)
 
     if not res.completed_tests:
         return
@@ -903,7 +906,7 @@ def summarize_results(dirname):
     csv.append(dirname.name[:10])
     csv = ",".join(csv)
     print()
-    print("Add this to _data/leaderboard.csv:")
+    print("Add this to the files in _data:")
     print(csv)
 
     console.rule()
@@ -928,15 +931,19 @@ def get_replayed_content(replay_dname, test_dname):
     return "".join(res)
 
 
-def run_test(*args, **kwargs):
+def run_test(original_dname, testdir, *args, **kwargs):
     try:
-        return run_test_real(*args, **kwargs)
+        return run_test_real(original_dname, testdir, *args, **kwargs)
     except Exception as err:
         print("=" * 40)
         print("Test failed")
         print(err)
         traceback.print_exc()
 
+        testdir = Path(testdir)
+        results_fname = testdir / ".aider.results.json"
+        results_fname.write_text(json.dumps(dict(exception=str(err))))
+
 
 def run_test_real(
     original_dname,
diff --git a/docs/leaderboards/index.md b/docs/leaderboards/index.md
index 7caca7369..ad83bfaaa 100644
--- a/docs/leaderboards/index.md
+++ b/docs/leaderboards/index.md
@@ -175,4 +175,4 @@ See the
 [benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
 for information on running aider's code editing benchmark.
 Submit results by opening a PR with edits to the
-[benchmark results CSV data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
+[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
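
Below is a minimal standalone sketch of the pattern this patch introduces in benchmark/benchmark.py: run_test() delegates to run_test_real(), and if the task raises, the exception is written into that task's .aider.results.json so the stats pass still sees the run; the summary step then reads the usual fields defensively with .get(). The run_test/run_test_real names and the results filename mirror the patch, while the trimmed summarize() helper and the sample failure are illustrative assumptions, not the actual benchmark harness.

    import json
    import traceback
    from pathlib import Path
    from types import SimpleNamespace


    def run_test_real(original_dname, testdir):
        # Stand-in for the real benchmark task; assumed to raise on failure.
        raise RuntimeError("model produced an unparseable edit")


    def run_test(original_dname, testdir, *args, **kwargs):
        try:
            return run_test_real(original_dname, testdir, *args, **kwargs)
        except Exception as err:
            traceback.print_exc()
            # Record the exception so this task still appears in the stats output.
            results_fname = Path(testdir) / ".aider.results.json"
            results_fname.write_text(json.dumps(dict(exception=str(err))))


    def summarize(all_results):
        # Defensive reads: exception-only result files lack the usual keys.
        res = SimpleNamespace(passed=0, cost=0, duration=0)
        for results in all_results:
            if not results:
                continue
            outcomes = results.get("tests_outcomes", [])
            res.passed += bool(outcomes and outcomes[-1])
            res.cost += results.get("cost", 0)
            res.duration += results.get("duration", 0)
        return res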