handle tasks with exceptions in the stats output

This commit is contained in:
Paul Gauthier 2024-05-05 08:24:45 -07:00
parent a0649ba5fa
commit 3bb237bdc1
3 changed files with 18 additions and 11 deletions

View file

@@ -3,4 +3,4 @@ gpt-4-turbo-2024-04-09,0,34.1,udiff,aider --gpt-4-turbo,0.27.1-dev,b75fdb9,4/9/2
gpt-4-0125-preview,0,43.8,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,0fbd702,1/25/24
gpt-4-1106-preview,0,57.3,udiff,aider --model gpt-4-1106-preview,0.22.1-dev,a75e7c8,1/25/24
claude-3-opus-20240229,0,67.4,diff,aider --opus,0.31.2-dev,b02320b-dirty,5/4/24
gemini/gemini-1.5-pro-latest,0.0,50.6,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,3e4fca2-dirty 1b35ca2-dirty 425cb29,5/4/24
gemini/gemini-1.5-pro-latest,0.0,49.4,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,425cb29 1b35ca2-dirty a0649ba-dirty 3e4fca2-dirty,2024-05-04

1 model second first format command version commits date
3 gpt-4-0125-preview 0 43.8 udiff aider --model gpt-4-0125-preview 0.22.1-dev 0fbd702 1/25/24
4 gpt-4-1106-preview 0 57.3 udiff aider --model gpt-4-1106-preview 0.22.1-dev a75e7c8 1/25/24
5 claude-3-opus-20240229 0 67.4 diff aider --opus 0.31.2-dev b02320b-dirty 5/4/24
6 gemini/gemini-1.5-pro-latest 0.0 50.6 diff-fenced aider --model gemini/gemini-1.5-pro-latest 0.31.2-dev 3e4fca2-dirty 1b35ca2-dirty 425cb29 5/4/24
6 gemini/gemini-1.5-pro-latest 0.0 49.4 diff-fenced aider --model gemini/gemini-1.5-pro-latest 0.31.2-dev 425cb29 1b35ca2-dirty a0649ba-dirty 3e4fca2-dirty 2024-05-04

View file

@@ -759,12 +759,13 @@ def load_results(dirname):
def summarize_results(dirname):
all_results = load_results(dirname)
dump(len(all_results))
res = SimpleNamespace()
res.total_tests = len(list(Path(dirname).glob("*")))
try:
tries = max(len(results["tests_outcomes"]) for results in all_results if results)
tries = max(len(results.get("tests_outcomes", [])) for results in all_results if results)
except ValueError:
tries = 0
@@ -791,13 +792,14 @@ def summarize_results(dirname):
continue
res.completed_tests += 1
passed = results["tests_outcomes"][-1]
tests_outcomes = results.get("tests_outcomes", [])
passed = tests_outcomes and tests_outcomes[-1]
if passed:
for i in range(len(results["tests_outcomes"]) - 1, tries):
for i in range(len(tests_outcomes) - 1, tries):
passed_tests[i] += 1
res.cost += results["cost"]
res.duration += results["duration"]
res.cost += results.get("cost", 0)
res.duration += results.get("duration", 0)
res.test_timeouts += results.get("test_timeouts", 0)
res.error_outputs += results.get("num_error_outputs", 0)
@@ -811,7 +813,8 @@ def summarize_results(dirname):
for key in "model edit_format commit_hash".split():
val = results.get(key)
variants[key].add(val)
if val:
variants[key].add(val)
if not res.completed_tests:
return
@@ -903,7 +906,7 @@ def summarize_results(dirname):
csv.append(dirname.name[:10])
csv = ",".join(csv)
print()
print("Add this to _data/leaderboard.csv:")
print("Add this to the files in _data:")
print(csv)
console.rule()
@@ -928,15 +931,19 @@ def get_replayed_content(replay_dname, test_dname):
return "".join(res)
def run_test(*args, **kwargs):
def run_test(original_dname, testdir, *args, **kwargs):
try:
return run_test_real(*args, **kwargs)
return run_test_real(original_dname, testdir, *args, **kwargs)
except Exception as err:
print("=" * 40)
print("Test failed")
print(err)
traceback.print_exc()
testdir = Path(testdir)
results_fname = testdir / ".aider.results.json"
results_fname.write_text(json.dumps(dict(exception=str(err))))
def run_test_real(
original_dname,

View file

@@ -175,4 +175,4 @@ See the
[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
for information on running aider's code editing benchmark.
Submit results by opening a PR with edits to the
[benchmark results CSV data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).