mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 10:14:59 +00:00
handle tasks with exceptions in the stats output
This commit is contained in:
parent
a0649ba5fa
commit
3bb237bdc1
3 changed files with 18 additions and 11 deletions
|
@ -3,4 +3,4 @@ gpt-4-turbo-2024-04-09,0,34.1,udiff,aider --gpt-4-turbo,0.27.1-dev,b75fdb9,4/9/2
|
||||||
gpt-4-0125-preview,0,43.8,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,0fbd702,1/25/24
|
gpt-4-0125-preview,0,43.8,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,0fbd702,1/25/24
|
||||||
gpt-4-1106-preview,0,57.3,udiff,aider --model gpt-4-1106-preview,0.22.1-dev,a75e7c8,1/25/24
|
gpt-4-1106-preview,0,57.3,udiff,aider --model gpt-4-1106-preview,0.22.1-dev,a75e7c8,1/25/24
|
||||||
claude-3-opus-20240229,0,67.4,diff,aider --opus,0.31.2-dev,b02320b-dirty,5/4/24
|
claude-3-opus-20240229,0,67.4,diff,aider --opus,0.31.2-dev,b02320b-dirty,5/4/24
|
||||||
gemini/gemini-1.5-pro-latest,0.0,50.6,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,3e4fca2-dirty 1b35ca2-dirty 425cb29,5/4/24
|
gemini/gemini-1.5-pro-latest,0.0,49.4,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,425cb29 1b35ca2-dirty a0649ba-dirty 3e4fca2-dirty,2024-05-04
|
||||||
|
|
|
|
@ -759,12 +759,13 @@ def load_results(dirname):
|
||||||
|
|
||||||
def summarize_results(dirname):
|
def summarize_results(dirname):
|
||||||
all_results = load_results(dirname)
|
all_results = load_results(dirname)
|
||||||
|
dump(len(all_results))
|
||||||
|
|
||||||
res = SimpleNamespace()
|
res = SimpleNamespace()
|
||||||
res.total_tests = len(list(Path(dirname).glob("*")))
|
res.total_tests = len(list(Path(dirname).glob("*")))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tries = max(len(results["tests_outcomes"]) for results in all_results if results)
|
tries = max(len(results.get("tests_outcomes", [])) for results in all_results if results)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
tries = 0
|
tries = 0
|
||||||
|
|
||||||
|
@ -791,13 +792,14 @@ def summarize_results(dirname):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
res.completed_tests += 1
|
res.completed_tests += 1
|
||||||
passed = results["tests_outcomes"][-1]
|
tests_outcomes = results.get("tests_outcomes", [])
|
||||||
|
passed = tests_outcomes and tests_outcomes[-1]
|
||||||
if passed:
|
if passed:
|
||||||
for i in range(len(results["tests_outcomes"]) - 1, tries):
|
for i in range(len(tests_outcomes) - 1, tries):
|
||||||
passed_tests[i] += 1
|
passed_tests[i] += 1
|
||||||
|
|
||||||
res.cost += results["cost"]
|
res.cost += results.get("cost", 0)
|
||||||
res.duration += results["duration"]
|
res.duration += results.get("duration", 0)
|
||||||
res.test_timeouts += results.get("test_timeouts", 0)
|
res.test_timeouts += results.get("test_timeouts", 0)
|
||||||
|
|
||||||
res.error_outputs += results.get("num_error_outputs", 0)
|
res.error_outputs += results.get("num_error_outputs", 0)
|
||||||
|
@ -811,7 +813,8 @@ def summarize_results(dirname):
|
||||||
|
|
||||||
for key in "model edit_format commit_hash".split():
|
for key in "model edit_format commit_hash".split():
|
||||||
val = results.get(key)
|
val = results.get(key)
|
||||||
variants[key].add(val)
|
if val:
|
||||||
|
variants[key].add(val)
|
||||||
|
|
||||||
if not res.completed_tests:
|
if not res.completed_tests:
|
||||||
return
|
return
|
||||||
|
@ -903,7 +906,7 @@ def summarize_results(dirname):
|
||||||
csv.append(dirname.name[:10])
|
csv.append(dirname.name[:10])
|
||||||
csv = ",".join(csv)
|
csv = ",".join(csv)
|
||||||
print()
|
print()
|
||||||
print("Add this to _data/leaderboard.csv:")
|
print("Add this to the files in _data:")
|
||||||
print(csv)
|
print(csv)
|
||||||
console.rule()
|
console.rule()
|
||||||
|
|
||||||
|
@ -928,15 +931,19 @@ def get_replayed_content(replay_dname, test_dname):
|
||||||
return "".join(res)
|
return "".join(res)
|
||||||
|
|
||||||
|
|
||||||
def run_test(*args, **kwargs):
|
def run_test(original_dname, testdir, *args, **kwargs):
|
||||||
try:
|
try:
|
||||||
return run_test_real(*args, **kwargs)
|
return run_test_real(original_dname, testdir, *args, **kwargs)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
print("=" * 40)
|
print("=" * 40)
|
||||||
print("Test failed")
|
print("Test failed")
|
||||||
print(err)
|
print(err)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
testdir = Path(testdir)
|
||||||
|
results_fname = testdir / ".aider.results.json"
|
||||||
|
results_fname.write_text(json.dumps(dict(exception=str(err))))
|
||||||
|
|
||||||
|
|
||||||
def run_test_real(
|
def run_test_real(
|
||||||
original_dname,
|
original_dname,
|
||||||
|
|
|
@ -175,4 +175,4 @@ See the
|
||||||
[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
|
[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
|
||||||
for information on running aider's code editing benchmark.
|
for information on running aider's code editing benchmark.
|
||||||
Submit results by opening a PR with edits to the
|
Submit results by opening a PR with edits to the
|
||||||
[benchmark results CSV data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
|
[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue