From 3bb237bdc1d0d4a68aefb168c2c26eac2bd08998 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Sun, 5 May 2024 08:24:45 -0700
Subject: [PATCH] handle tasks with exceptions in the stats output

---
 _data/refactor_leaderboard.csv |  2 +-
 benchmark/benchmark.py         | 25 ++++++++++++++++---------
 docs/leaderboards/index.md     |  2 +-
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/_data/refactor_leaderboard.csv b/_data/refactor_leaderboard.csv
index a67bb7b03..6ae17acd8 100644
--- a/_data/refactor_leaderboard.csv
+++ b/_data/refactor_leaderboard.csv
@@ -3,4 +3,4 @@ gpt-4-turbo-2024-04-09,0,34.1,udiff,aider --gpt-4-turbo,0.27.1-dev,b75fdb9,4/9/2
 gpt-4-0125-preview,0,43.8,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,0fbd702,1/25/24
 gpt-4-1106-preview,0,57.3,udiff,aider --model gpt-4-1106-preview,0.22.1-dev,a75e7c8,1/25/24
 claude-3-opus-20240229,0,67.4,diff,aider --opus,0.31.2-dev,b02320b-dirty,5/4/24
-gemini/gemini-1.5-pro-latest,0.0,50.6,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,3e4fca2-dirty 1b35ca2-dirty 425cb29,5/4/24
+gemini/gemini-1.5-pro-latest,0.0,49.4,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,425cb29 1b35ca2-dirty a0649ba-dirty 3e4fca2-dirty,2024-05-04
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 389b0a46c..3ed589b09 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -759,12 +759,13 @@ def load_results(dirname):
 
 def summarize_results(dirname):
     all_results = load_results(dirname)
+    dump(len(all_results))
 
     res = SimpleNamespace()
     res.total_tests = len(list(Path(dirname).glob("*")))
 
     try:
-        tries = max(len(results["tests_outcomes"]) for results in all_results if results)
+        tries = max(len(results.get("tests_outcomes", [])) for results in all_results if results)
     except ValueError:
         tries = 0
 
@@ -791,13 +792,14 @@ def summarize_results(dirname):
             continue
 
         res.completed_tests += 1
-        passed = results["tests_outcomes"][-1]
+        tests_outcomes = results.get("tests_outcomes", [])
+        passed = tests_outcomes and tests_outcomes[-1]
         if passed:
-            for i in range(len(results["tests_outcomes"]) - 1, tries):
+            for i in range(len(tests_outcomes) - 1, tries):
                 passed_tests[i] += 1
 
-        res.cost += results["cost"]
-        res.duration += results["duration"]
+        res.cost += results.get("cost", 0)
+        res.duration += results.get("duration", 0)
         res.test_timeouts += results.get("test_timeouts", 0)
 
         res.error_outputs += results.get("num_error_outputs", 0)
@@ -811,7 +813,8 @@ def summarize_results(dirname):
 
         for key in "model edit_format commit_hash".split():
             val = results.get(key)
-            variants[key].add(val)
+            if val:
+                variants[key].add(val)
 
     if not res.completed_tests:
         return
@@ -903,7 +906,7 @@ def summarize_results(dirname):
     csv.append(dirname.name[:10])
     csv = ",".join(csv)
     print()
-    print("Add this to _data/leaderboard.csv:")
+    print("Add this to the files in _data:")
     print(csv)
 
     console.rule()
@@ -928,15 +931,19 @@ def get_replayed_content(replay_dname, test_dname):
     return "".join(res)
 
 
-def run_test(*args, **kwargs):
+def run_test(original_dname, testdir, *args, **kwargs):
     try:
-        return run_test_real(*args, **kwargs)
+        return run_test_real(original_dname, testdir, *args, **kwargs)
     except Exception as err:
         print("=" * 40)
         print("Test failed")
         print(err)
         traceback.print_exc()
 
+        testdir = Path(testdir)
+        results_fname = testdir / ".aider.results.json"
+        results_fname.write_text(json.dumps(dict(exception=str(err))))
+
 
 def run_test_real(
     original_dname,
diff --git a/docs/leaderboards/index.md b/docs/leaderboards/index.md
index 7caca7369..ad83bfaaa 100644
--- a/docs/leaderboards/index.md
+++ b/docs/leaderboards/index.md
@@ -175,4 +175,4 @@ See the
 [benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
 for information on running aider's code editing benchmark.
 Submit results by opening a PR with edits to the
-[benchmark results CSV data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
+[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
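
Below is a minimal standalone sketch of the pattern this patch introduces in benchmark/benchmark.py: run_test() delegates to run_test_real(), and if the task raises, the exception is written into that task's .aider.results.json so the stats pass still sees the run; the summary step then reads the usual fields defensively with .get(). The run_test/run_test_real names and the results filename mirror the patch, while the trimmed summarize() helper and the sample failure are illustrative assumptions, not the actual benchmark harness.

    import json
    import traceback
    from pathlib import Path
    from types import SimpleNamespace


    def run_test_real(original_dname, testdir):
        # Stand-in for the real benchmark task; assumed to raise on failure.
        raise RuntimeError("model produced an unparseable edit")


    def run_test(original_dname, testdir, *args, **kwargs):
        try:
            return run_test_real(original_dname, testdir, *args, **kwargs)
        except Exception as err:
            traceback.print_exc()
            # Record the exception so this task still appears in the stats output.
            results_fname = Path(testdir) / ".aider.results.json"
            results_fname.write_text(json.dumps(dict(exception=str(err))))


    def summarize(all_results):
        # Defensive reads: exception-only result files lack the usual keys.
        res = SimpleNamespace(passed=0, cost=0, duration=0)
        for results in all_results:
            if not results:
                continue
            outcomes = results.get("tests_outcomes", [])
            res.passed += bool(outcomes and outcomes[-1])
            res.cost += results.get("cost", 0)
            res.duration += results.get("duration", 0)
        return res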