From 411c744a15ffed902ccff8a842b4f98719a02d87 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Sun, 2 Jul 2023 06:09:27 -0700
Subject: [PATCH] run repeats

---
 benchmark/rungrid.py | 61 ++++++++++++++++++++++++--------------------
 docs/benchmarks.md   | 15 ++++++-----
 2 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/benchmark/rungrid.py b/benchmark/rungrid.py
index b8e5614f6..d99fe4508 100755
--- a/benchmark/rungrid.py
+++ b/benchmark/rungrid.py
@@ -9,45 +9,50 @@ from aider.dump import dump  # noqa: F401
 def main():
     models = [
         # "gpt-3.5-turbo-0301",
-        # "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-0613",
         # "gpt-3.5-turbo-16k-0613",
-        "gpt-4-0314",
-        "gpt-4-0613",
+        # "gpt-4-0314",
+        # "gpt-4-0613",
     ]
 
     edit_formats = [
-        "diff",
-        "diff-func",
+        # "diff",
+        # "diff-func",
         "whole",
-        "whole-func",
+        # "whole-func",
     ]
 
-    for model in models:
-        for edit_format in edit_formats:
-            # dump(model, edit_format)
+    for repeat in range(1, 10, 1):
+        for model in models:
+            for edit_format in edit_formats:
+                # dump(model, edit_format)
 
-            if "-func" in edit_format and "-03" in model:
-                continue
+                if "-func" in edit_format and "-03" in model:
+                    continue
 
-            if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"):
-                # sublist reliably hangs the API?
-                continue
+                # if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"):
+                #     # sublist reliably hangs the API?
+                #     continue
 
-            dirname = f"rungrid-{model}-{edit_format}"
+                # dirname = f"rungrid-{model}-{edit_format}"
+                dirname = f"rungrid-{model}-{edit_format}-repeat-{repeat}"
+                run(dirname, model, edit_format)
 
-            cmd = [
-                "./benchmark/benchmark.py",
-                dirname,
-                "--model",
-                model,
-                "--edit-format",
-                edit_format,
-                "--threads",
-                "10",
-                "--cont",
-            ]
-            print(" ".join(cmd))
-            subprocess.run(cmd, check=True)
+def run(dirname, model, edit_format):
+    cmd = [
+        "./benchmark/benchmark.py",
+        dirname,
+        "--model",
+        model,
+        "--edit-format",
+        edit_format,
+        "--threads",
+        "10",
+        "--cont",
+    ]
+    print(" ".join(cmd))
+
+    subprocess.run(cmd, check=True)
 
 
 if __name__ == "__main__":
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index a1ffe24f1..b83f31f9a 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -24,8 +24,8 @@ Aider currently uses simple text based editing formats, but
 [OpenAI's new function calling
 API](https://openai.com/blog/function-calling-and-other-api-updates)
 looks like a promising way to create more structured edit formats.
-Before making such a big change, I wanted to make
-sure I had a quantitative way to assess the benefits
+Before making such a big change, I wanted
+a quantitative way to assess the benefits
 of function based editing.
 
 With this in mind, I developed a
@@ -70,6 +70,7 @@ Using more complex output formats with GPT seems to introduce two issues:
 I was expecting to start using function call based edits in aider for both GPT-3.5 and GPT-4.
 But given these benchmark results, I won't be adopting the functions API
 at this time.
+I will certainly plan to benchmark functions again with future versions of the models.
 
 More details on the benchmark, edit formats and results are discussed below.
 
@@ -84,9 +85,9 @@ their coding skills.
 
 Each exercise includes:
 
-  - Instructions, provided in markdown files.
-  - Stub python code in an *implementation file*, specifying the functions or classes that need to be implemented.
-  - Unit tests in a separate python file.
+  - [Instructions](https://github.com/exercism/python/blob/main/exercises/practice/anagram/.docs/instructions.md), provided in markdown files.
+  - [Stub python code](https://github.com/exercism/python/blob/main/exercises/practice/anagram/anagram.py) in an *implementation file*, specifying the functions or classes that need to be implemented.
+  - [Unit tests](https://github.com/exercism/python/blob/main/exercises/practice/anagram/anagram_test.py) in a separate python file.
 
 The goal is for GPT to read the instructions, implement the provided function/class skeletons
 and pass all the unit tests. The benchmark measures what percentage of
@@ -152,7 +153,7 @@ format, resulting in the code not being saved correctly.
 
 It's worth keeping in mind that changing the edit format often affects
 both aspects of GPT's performance.
-Complex edit formats often lead to poorer code *and* make it less
+Complex edit formats often lead GPT to write worse code *and* make it less
 successful at formatting the edits correctly.
 
 
@@ -323,7 +324,7 @@ distinct responses, usually less than 5-10 variations. This suggests
 that OpenAI may be load balancing their API across a number of
 slightly different instances of the model?
 
-For some exercises, some of these variable responses pass the unit tests while
+For certain exercises, some of these variable responses pass the unit tests while
 other variants do not. Results for exercises like this, which are
 "on the bubble",
 are therefore a bit random, depending on which variant OpenAI returns.