From 411c744a15ffed902ccff8a842b4f98719a02d87 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Sun, 2 Jul 2023 06:09:27 -0700
Subject: [PATCH] run repeats

---
 benchmark/rungrid.py | 61 ++++++++++++++++++++++++--------------------
 docs/benchmarks.md   | 15 ++++++-----
 2 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/benchmark/rungrid.py b/benchmark/rungrid.py
index b8e5614f6..d99fe4508 100755
--- a/benchmark/rungrid.py
+++ b/benchmark/rungrid.py
@@ -9,45 +9,50 @@ from aider.dump import dump  # noqa: F401
 def main():
     models = [
         # "gpt-3.5-turbo-0301",
-        # "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-0613",
         # "gpt-3.5-turbo-16k-0613",
-        "gpt-4-0314",
-        "gpt-4-0613",
+        # "gpt-4-0314",
+        # "gpt-4-0613",
     ]
 
     edit_formats = [
-        "diff",
-        "diff-func",
+        # "diff",
+        # "diff-func",
         "whole",
-        "whole-func",
+        # "whole-func",
     ]
 
-    for model in models:
-        for edit_format in edit_formats:
-            # dump(model, edit_format)
+    for repeat in range(1, 10, 1):
+        for model in models:
+            for edit_format in edit_formats:
+                # dump(model, edit_format)
 
-            if "-func" in edit_format and "-03" in model:
-                continue
+                if "-func" in edit_format and "-03" in model:
+                    continue
 
-            if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"):
-                # sublist reliably hangs the API?
-                continue
+                # if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"):
+                #     # sublist reliably hangs the API?
+                #     continue
 
-            dirname = f"rungrid-{model}-{edit_format}"
+                # dirname = f"rungrid-{model}-{edit_format}"
+                dirname = f"rungrid-{model}-{edit_format}-repeat-{repeat}"
+                run(dirname, model, edit_format)
 
-            cmd = [
-                "./benchmark/benchmark.py",
-                dirname,
-                "--model",
-                model,
-                "--edit-format",
-                edit_format,
-                "--threads",
-                "10",
-                "--cont",
-            ]
-            print(" ".join(cmd))
-            subprocess.run(cmd, check=True)
+def run(dirname, model, edit_format):
+    cmd = [
+        "./benchmark/benchmark.py",
+        dirname,
+        "--model",
+        model,
+        "--edit-format",
+        edit_format,
+        "--threads",
+        "10",
+        "--cont",
+    ]
+    print(" ".join(cmd))
+
+    subprocess.run(cmd, check=True)
 
 
 if __name__ == "__main__":
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index a1ffe24f1..b83f31f9a 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -24,8 +24,8 @@ Aider currently uses simple text based editing formats, but
 [OpenAI's new function calling
 API](https://openai.com/blog/function-calling-and-other-api-updates)
 looks like a promising way to create more structured edit formats.
-Before making such a big change, I wanted to make
-sure I had a quantitative way to assess the benefits
+Before making such a big change, I wanted
+a quantitative way to assess the benefits
 of function based editing.
 
 With this in mind, I developed a
@@ -70,6 +70,7 @@ Using more complex output formats with GPT seems to introduce two issues:
 I was expecting to start using function call based edits in aider for both GPT-3.5 and GPT-4.
 But given these benchmark results, I won't be adopting the functions API
 at this time.
+I will certainly plan to benchmark functions again with future versions of the models.
 
 More details on the benchmark, edit formats and results are discussed below.
 
@@ -84,9 +85,9 @@ their coding skills.
 
 Each exercise includes:
 
-  - Instructions, provided in markdown files.
-  - Stub python code in an *implementation file*, specifying the functions or classes that need to be implemented.
-  - Unit tests in a separate python file.
+  - [Instructions](https://github.com/exercism/python/blob/main/exercises/practice/anagram/.docs/instructions.md), provided in markdown files.
+  - [Stub python code](https://github.com/exercism/python/blob/main/exercises/practice/anagram/anagram.py) in an *implementation file*, specifying the functions or classes that need to be implemented.
+  - [Unit tests](https://github.com/exercism/python/blob/main/exercises/practice/anagram/anagram_test.py) in a separate python file.
 
 The goal is for GPT to read the instructions, implement the provided function/class skeletons
 and pass all the unit tests. The benchmark measures what percentage of
@@ -152,7 +153,7 @@ format, resulting in the code not being saved correctly.
 
 It's worth keeping in mind that changing the edit format often affects
 both aspects of GPT's performance.
-Complex edit formats often lead to poorer code *and* make it less
+Complex edit formats often lead GPT to write worse code *and* make it less
 successful at formatting the edits correctly.
 
 
@@ -323,7 +324,7 @@ distinct responses, usually less than 5-10 variations. This suggests
 that OpenAI may be load balancing their API across a number of
 slightly different instances of the model?
 
-For some exercises, some of these variable responses pass the unit tests while
+For certain exercises, some of these variable responses pass the unit tests while
 other variants do not. Results for exercises like this, which are
 "on the bubble",
 are therefore a bit random, depending on which variant OpenAI returns.