From 426819e70323fb0a8257d2579878eb8480f8a13f Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 7 Nov 2023 10:53:27 -0800 Subject: [PATCH] copy --- assets/benchmarks-1106.svg | 370 +++++++++++++++++++++---------------- benchmark/rungrid.py | 31 ++-- docs/benchmarks-1106.md | 13 +- 3 files changed, 233 insertions(+), 181 deletions(-) diff --git a/assets/benchmarks-1106.svg b/assets/benchmarks-1106.svg index cbc8b3041..cbd1f213b 100644 --- a/assets/benchmarks-1106.svg +++ b/assets/benchmarks-1106.svg @@ -6,7 +6,7 @@ - 2023-11-07T10:21:00.074181 + 2023-11-07T10:52:05.474726 image/svg+xml @@ -31,8 +31,8 @@ z @@ -41,17 +41,17 @@ z - - + - + - + - + - + @@ -384,7 +384,7 @@ z - + - + - + @@ -449,7 +449,7 @@ z - + @@ -461,12 +461,12 @@ z - + - + - + @@ -507,12 +507,12 @@ z - + - + @@ -520,7 +520,7 @@ z - + @@ -532,12 +532,12 @@ z - + - + @@ -545,7 +545,7 @@ z - + - + +L 404.21745 239.24 +" clip-path="url(#pa703530d94)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -788,12 +788,12 @@ L -3.5 0 +L 404.21745 196.530625 +" clip-path="url(#pa703530d94)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -834,12 +834,12 @@ z +L 404.21745 153.82125 +" clip-path="url(#pa703530d94)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -853,12 +853,12 @@ L 404.17201 153.82125 +L 404.21745 111.111875 +" clip-path="url(#pa703530d94)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -872,12 +872,12 @@ L 404.17201 111.111875 +L 404.21745 68.4025 +" clip-path="url(#pa703530d94)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -932,12 +932,12 @@ z +L 404.21745 25.693125 +" clip-path="url(#pa703530d94)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1197,82 +1197,91 @@ z - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; 
stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - - - + + - - + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + - + - + - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: 
#ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pa703530d94)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - - - + - + - + @@ -1537,7 +1579,7 @@ z - + @@ -1556,7 +1598,7 @@ z - + @@ -1572,17 +1614,17 @@ z - - - + - + - + @@ -1658,7 +1700,7 @@ z - + @@ -1673,9 +1715,9 @@ z - + - + - + @@ -1816,7 +1858,7 @@ L 56.81 49.124375 z " style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - + @@ -1833,7 +1875,7 @@ L 56.81 63.382187 z " style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - + @@ -1868,8 +1910,8 @@ z - - + + diff --git a/benchmark/rungrid.py b/benchmark/rungrid.py index d99fe4508..f2dd53dd5 100755 --- a/benchmark/rungrid.py +++ b/benchmark/rungrid.py @@ -8,34 +8,35 @@ from aider.dump import dump # noqa: F401 def main(): models = [ - # "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", # "gpt-3.5-turbo-16k-0613", + "gpt-3.5-turbo-1106", # "gpt-4-0314", # "gpt-4-0613", ] edit_formats = [ - # "diff", + "diff", # "diff-func", - "whole", + # "whole", # "whole-func", ] - for repeat in range(1, 10, 1): - for model in models: - for edit_format in edit_formats: - # dump(model, edit_format) + # for repeat in range(1, 2, 1): + for 
model in models: + for edit_format in edit_formats: + # dump(model, edit_format) - if "-func" in edit_format and "-03" in model: - continue + if "-func" in edit_format and "-03" in model: + continue - # if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"): - # # sublist reliably hangs the API? - # continue + # if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"): + # # sublist reliably hangs the API? + # continue - # dirname = f"rungrid-{model}-{edit_format}" - dirname = f"rungrid-{model}-{edit_format}-repeat-{repeat}" - run(dirname, model, edit_format) + dirname = f"rungrid-nov-{model}-{edit_format}" + # dirname = f"rungrid-{model}-{edit_format}-repeat-{repeat}" + run(dirname, model, edit_format) def run(dirname, model, edit_format): diff --git a/docs/benchmarks-1106.md b/docs/benchmarks-1106.md index 4b008e118..cbac5fe25 100644 --- a/docs/benchmarks-1106.md +++ b/docs/benchmarks-1106.md @@ -40,26 +40,35 @@ The benchmark gives aider two tries to complete the task: ### gpt-4-1106-preview +For now, I have only benchmarked the GPT-4 models using the `diff` edit method. +This is the edit format that aider uses by default with gpt-4. + - The new `gpt-4-1106-preview` model seems **much faster** than the earlier GPT-4 models. I won't be able to properly quantify this until the rate limits loosen up. -- **It seems better at producing correct code on the first try**. It gets ~56% of the coding exercises correct, without needing to see errors from the test suite. Previous models only get 46-47% of the exercises correct on the first try. +- **It seems better at producing correct code on the first try**. It gets ~57% of the coding exercises correct, without needing to see errors from the test suite. Previous models only get 46-47% of the exercises correct on the first try. - The new model seems to perform similar (66%) to the old models (63-64%) after being given a second chance to correct bugs by reviewing test suite error output. 
**These results are preliminary.**