From 59fed25cd11d7a2569e2041f44e6acaa3b31477a Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Wed, 8 Nov 2023 08:36:48 -0800
Subject: [PATCH] copy

---
 assets/benchmarks-1106.svg | 106 ++++++++++++++++++-------------------
 benchmark/benchmark.py     |   5 ++
 docs/benchmarks-1106.md    |  20 ++++---
 docs/benchmarks.md         |  10 ++--
 4 files changed, 72 insertions(+), 69 deletions(-)

diff --git a/assets/benchmarks-1106.svg b/assets/benchmarks-1106.svg
index af440dcee..9cd4d9852 100644
--- a/assets/benchmarks-1106.svg
+++ b/assets/benchmarks-1106.svg
[generated SVG markup omitted: the benchmark chart was regenerated, updating its embedded timestamp (2023-11-07T18:16:15 -> 2023-11-08T08:33:48), its auto-generated clip-path id (p69a8ae8466 -> p3e57cb55dd), and the plotted bar geometry]
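Nearly all of the churn in that SVG diff comes from a fresh render timestamp and newly generated clip-path ids rather than from the underlying data. If the chart is rendered with matplotlib (an assumption here; the plotting code itself is not part of this patch), that noise can largely be suppressed by pinning the embedded date and the id hash salt. A minimal sketch:

```python
import os

# Assumption: the benchmark chart is produced by matplotlib.
# SOURCE_DATE_EPOCH pins the date matplotlib embeds in saved figures,
# so re-rendering an unchanged chart does not churn the timestamp.
os.environ.setdefault("SOURCE_DATE_EPOCH", "0")

import matplotlib
import matplotlib.pyplot as plt

# A fixed svg.hashsalt makes the auto-generated SVG ids (such as the
# clip-path names that change in this diff) deterministic across runs.
matplotlib.rcParams["svg.hashsalt"] = "aider-benchmark"

fig, ax = plt.subplots()
ax.bar(["first try", "second try"], [53, 62])  # illustrative pass rates from the prose below
ax.set_ylabel("percent of exercises passing")
fig.savefig("benchmarks-1106.svg")
```

With both pinned, re-rendering an unchanged chart should produce little or no diff, so only real data changes show up in review.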
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 8979180f8..b60868011 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -121,9 +121,14 @@ def show_stats(dirnames):
     df.sort_values(by=["model", "edit_format"], inplace=True)
 
     dump(df)
 
+    # plot_timing(df)
     plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
 
 
+def plot_timing(df):
+    """plot a graph showing the average duration of each (model, edit_format)"""
+
+
 def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
     tries = [df.groupby(["model", "edit_format"])["pass_rate_2"].mean()]
     if True:
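The new `plot_timing` function is only a stub with a docstring, and the call to it in `show_stats` is commented out. As a rough illustration only, here is a minimal sketch of what it might grow into, assuming the stats dataframe carries a per-exercise duration column (named `avg_duration` below purely as a placeholder) alongside the `model` and `edit_format` columns that `show_stats` already uses:

```python
import matplotlib.pyplot as plt
import pandas as pd


def plot_timing(df):
    """plot a graph showing the average duration of each (model, edit_format)"""
    # Same grouping that plot_outcomes applies to pass_rate_2, but over a
    # hypothetical duration column; the real column name may differ.
    durations = df.groupby(["model", "edit_format"])["avg_duration"].mean().unstack()

    ax = durations.plot.bar(figsize=(10, 6))
    ax.set_ylabel("average seconds per exercise")
    ax.set_title("benchmark timing by model and edit format")

    plt.tight_layout()
    plt.savefig("tmp.timing.svg")


# Example call with made-up numbers, just to show the expected dataframe shape:
plot_timing(
    pd.DataFrame(
        {
            "model": ["gpt-3.5-turbo-0613", "gpt-4-0613"],
            "edit_format": ["whole", "diff"],
            "avg_duration": [20.0, 45.0],
        }
    )
)
```

Grouping on `["model", "edit_format"]` mirrors how `plot_outcomes` aggregates `pass_rate_2`, so a timing chart would line up with the existing outcomes chart.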
diff --git a/docs/benchmarks-1106.md b/docs/benchmarks-1106.md
index 4ce3b4cf3..b78b3510d 100644
--- a/docs/benchmarks-1106.md
+++ b/docs/benchmarks-1106.md
@@ -45,22 +45,20 @@ This is the edit format that aider uses by default with gpt-4.
 - The new `gpt-4-1106-preview` model seems **much faster** than the earlier GPT-4 models. I won't be able to properly quantify this until the rate limits loosen up.
 - **It seems better at producing correct code on the first try**. It gets
-~54% of the coding exercises correct, without needing to see errors from the test suite. Previous models only get 46-47% of the exercises correct on the first try.
+53% of the coding exercises correct, without needing to see errors from the test suite. Previous models only get 46-47% of the exercises correct on the first try.
 - The new model seems to perform similar
-(~63%) to the old models (63-64%) after their second chance to correct bugs by reviewing test suite error output.
+(~62%) to the old models (63-64%) after their second chance to correct bugs by reviewing test suite error output.
 
 **These are preliminary results.**
 OpenAI is enforcing very low
 rate limits on the new GPT-4 model.
-The rate limiting is disrupting the normal flow of the benchmarking process,
-which needs to be restarted after pauses.
-The benchmarking tool is capable of such restarts, but
-I will trust a "clean" run much better once the rate limits are relaxed.
-The results currently reflect
-130
-out of the 133 Exercism problems.
-The problems are selected in random order, so results should be *roughly*
-indicative of the full benchmark.
+The rate limiting disrupts the benchmarking process,
+requiring it to be paused and restarted frequently.
+It took ~20 partial runs over ~2 days to complete all 133 Exercism problems.
+The benchmarking harness is designed to stop/restart in this manner,
+but results from a single "clean" run would be more trustworthy.
+Once the rate limits are relaxed I will do a clean
+run of the entire benchmark.
 
 ### gpt-3.5-turbo-1106
 
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index b5cbb3a1d..43fe9a071 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -69,7 +69,7 @@ More details on the benchmark, edit formats and results are discussed below.
 
 ## The benchmark
 
-The benchmark uses 
+The benchmark uses
 [133 practice exercises from the Exercism python repository](https://github.com/exercism/python/tree/main/exercises/practice).
 These exercises were
 designed to help individuals learn Python and hone
@@ -199,7 +199,7 @@ demo.py
 
 ### whole-func
 
-The [whole-func](https://github.com/paul-gauthier/aider/blob/main/aider/coders/wholefile_func_coder.py) 
+The [whole-func](https://github.com/paul-gauthier/aider/blob/main/aider/coders/wholefile_func_coder.py)
 format requests updated copies of whole files to be returned using the function call API.
 
@@ -218,7 +218,7 @@ format requests updated copies of whole files to be returned using the function
 
 The
 [diff-func](https://github.com/paul-gauthier/aider/blob/main/aider/coders/editblock_func_coder.py)
-format requests a list of 
+format requests a list of
 original/updated style edits to be returned using the function call API.
 
 ```
@@ -235,7 +235,7 @@ original/updated style edits to be returned using the function call API.
         ],
     }
 ]
-} 
+}
 ```
 
 ## GPT-3.5's performance
@@ -307,7 +307,7 @@ The benchmark harness also logs SHA hashes of
 all the OpenAI API requests and replies.
 This makes it possible to detect randomness or nondeterminism
-in the bechmarking process.
+in the benchmarking process.
 It turns out that the OpenAI
 chat APIs are not deterministic, even at `temperature=0`.
 The same identical request will produce multiple
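The SHA hashing mentioned in that last hunk only needs to map each request and reply to a stable digest. A minimal sketch of the idea (not necessarily how the benchmark harness actually implements it), hashing a canonical JSON dump of the payload:

```python
import hashlib
import json


def payload_hash(payload):
    """Return a stable SHA-1 hex digest for an API request or reply dict."""
    # Sorting keys makes the serialization canonical, so identical payloads
    # always hash identically across runs.
    canonical = json.dumps(payload, sort_keys=True, ensure_ascii=False).encode("utf-8")
    return hashlib.sha1(canonical).hexdigest()


# Example: hash a chat completion request before sending it.
request = {
    "model": "gpt-3.5-turbo",
    "temperature": 0,
    "messages": [{"role": "user", "content": "hi"}],
}
print(payload_hash(request))
```

Comparing the logged digests across runs then shows directly whether an identical request produced different replies.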