From c86a957cf5b783ea51fe588b1b50faa3780146ad Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 7 Nov 2023 18:18:02 -0800 Subject: [PATCH] copy --- assets/benchmarks-1106.svg | 106 ++++++++++++++++++------------------- docs/benchmarks-1106.md | 14 +++-- 2 files changed, 62 insertions(+), 58 deletions(-) diff --git a/assets/benchmarks-1106.svg b/assets/benchmarks-1106.svg index fb4278f78..af440dcee 100644 --- a/assets/benchmarks-1106.svg +++ b/assets/benchmarks-1106.svg @@ -6,7 +6,7 @@ - 2023-11-07T13:57:52.178577 + 2023-11-07T18:16:15.748217 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -363,7 +363,7 @@ z - + @@ -428,7 +428,7 @@ z - + @@ -461,7 +461,7 @@ z - + @@ -507,7 +507,7 @@ z - + @@ -532,7 +532,7 @@ z - + @@ -766,16 +766,16 @@ z +" clip-path="url(#p69a8ae8466)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -789,11 +789,11 @@ L -3.5 0 +" clip-path="url(#p69a8ae8466)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -835,11 +835,11 @@ z +" clip-path="url(#p69a8ae8466)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -854,11 +854,11 @@ L 404.21745 153.82125 +" clip-path="url(#p69a8ae8466)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -873,11 +873,11 @@ L 404.21745 111.111875 +" clip-path="url(#p69a8ae8466)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -933,11 +933,11 @@ z +" clip-path="url(#p69a8ae8466)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1202,7 +1202,7 @@ L 86.895575 239.24 L 86.895575 192.677148 L 64.010339 192.677148 z -" clip-path="url(#p538038ff23)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: 
#b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - - + + - + @@ -1430,7 +1430,7 @@ L 86.895575 239.24 L 86.895575 199.09961 L 64.010339 199.09961 z -" clip-path="url(#p538038ff23)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; 
stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p69a8ae8466)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> + diff --git a/docs/benchmarks-1106.md b/docs/benchmarks-1106.md index 1906781a0..4ce3b4cf3 100644 --- a/docs/benchmarks-1106.md +++ b/docs/benchmarks-1106.md @@ -45,15 +45,19 @@ This is the edit format that aider uses by default with gpt-4. - The new `gpt-4-1106-preview` model seems **much faster** than the earlier GPT-4 models. I won't be able to properly quantify this until the rate limits loosen up. - **It seems better at producing correct code on the first try**. It gets -~57% of the coding exercises correct, without needing to see errors from the test suite. Previous models only get 46-47% of the exercises correct on the first try. +~54% of the coding exercises correct, without needing to see errors from the test suite. Previous models only get 46-47% of the exercises correct on the first try. - The new model seems to perform similar -(~66%) to the old models (63-64%) after being given a second chance to correct bugs by reviewing test suite error output. +(~63%) to the old models (63-64%) after their second chance to correct bugs by reviewing test suite error output. 
**These are preliminary results.** OpenAI is enforcing very low -rate limits on the new GPT-4 model. The limits are so low, that -I have only been able to attempt -113 +rate limits on the new GPT-4 model. +The rate limiting is disrupting the normal flow of the benchmarking process, +which needs to be restarted after pauses. +The benchmarking tool is capable of such restarts, but +I will trust a "clean" run much more once the rate limits are relaxed. +The results currently reflect +130 out of the 133 Exercism problems. The problems are selected in random order, so results should be *roughly* indicative of the full benchmark.