diff --git a/assets/benchmarks-1106.svg b/assets/benchmarks-1106.svg index b3358b8a0..0f7cd893a 100644 --- a/assets/benchmarks-1106.svg +++ b/assets/benchmarks-1106.svg @@ -6,7 +6,7 @@ - 2023-11-06T18:21:59.967861 + 2023-11-06T18:47:53.342618 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -363,7 +363,7 @@ z - + @@ -428,7 +428,7 @@ z - + @@ -461,7 +461,7 @@ z - + @@ -507,7 +507,7 @@ z - + @@ -532,7 +532,7 @@ z - + @@ -766,16 +766,16 @@ z +" clip-path="url(#pacb09cb929)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -789,11 +789,11 @@ L -3.5 0 +" clip-path="url(#pacb09cb929)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -835,11 +835,11 @@ z +" clip-path="url(#pacb09cb929)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -854,11 +854,11 @@ L 404.17201 153.82125 +" clip-path="url(#pacb09cb929)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -873,11 +873,11 @@ L 404.17201 111.111875 +" clip-path="url(#pacb09cb929)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -933,11 +933,11 @@ z +" clip-path="url(#pacb09cb929)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1199,17 +1199,17 @@ z +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - - + + + + + - + @@ -1377,34 +1393,34 @@ z - + - - + + - + +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#pacb09cb929)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - - @@ -1635,19 +1651,21 @@ z - - - - - - - - - - + + + + + + + + + + + + - + @@ -1861,7 +1879,7 @@ z - + diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index d445ac59f..2e6bf31ea 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -196,8 +196,8 @@ def show_stats(dirnames): arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"}, ) ax.annotate( - "Second attempt,\nbased on\nunit test errors", - xy=(2.55, 58), + "Second attempt,\nincludingn\nunit test errors", + xy=(2.55, 56), xytext=(3.5, top), horizontalalignment="center", verticalalignment="top", diff --git a/docs/benchmarks-1106.md b/docs/benchmarks-1106.md index 5d57a5caf..a45794a78 100644 --- a/docs/benchmarks-1106.md +++ b/docs/benchmarks-1106.md @@ -48,7 +48,8 @@ indicative of the full benchmark. ## gpt-3.5-turbo-1106 - The new `gpt-3.5-turbo-1106` model is completing the benchmark **3-4X faster** than the earlier GPT-3.5 models. -- The overall success rate after the first and second tries seems comparable or a bit better than the earlier models. +- The success rate after the first try of 42% is comparable to the previous June (0613) model. They are both worse than the original March (0301) model's 50% result on the first try. +- The 56% success rate after the second try seems comparable to the original March model, and somewhat better than the June model's 50% score. ## Updates