diff --git a/assets/benchmarks.svg b/assets/benchmarks.svg index 2dfedbda3..ee7e0b175 100644 --- a/assets/benchmarks.svg +++ b/assets/benchmarks.svg @@ -6,7 +6,7 @@ - 2023-07-01T17:07:13.416292 + 2023-07-01T17:50:36.848487 image/svg+xml @@ -32,8 +32,8 @@ z @@ -41,12 +41,12 @@ z - - + @@ -363,7 +363,7 @@ z - + @@ -428,7 +428,7 @@ z - + @@ -482,7 +482,7 @@ z - + @@ -525,7 +525,7 @@ z - + @@ -550,16 +550,16 @@ z +" clip-path="url(#pf511fcd1c4)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -571,18 +571,18 @@ L -3.5 0 - + - + - + - + - + - + @@ -636,18 +636,18 @@ L 840.24 320.23395 - + - + - + @@ -655,18 +655,18 @@ L 840.24 229.834925 - + - + - + - + - + - + @@ -735,7 +735,7 @@ L 840.24 49.036875 - + - - + + - - - - - - - - - - - - - + + + + + + + + + + + + + + +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; 
stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> @@ -1229,13 +1230,13 @@ L 840.24 501.032 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/> - - + - + @@ -1300,7 +1301,7 @@ z - + @@ -1308,7 +1309,7 @@ z - + @@ -1316,7 +1317,7 @@ z - + @@ -1324,7 +1325,7 @@ z - + @@ -1332,7 +1333,7 @@ z - + @@ -1340,7 +1341,7 @@ z - + - + @@ -1388,7 +1389,7 @@ z - + @@ -1396,7 +1397,7 @@ z - + @@ -1404,7 +1405,7 @@ z - + @@ -1412,7 +1413,7 @@ z - + @@ -1420,7 +1421,7 @@ z - + @@ -1428,7 +1429,7 @@ z - + @@ -1436,7 +1437,7 @@ z - + @@ -1445,163 +1446,163 @@ z +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; 
stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: 
url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> +" clip-path="url(#pf511fcd1c4)" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> - - - + - + - - - + - + @@ -1792,29 +1793,70 @@ z - - + + - - - - - - - - - - - + + + + + + + + + + + + - - + + + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - + @@ -1911,16 +1949,16 @@ z - +" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> - + @@ -1928,16 +1966,16 @@ z - +" style="fill: url(#h933a162d1d); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> - + @@ -1950,16 +1988,16 @@ z - +" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> - + @@ -1968,16 +2006,16 @@ z - +" style="fill: url(#hee5da66861); stroke: #ffffff; stroke-width: 3; stroke-linejoin: miter"/> - + @@ -1994,12 +2032,12 @@ z - - + + - + +" style="fill: #ffffff; stroke: #ffffff; stroke-width: 0.5; stroke-linecap: butt; stroke-linejoin: miter"/> - + +" style="fill: #ffffff; stroke: #ffffff; stroke-width: 0.5; stroke-linecap: butt; stroke-linejoin: miter"/> diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 9d885d86c..f840c3230 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -106,8 +106,8 @@ def show_stats(dirnames): for i, fmt in enumerate(formats): if zorder > 1: edge = dict( - edgecolor="#eeeeee", - linewidth=2, + edgecolor="#ffffff", + linewidth=3, ) else: edge = dict() @@ -148,16 +148,16 @@ def show_stats(dirnames): arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"}, ) ax.annotate( - "Second attempt,\nafter seeing\nunittest errors", + "Second attempt,\nafter seeing\nunit test errors", xy=(3.1, 68), xytext=(4.25, 80), horizontalalignment="center", arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"}, ) - ax.set_ylabel("Percent of exercises with\nall unittests passing") + ax.set_ylabel("Percent of exercises with\nall unit 
tests passing") # ax.set_xlabel("Model") - ax.set_title("Code Editing Success") + ax.set_title("GPT Code Editing") ax.legend( title="Edit Format", loc="upper left", diff --git a/docs/benchmarks.md b/docs/benchmarks.md index b90e7bf6d..ed2e25ed6 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -10,18 +10,18 @@ improvements to your code. The ability for GPT to reliably edit local source files is crucial for this functionality. -Improving the reliability of code -editing often involves modifying and experimenting with the "edit -format" used by aider. The edit format is a critical component of the -system prompt, dictating how GPT should structure code edits in its +Much of this depends on the "edit format", which is an important component of the +system prompt. +The edit format specifies how GPT should structure code edits in its responses. Aider currently uses simple text based editing formats, but [OpenAI's new function calling API](https://openai.com/blog/function-calling-and-other-api-updates) -looked like a promising way to construct a more structured editing format. +looks like a promising way to create more structured edit formats. Before making such a big change, I wanted to make -sure I had a quantitative way to assess the impact on +sure I had a quantitative way to assess +how function based edit formats would affect the reliability of code editing. I developed a @@ -40,8 +40,8 @@ on almost all the ChatGPT models, using a variety of edit formats. The results were quite interesting: - Asking GPT to return an updated copy of the whole file in a standard markdown fenced code block proved to be the most reliable and effective edit format across all GPT-3.5 and GPT-4 models. The results from this `whole` edit format are shown in solid blue in the graph. - - Using the new function calling API performed worse than the above whole file method for all models. GPT-3.5 especially produced inferior code and frequently mangled this output format. 
This was surprising, as the functions API was introduced to enhance the reliability of structured outputs. The results from these `...-func` edit methods are shown as patterned bars in the graph (both green and blue). - - The performance of the June (`0613`) version of GPT-3.5 appears to be a bit worse than the Feb (`0301`) version. This is visible if you look at the "first coding attempt" markers on the blue bars. + - Using the new functions API performed worse than the above whole file method for all models. GPT-3.5 especially produced inferior code and frequently mangled this output format. This was surprising, as the functions API was introduced to enhance the reliability of structured outputs. The results from these `...-func` edit methods are shown as patterned bars in the graph (both green and blue). + - The performance of the new June (`0613`) version of GPT-3.5 appears to be a bit worse than the February (`0301`) version. This is visible if you look at the "first coding attempt" markers on the first three blue bars. - As expected, the GPT-4 models outperformed the GPT-3.5 models in code editing. The quantitative benchmark results align with my intuitions @@ -115,6 +115,14 @@ Many of the exercises have multiple paragraphs of instructions, and most human coders would likely fail some tests on their first try. +The bars in the graph show the percent of exercises that were completed by +each model and edit format combination. The full bar height represents +the final outcome following the first coding attempt and the second +attempt that includes the unit test error output. +Each bar also has a horizontal mark that shows +the intermediate performance after the first coding attempt, +without the benefit of a second try. + It's worth noting that GPT never gets to see the source code of the unit tests during the benchmarking. It only sees the error output from failed tests. Of course, all of this code was probably part of its