diff --git a/_posts/2024-04-09-gpt-4-turbo.md b/_posts/2024-04-09-gpt-4-turbo.md
new file mode 100644
index 000000000..64c3ad8b8
--- /dev/null
+++ b/_posts/2024-04-09-gpt-4-turbo.md
@@ -0,0 +1,69 @@
+---
+title: GPT-4 Turbo with Vision is a step backwards for coding
+excerpt: OpenAI's new `gpt-4-turbo-2024-04-09` model scores worse on aider's code editing benchmarks than all the previous GPT-4 models.
+highlight_image: /assets/2024-03-07-claude-3.svg
+---
+# GPT-4 Turbo with Vision is a step backwards for coding
+
+[OpenAI just released GPT-4 Turbo with Vision](https://twitter.com/OpenAIDevs/status/1777769463258988634)
+and it performs worse on aider's benchmark suites than all the previous GPT-4 models.
+In particular, it seems much more prone to "lazy coding" than the
+GPT-4 Turbo preview models.
+
+## Code editing skill
+
+[![benchmark results](/assets/2024-04-09-gpt-4-turbo.svg)](https://aider.chat/assets/2024-04-09-gpt-4-turbo.svg)
+
+Aider relies on a
+[code editing benchmark](https://aider.chat/docs/benchmarks.html#the-benchmark)
+to quantitatively evaluate how well
+an LLM can make changes to existing code.
+The benchmark uses aider to try and complete
+[133 Exercism Python coding exercises](https://github.com/exercism/python).
+
+For each exercise, the LLM gets two tries to solve the problem:
+
+1. On the first try, it gets the initial stub code and the English description of the coding task. If the tests all pass, we are done.
+2. If any tests failed, aider sends the LLM the failing test output and gives it a second try to complete the task.
+
+GPT-4 Turbo with Vision
+scores only 62% on this benchmark,
+the lowest score of any of the existing GPT-4 models.
+The other models scored 63-66%, so this represents only a small
+regression, and is likely statistically insignificant when compared
+against `gpt-4-0613`.
+
+## Lazy coding
+
+[![benchmark results](/assets/2024-04-09-gpt-4-turbo-laziness.svg)](https://aider.chat/assets/2024-04-09-gpt-4-turbo-laziness.svg)
+
+The GPT-4 Turbo "preview" models have been widely criticized for being "lazy"
+when coding.
+They often omit needed code
+and instead leave comments with homework assignments like "implement method here".
+
+```
+def some_complex_method(foo, bar):
+    # ... implement method here ...
+```
+
+Aider uses a ["laziness" benchmark suite](https://github.com/paul-gauthier/refactor-benchmark)
+which is designed to both provoke and quantify lazy coding.
+It consists of
+89 Python refactoring tasks
+which tend to make GPT-4 Turbo code in that lazy manner.
+
+The new GPT-4 Turbo with Vision model scores only 33% on aider's
+refactoring benchmark, making it the laziest coder of all the GPT-4 Turbo models
+by a significant margin.
+
+## Conclusions
+
+Aider has full support for the new GPT-4 Turbo with Vision
+model, which you can access using the switch `--model gpt-4-turbo-2024-04-09`.
+But aider will continue to use `gpt-4-1106-preview` by default,
+as it is by far the strongest coder of the GPT-4 models.
+
+
+
+
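The two-attempt flow described in the post can be sketched in a few lines of Python. This is only an illustration of the control flow, not aider's actual benchmark harness (that lives in `benchmark/benchmark.py`); `run_aider` and `run_unit_tests` below are hypothetical stand-ins for asking the LLM to edit the exercise's stub code and for running the exercise's test suite.

```python
# Hypothetical sketch of the two-attempt benchmark flow described in the post.
# run_aider() and run_unit_tests() are illustrative stand-ins, not aider's real API.


def run_aider(prompt: str) -> None:
    """Stand-in: ask the LLM (via aider) to edit the exercise's code."""


def run_unit_tests() -> str:
    """Stand-in: run the exercise's tests; return failing output, or '' on success."""
    return ""


def attempt_exercise(instructions: str) -> bool:
    # First try: only the stub code and the natural-language instructions.
    run_aider(instructions)
    failures = run_unit_tests()
    if not failures:
        return True

    # Second try: feed the failing test output back to the LLM.
    run_aider(f"Tests failed:\n{failures}\nPlease fix the code.")
    return run_unit_tests() == ""
```

The benchmark score is then simply the fraction of the 133 exercises for which this two-attempt loop ends with all tests passing.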
diff --git a/assets/2024-04-09-gpt-4-turbo-laziness.svg b/assets/2024-04-09-gpt-4-turbo-laziness.svg
new file mode 100644
index 000000000..8e9377752
--- /dev/null
+++ b/assets/2024-04-09-gpt-4-turbo-laziness.svg
@@ -0,0 +1,1521 @@
+[SVG markup omitted: Matplotlib v3.8.2 chart, generated 2024-04-09T16:51:38; refactoring "laziness" benchmark figure]
diff --git a/assets/2024-04-09-gpt-4-turbo.svg b/assets/2024-04-09-gpt-4-turbo.svg
new file mode 100644
index 000000000..2b777767c
--- /dev/null
+++ b/assets/2024-04-09-gpt-4-turbo.svg
@@ -0,0 +1,1707 @@
+[SVG markup omitted: Matplotlib v3.8.2 chart, generated 2024-04-09T16:53:48; code editing benchmark figure]
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 229ecfef7..b09e82fe7 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -338,14 +338,9 @@ def plot_outcomes_claude(df):
         "#b3e6a8",
         "#b3e6a8",
         "#b3e6a8",
-        "#b3e6a8",
         "#b3d1e6",
-        "#b3d1e6",
-        "#b3d1e6",
-        "#e6b3b3",
-        "#d1b3e6",
     ]
-    hatch = [
+    hatch = [  # noqa: F841
         "",
         "",
         "",
@@ -356,7 +351,7 @@ def plot_outcomes_claude(df):
         "",
         "////",
     ]
-    hatch = [
+    hatch = [  # noqa: F841
         "////",
         "////",
         "////",
@@ -372,38 +367,41 @@ def plot_outcomes_claude(df):
             df.iloc[:, 1],
             width * 0.95,
             color=color,
-            hatch=hatch,
-            zorder=zorder,
+            # hatch=hatch,
+            # zorder=zorder,
             **edge,
         )
         if zorder == 2:
             ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df.iloc[:, 1]], size=6)

     ax.set_xticks([p + 0.5 * width for p in pos])
-    model_labels = []
-    for model in df.iloc[:, 0]:
-        pieces = model.split("-")
-        N = 3
-        ml = "-".join(pieces[:N])
-        if pieces[N:]:
-            ml += "-\n" + "-".join(pieces[N:])
-        model_labels.append(ml)
-    ax.set_xticklabels(model_labels, rotation=60)
+    models = df.iloc[:, 0]
+    model_map = {
+        "gpt-4-0613": "gpt-4-\n0613",
+        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
+        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
+        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
+    }
+    model_labels = []
+    for model in models:
+        ml = model_map.get(model, model)
+        model_labels.append(ml)
+    ax.set_xticklabels(model_labels, rotation=0)

     top = 95
     ax.annotate(
         "First attempt,\nbased on\nnatural language\ninstructions",
-        xy=(2.0, 41),
-        xytext=(1.75, top),
+        xy=(1.0, 53),
+        xytext=(0.75, top),
         horizontalalignment="center",
         verticalalignment="top",
         arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
     )
     ax.annotate(
         "Second attempt,\nincluding unit test\nerror output",
-        xy=(2.55, 56),
-        xytext=(3.9, top),
+        xy=(1.55, 65),
+        xytext=(1.9, top),
         horizontalalignment="center",
         verticalalignment="top",
         arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
     )
@@ -442,7 +440,7 @@ def plot_refactoring(df):
     for grouped in tries:
         zorder += 1
         df = grouped.unstack()
-        df.sort_values(by=["model"], ascending=False, inplace=True)
+        # df.sort_values(by=["model"], ascending=False, inplace=True)
         num_models, num_formats = df.shape

         pos = np.array(range(num_models))
@@ -482,6 +480,12 @@ def plot_refactoring(df):
             if zorder == 2:
                 edge["label"] = label

+            color = [
+                "#b3e6a8",
+                "#b3e6a8",
+                "#b3d1e6",
+            ]
+
             rects = ax.bar(
                 pos + i * width,
                 df[fmt],
@@ -495,17 +499,28 @@ def plot_refactoring(df):
             if zorder == 2:
                 ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)

-    ax.set_xticks([p + 0.5 * width for p in pos])
-    ax.set_xticklabels(models)
+    ax.set_xticks([p + 0 * width for p in pos])
+
+    model_map = {
+        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
+        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
+        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
+    }
+    model_labels = []
+    for model in models:
+        ml = model_map.get(model, model)
+        model_labels.append(ml)
+
+    ax.set_xticklabels(model_labels, rotation=0)

     ax.set_ylabel("Percent of exercises completed successfully")
     # ax.set_xlabel("Model")
-    ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
-    ax.legend(
-        # title="Edit Format",
-        loc="upper left",
-        # bbox_to_anchor=(0.95, 0.95),
-    )
+    ax.set_title('Refactoring "Laziness" Benchmark')
+    # ax.legend(
+    #     title="Edit Format",
+    #     loc="upper left",
+    #     bbox_to_anchor=(0.95, 0.95),
+    # )
     ax.set_ylim(top=100)

     plt.tight_layout()
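The two plotting functions touched above share one pattern: the taller "after the second attempt" bars are drawn behind the shorter first-attempt bars, the bars are labeled with their percentages via `bar_label`, and long model names are pre-wrapped onto several lines for the x-tick labels. Here is a minimal, self-contained sketch of that pattern; the scores are placeholders, not the real benchmark numbers.

```python
# Minimal sketch of the overlaid-bars style used in benchmark.py.
# The percentages below are placeholders, not real benchmark results.
import matplotlib.pyplot as plt
import numpy as np

models = ["gpt-4-\n0613", "gpt-4-\n1106-preview", "gpt-4-turbo-\n2024-04-09"]
first_try = np.array([48.0, 55.0, 46.0])   # placeholder first-attempt pass rates
second_try = np.array([64.0, 66.0, 62.0])  # placeholder pass rates after the retry

pos = np.arange(len(models))
fig, ax = plt.subplots()

# Taller "after second attempt" bars go in back...
back = ax.bar(pos, second_try, width=0.6, color="#b3d1e6", zorder=1)
# ...and the shorter first-attempt bars are drawn over them.
ax.bar(pos, first_try, width=0.6, color="#b3e6a8", zorder=2)

# Label each model's final score above its bar.
ax.bar_label(back, labels=[f"{v:.0f}%" for v in second_try], padding=4, size=8)

ax.set_xticks(pos)
ax.set_xticklabels(models, rotation=0)  # labels pre-wrapped, as in the model_map above
ax.set_ylabel("Percent of exercises completed successfully")
ax.set_ylim(top=100)
plt.tight_layout()
plt.show()
```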