diff --git a/_posts/2024-04-09-gpt-4-turbo.md b/_posts/2024-04-09-gpt-4-turbo.md
new file mode 100644
index 000000000..64c3ad8b8
--- /dev/null
+++ b/_posts/2024-04-09-gpt-4-turbo.md
@@ -0,0 +1,69 @@
+---
+title: GPT-4 Turbo with Vision is a step backwards for coding
+excerpt: OpenAI's new `gpt-4-turbo-2024-04-09` model scores worse on aider's code editing benchmarks than all the previous GPT-4 models.
+highlight_image: /assets/2024-04-09-gpt-4-turbo.svg
+---
+# GPT-4 Turbo with Vision is a step backwards for coding
+
+[OpenAI just released GPT-4 Turbo with Vision](https://twitter.com/OpenAIDevs/status/1777769463258988634)
+and it performs worse on aider's benchmark suites than all the previous GPT-4 models.
+In particular, it seems much more prone to "lazy coding" than the
+GPT-4 Turbo preview models.
+
+## Code editing skill
+
+![Code editing benchmark results](https://aider.chat/assets/2024-04-09-gpt-4-turbo.svg)
+
+Aider relies on a
+[code editing benchmark](https://aider.chat/docs/benchmarks.html#the-benchmark)
+to quantitatively evaluate how well
+an LLM can make changes to existing code.
+The benchmark uses aider to try to complete
+[133 Exercism Python coding exercises](https://github.com/exercism/python).
+
+The LLM gets two tries to solve each problem:
+
+1. On the first try, it gets the initial stub code and the English description of the coding task. If all the tests pass, we are done.
+2. If any tests fail, aider sends the LLM the failing test output and gives it a second try to complete the task (see the sketch below).
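+
+Concretely, the harness loop is something like the sketch below. This is an
+illustrative reconstruction, not aider's actual benchmark code: the
+`run_tests` helper and the exercise layout are assumptions.
+
+```python
+# Illustrative sketch of the two-try benchmark loop; `run_tests` and the
+# directory layout are hypothetical, not aider's real harness.
+import subprocess
+
+
+def run_tests(exercise_dir):
+    """Run the exercise's unit tests; return (passed, captured output)."""
+    proc = subprocess.run(["pytest", exercise_dir], capture_output=True, text=True)
+    return proc.returncode == 0, proc.stdout + proc.stderr
+
+
+def benchmark_exercise(exercise_dir, instructions):
+    # Try 1: aider gets the stub code and the natural language instructions.
+    subprocess.run(["aider", "--yes", "--message", instructions], cwd=exercise_dir)
+    passed, output = run_tests(exercise_dir)
+    if passed:
+        return "passed on first try"
+
+    # Try 2: aider gets the failing test output and one more chance.
+    subprocess.run(["aider", "--yes", "--message", output], cwd=exercise_dir)
+    passed, _ = run_tests(exercise_dir)
+    return "passed on second try" if passed else "failed"
+```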
+
+GPT-4 Turbo with Vision
+scores only 62% on this benchmark,
+the lowest score of any of the existing GPT-4 models.
+The other models score 63-66%, so this is only a small
+regression, and the difference is likely statistically insignificant when
+compared against `gpt-4-0613`.
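+
+As a rough sanity check on that claim, treat each of the 133 exercises as an
+independent pass/fail trial (an assumption; per-exercise results aren't shown
+here) and compare 62% against a 63% baseline:
+
+```python
+# Back-of-the-envelope two-proportion z-test for 62% vs 63% on 133 exercises.
+# Assumes independent pass/fail trials; illustrative only.
+from statsmodels.stats.proportion import proportions_ztest
+
+n = 133
+passes = [round(0.62 * n), round(0.63 * n)]  # ~82 vs ~84 exercises solved
+z, p = proportions_ztest(passes, [n, n])
+print(f"z = {z:.2f}, p = {p:.2f}")  # p comes out far above 0.05
+```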
+
+## Lazy coding
+
+![Refactoring laziness benchmark results](https://aider.chat/assets/2024-04-09-gpt-4-turbo-laziness.svg)
+
+The GPT-4 Turbo "preview" models have been widely criticized for being "lazy"
+when coding.
+They often omit needed code
+and instead leave comments with homework assignments like "implement method here".
+
+```python
+def some_complex_method(foo, bar):
+ # ... implement method here ...
+```
+
+Aider uses a ["laziness" benchmark suite](https://github.com/paul-gauthier/refactor-benchmark)
+which is designed to both provoke and quantify lazy coding.
+It consists of 89 Python refactoring tasks
+which tend to elicit this lazy style of coding from GPT-4 Turbo.
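+
+One cheap way to flag this failure mode is to scan the model's output for
+placeholder comments, along these lines (a hypothetical detector, shown only
+for illustration; it is not how the benchmark actually scores tasks):
+
+```python
+# Hypothetical detector for lazy placeholder comments; illustrative only.
+import re
+
+LAZY_PATTERNS = [
+    r"#\s*\.\.\.",                      # bare "# ..." ellipsis comments
+    r"implement .* here",               # "implement method here"
+    r"rest of .* (unchanged|remains)",  # "rest of the class unchanged"
+]
+
+
+def looks_lazy(code: str) -> bool:
+    return any(re.search(p, code, re.IGNORECASE) for p in LAZY_PATTERNS)
+
+
+print(looks_lazy("def f(x):\n    # ... implement method here ..."))  # True
+```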
+
+The new GPT-4 Turbo with Vision model scores only 33% on aider's
+refactoring benchmark, making it the laziest coder of all the GPT-4 Turbo models
+by a significant margin.
+
+## Conclusions
+
+Aider has full support for the new GPT-4 Turbo with Vision
+model, which you can access using the switch `--model gpt-4-turbo-2024-04-09`.
+But aider will continue to use `gpt-4-1106-preview` by default,
+as it is by far the strongest coder of the GPT-4 models.
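+
+For example:
+
+```
+aider --model gpt-4-turbo-2024-04-09
+```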
+
diff --git a/assets/2024-04-09-gpt-4-turbo-laziness.svg b/assets/2024-04-09-gpt-4-turbo-laziness.svg
new file mode 100644
index 000000000..8e9377752
--- /dev/null
+++ b/assets/2024-04-09-gpt-4-turbo-laziness.svg
@@ -0,0 +1,1521 @@
[SVG markup omitted: refactoring "laziness" benchmark bar chart]
diff --git a/assets/2024-04-09-gpt-4-turbo.svg b/assets/2024-04-09-gpt-4-turbo.svg
new file mode 100644
index 000000000..2b777767c
--- /dev/null
+++ b/assets/2024-04-09-gpt-4-turbo.svg
@@ -0,0 +1,1707 @@
[SVG markup omitted: code editing benchmark bar chart]
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 229ecfef7..b09e82fe7 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -338,14 +338,9 @@ def plot_outcomes_claude(df):
"#b3e6a8",
"#b3e6a8",
"#b3e6a8",
- "#b3e6a8",
"#b3d1e6",
- "#b3d1e6",
- "#b3d1e6",
- "#e6b3b3",
- "#d1b3e6",
]
- hatch = [
+ hatch = [ # noqa: F841
"",
"",
"",
@@ -356,7 +351,7 @@ def plot_outcomes_claude(df):
"",
"////",
]
- hatch = [
+ hatch = [ # noqa: F841
"////",
"////",
"////",
@@ -372,38 +367,41 @@ def plot_outcomes_claude(df):
df.iloc[:, 1],
width * 0.95,
color=color,
- hatch=hatch,
- zorder=zorder,
+ # hatch=hatch,
+ # zorder=zorder,
**edge,
)
if zorder == 2:
ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df.iloc[:, 1]], size=6)
ax.set_xticks([p + 0.5 * width for p in pos])
- model_labels = []
- for model in df.iloc[:, 0]:
- pieces = model.split("-")
- N = 3
- ml = "-".join(pieces[:N])
- if pieces[N:]:
- ml += "-\n" + "-".join(pieces[N:])
- model_labels.append(ml)
- ax.set_xticklabels(model_labels, rotation=60)
+ models = df.iloc[:, 0]
+ model_map = {
+ "gpt-4-0613": "gpt-4-\n0613",
+ "gpt-4-0125-preview": "gpt-4-\n0125-preview",
+ "gpt-4-1106-preview": "gpt-4-\n1106-preview",
+ "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
+ }
+ model_labels = []
+ for model in models:
+ ml = model_map.get(model, model)
+ model_labels.append(ml)
+ ax.set_xticklabels(model_labels, rotation=0)
top = 95
ax.annotate(
"First attempt,\nbased on\nnatural language\ninstructions",
- xy=(2.0, 41),
- xytext=(1.75, top),
+ xy=(1.0, 53),
+ xytext=(0.75, top),
horizontalalignment="center",
verticalalignment="top",
arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
)
ax.annotate(
"Second attempt,\nincluding unit test\nerror output",
- xy=(2.55, 56),
- xytext=(3.9, top),
+ xy=(1.55, 65),
+ xytext=(1.9, top),
horizontalalignment="center",
verticalalignment="top",
arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
@@ -442,7 +440,7 @@ def plot_refactoring(df):
for grouped in tries:
zorder += 1
df = grouped.unstack()
- df.sort_values(by=["model"], ascending=False, inplace=True)
+ # df.sort_values(by=["model"], ascending=False, inplace=True)
num_models, num_formats = df.shape
pos = np.array(range(num_models))
@@ -482,6 +480,12 @@ def plot_refactoring(df):
if zorder == 2:
edge["label"] = label
+ color = [
+ "#b3e6a8",
+ "#b3e6a8",
+ "#b3d1e6",
+ ]
+
rects = ax.bar(
pos + i * width,
df[fmt],
@@ -495,17 +499,28 @@ def plot_refactoring(df):
if zorder == 2:
ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)
- ax.set_xticks([p + 0.5 * width for p in pos])
- ax.set_xticklabels(models)
+ ax.set_xticks([p + 0 * width for p in pos])
+
+ model_map = {
+ "gpt-4-0125-preview": "gpt-4-\n0125-preview",
+ "gpt-4-1106-preview": "gpt-4-\n1106-preview",
+ "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
+ }
+ model_labels = []
+ for model in models:
+ ml = model_map.get(model, model)
+ model_labels.append(ml)
+
+ ax.set_xticklabels(model_labels, rotation=0)
ax.set_ylabel("Percent of exercises completed successfully")
# ax.set_xlabel("Model")
- ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
- ax.legend(
- # title="Edit Format",
- loc="upper left",
- # bbox_to_anchor=(0.95, 0.95),
- )
+ ax.set_title('Refactoring "Laziness" Benchmark')
+ # ax.legend(
+ # title="Edit Format",
+ # loc="upper left",
+ # bbox_to_anchor=(0.95, 0.95),
+ # )
ax.set_ylim(top=100)
plt.tight_layout()