Mirror of https://github.com/Aider-AI/aider.git, synced 2025-06-03 03:05:00 +00:00

commit 00f1cdb561 (parent b117c1580c)

    Added gpt-4-turbo vision blog post

4 changed files with 3343 additions and 31 deletions
_posts/2024-04-09-gpt-4-turbo.md (new file, +69 lines):
---
title: GPT-4 Turbo with Vision is a step backwards for coding
excerpt: OpenAI's new `gpt-4-turbo-2024-04-09` model scores worse on aider's code editing benchmarks than all the previous GPT-4 models.
highlight_image: /assets/2024-03-07-claude-3.svg
---

# GPT-4 Turbo with Vision is a step backwards for coding

[OpenAI just released GPT-4 Turbo with Vision](https://twitter.com/OpenAIDevs/status/1777769463258988634)
and it performs worse on aider's benchmark suites than all the previous GPT-4 models.
In particular, it seems much more prone to "lazy coding" than the
GPT-4 Turbo preview models.

## Code editing skill

[![Code editing benchmark results](https://aider.chat/assets/2024-04-09-gpt-4-turbo.svg)](https://aider.chat/assets/2024-04-09-gpt-4-turbo.svg)

Aider relies on a
[code editing benchmark](https://aider.chat/docs/benchmarks.html#the-benchmark)
to quantitatively evaluate how well
an LLM can make changes to existing code.
The benchmark uses aider to try to complete
[133 Exercism Python coding exercises](https://github.com/exercism/python).

For each exercise, the LLM gets two tries to solve the problem:

1. On the first try, it gets the initial stub code and the English description of the coding task. If all the tests pass, we are done.
2. If any tests fail, aider sends the LLM the failing test output and gives it a second try to complete the task.
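This two-try loop is simple to sketch. Below is a minimal illustration in Python, assuming aider's `--yes` and `--message` flags for non-interactive runs and pytest-style exercise tests; it is a sketch of the idea, not the actual benchmark harness:

```python
import subprocess
from pathlib import Path


def run_aider(exercise_dir: Path, message: str) -> None:
    """Apply one non-interactive aider edit to the exercise (flags assumed)."""
    subprocess.run(["aider", "--yes", "--message", message], cwd=exercise_dir)


def run_tests(exercise_dir: Path) -> str | None:
    """Run the exercise's tests; return the failing output, or None if all pass."""
    result = subprocess.run(["pytest"], cwd=exercise_dir, capture_output=True, text=True)
    return None if result.returncode == 0 else result.stdout


def solve_exercise(exercise_dir: Path, instructions: str) -> bool:
    # Try 1: stub code plus the natural-language task description.
    run_aider(exercise_dir, instructions)
    failures = run_tests(exercise_dir)
    if failures is None:
        return True  # solved on the first attempt
    # Try 2: feed the failing test output back to the LLM.
    run_aider(exercise_dir, f"The tests failed:\n{failures}\nFix the code to make them pass.")
    return run_tests(exercise_dir) is None
```

The headline score is the percent of the 133 exercises that end with passing tests.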
GPT-4 Turbo with Vision
scores only 62% on this benchmark,
the lowest score of any of the existing GPT-4 models.
The other models scored 63-66%, so this represents only a small
regression, and is likely statistically insignificant when compared
against `gpt-4-0613`.

## Lazy coding

[![Laziness benchmark results](https://aider.chat/assets/2024-04-09-gpt-4-turbo-laziness.svg)](https://aider.chat/assets/2024-04-09-gpt-4-turbo-laziness.svg)

The GPT-4 Turbo "preview" models have been widely criticized for being "lazy"
when coding.
They often omit needed code
and instead leave comments with homework assignments like "implement method here".

```
def some_complex_method(foo, bar):
    # ... implement method here ...
```

Aider uses a ["laziness" benchmark suite](https://github.com/paul-gauthier/refactor-benchmark)
which is designed to both provoke and quantify lazy coding.
It consists of
89 Python refactoring tasks
which tend to make GPT-4 Turbo code in that lazy manner.
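For illustration, one crude way to flag this failure mode is to scan a completion for placeholder comments. The patterns below are assumptions for the sketch, not how aider's refactoring benchmark actually scores the tasks:

```python
import re

# Placeholder phrases that typically signal an elided implementation.
LAZY_PATTERNS = [
    r"\.\.\.\s*implement .* here",
    r"rest of (the )?(code|method|function)",
    r"implementation (goes|remains) here",
    r"todo: implement",
]


def looks_lazy(completion: str) -> bool:
    """Heuristically flag completions that skip work via placeholder comments."""
    lowered = completion.lower()
    return any(re.search(pattern, lowered) for pattern in LAZY_PATTERNS)
```

This heuristic would flag the `# ... implement method here ...` example above; the benchmark itself instead measures the percent of refactoring tasks completed correctly.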
The new GPT-4 Turbo with Vision model scores only 33% on aider's
refactoring benchmark, making it the laziest coder of all the GPT-4 Turbo models
by a significant margin.

# Conclusions

Aider has full support for the new GPT-4 Turbo with Vision
model, which you can access using the switch `--model gpt-4-turbo-2024-04-09`.
But aider will continue to use `gpt-4-1106-preview` by default,
as it is by far the strongest coder of the GPT-4 models.
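For example, to launch aider with the new model (assuming aider is installed and an OpenAI API key is configured):

```
aider --model gpt-4-turbo-2024-04-09
```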
assets/2024-04-09-gpt-4-turbo-laziness.svg (new file, +1521 lines; 37 KiB)
File diff suppressed because it is too large.

assets/2024-04-09-gpt-4-turbo.svg (new file, +1707 lines; 45 KiB)
File diff suppressed because it is too large.
The fourth changed file is the benchmark plotting script (the remaining 46 additions and 31 deletions; its path is not shown in this view):

```diff
@@ -338,14 +338,9 @@ def plot_outcomes_claude(df):
         "#b3e6a8",
         "#b3e6a8",
         "#b3e6a8",
-        "#b3e6a8",
         "#b3d1e6",
-        "#b3d1e6",
-        "#b3d1e6",
-        "#e6b3b3",
-        "#d1b3e6",
     ]
-    hatch = [
+    hatch = [  # noqa: F841
         "",
         "",
         "",
@@ -356,7 +351,7 @@ def plot_outcomes_claude(df):
         "",
         "////",
     ]
-    hatch = [
+    hatch = [  # noqa: F841
         "////",
         "////",
         "////",
@@ -372,38 +367,41 @@ def plot_outcomes_claude(df):
         df.iloc[:, 1],
         width * 0.95,
         color=color,
-        hatch=hatch,
-        zorder=zorder,
+        # hatch=hatch,
+        # zorder=zorder,
         **edge,
     )
     if zorder == 2:
         ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df.iloc[:, 1]], size=6)

     ax.set_xticks([p + 0.5 * width for p in pos])
-    model_labels = []
-    for model in df.iloc[:, 0]:
-        pieces = model.split("-")
-        N = 3
-        ml = "-".join(pieces[:N])
-        if pieces[N:]:
-            ml += "-\n" + "-".join(pieces[N:])
-        model_labels.append(ml)

-    ax.set_xticklabels(model_labels, rotation=60)
+    models = df.iloc[:, 0]
+    model_map = {
+        "gpt-4-0613": "gpt-4-\n0613",
+        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
+        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
+        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
+    }
+    model_labels = []
+    for model in models:
+        ml = model_map.get(model, model)
+        model_labels.append(ml)
+    ax.set_xticklabels(model_labels, rotation=0)

     top = 95
     ax.annotate(
         "First attempt,\nbased on\nnatural language\ninstructions",
-        xy=(2.0, 41),
-        xytext=(1.75, top),
+        xy=(1.0, 53),
+        xytext=(0.75, top),
         horizontalalignment="center",
         verticalalignment="top",
         arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
     )
     ax.annotate(
         "Second attempt,\nincluding unit test\nerror output",
-        xy=(2.55, 56),
-        xytext=(3.9, top),
+        xy=(1.55, 65),
+        xytext=(1.9, top),
         horizontalalignment="center",
         verticalalignment="top",
         arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
```
```diff
@@ -442,7 +440,7 @@ def plot_refactoring(df):
     for grouped in tries:
         zorder += 1
         df = grouped.unstack()
-        df.sort_values(by=["model"], ascending=False, inplace=True)
+        # df.sort_values(by=["model"], ascending=False, inplace=True)
         num_models, num_formats = df.shape

         pos = np.array(range(num_models))
@@ -482,6 +480,12 @@ def plot_refactoring(df):
         if zorder == 2:
             edge["label"] = label

+        color = [
+            "#b3e6a8",
+            "#b3e6a8",
+            "#b3d1e6",
+        ]
+
         rects = ax.bar(
             pos + i * width,
             df[fmt],
@@ -495,17 +499,28 @@ def plot_refactoring(df):
         if zorder == 2:
             ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)

-    ax.set_xticks([p + 0.5 * width for p in pos])
-    ax.set_xticklabels(models)
+    ax.set_xticks([p + 0 * width for p in pos])
+
+    model_map = {
+        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
+        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
+        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
+    }
+    model_labels = []
+    for model in models:
+        ml = model_map.get(model, model)
+        model_labels.append(ml)
+
+    ax.set_xticklabels(model_labels, rotation=0)

     ax.set_ylabel("Percent of exercises completed successfully")
     # ax.set_xlabel("Model")
-    ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
-    ax.legend(
+    ax.set_title('Refactoring "Laziness" Benchmark')
+    # ax.legend(
         # title="Edit Format",
-        loc="upper left",
+        # loc="upper left",
         # bbox_to_anchor=(0.95, 0.95),
-    )
+    # )
     ax.set_ylim(top=100)

     plt.tight_layout()
```