mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-28 16:25:00 +00:00
Initial benchmark results for 0125
This commit is contained in:
parent
13e9a01f3b
commit
9033be74bf
3 changed files with 1583 additions and 3 deletions
1537
assets/benchmarks-0125.svg
Normal file
1537
assets/benchmarks-0125.svg
Normal file
File diff suppressed because it is too large
Load diff
After Width: | Height: | Size: 38 KiB |
|
@ -303,6 +303,7 @@ def plot_refactoring(df):
|
|||
for grouped in tries:
|
||||
zorder += 1
|
||||
df = grouped.unstack()
|
||||
df.sort_values(by=["model"], ascending=False, inplace=True)
|
||||
num_models, num_formats = df.shape
|
||||
|
||||
pos = np.array(range(num_models))
|
||||
|
@ -311,13 +312,15 @@ def plot_refactoring(df):
|
|||
formats = df.columns
|
||||
models = df.index
|
||||
|
||||
dump(df)
|
||||
dump(models)
|
||||
dump(formats)
|
||||
for i, fmt in enumerate(formats):
|
||||
hatch = ""
|
||||
|
||||
if fmt == "diff":
|
||||
color = "#b3e6a8"
|
||||
label = "Baseline (search/replace blocks)"
|
||||
label = "Search/replace blocks"
|
||||
elif fmt == "udiff":
|
||||
color = "#b3d1e6"
|
||||
label = "Unified diffs"
|
||||
|
@ -353,7 +356,7 @@ def plot_refactoring(df):
|
|||
if zorder == 2:
|
||||
ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)
|
||||
|
||||
ax.set_xticks([p + 1.0 * width for p in pos])
|
||||
ax.set_xticks([p + 0.5 * width for p in pos])
|
||||
ax.set_xticklabels(models)
|
||||
|
||||
ax.set_ylabel("Percent of exercises completed successfully")
|
||||
|
@ -794,7 +797,7 @@ def run_test(
|
|||
default_headers={
|
||||
"HTTP-Referer": "http://aider.chat",
|
||||
"X-Title": "Aider",
|
||||
}
|
||||
},
|
||||
)
|
||||
else:
|
||||
client = openai.OpenAI(
|
||||
|
|
40
docs/benchmarks-0125.md
Normal file
40
docs/benchmarks-0125.md
Normal file
|
@ -0,0 +1,40 @@
|
|||
# Code editing benchmarks for OpenAI's "0125" model
|
||||
|
||||
[![benchmark results](https://aider.chat/assets/benchmarks-0125.svg)](https://aider.chat/assets/benchmarks-0125.svg)
|
||||
|
||||
[OpenAI just released a new version of GPT-4 Turbo](https://openai.com/blog/new-embedding-models-and-api-updates).
|
||||
This new model is intended to reduce the "lazy coding" that has been widely observed with the previous `gpt-4-1106-preview` model:
|
||||
|
||||
> Today, we are releasing an updated GPT-4 Turbo preview model, gpt-4-0125-preview. This model completes tasks like code generation more thoroughly than the previous preview model and is intended to reduce cases of “laziness” where the model doesn’t complete a task.
|
||||
|
||||
With that in mind, I've been benchmarking the new model using
|
||||
aider's existing
|
||||
[lazy coding benchmark](https://aider.chat/docs/unified-diffs.html).
|
||||
|
||||
## Benchmark results
|
||||
|
||||
**These results are currently preliminary, and will be updated as additional benchmark runs complete.**
|
||||
|
||||
The new `gpt-4-0125-preview` model produces mixed results on the
|
||||
lazy coding benchmark as compared to the November `gpt-4-1106-preview` model:
|
||||
|
||||
- It performs much worse when using the [unified diffs](https://aider.chat/docs/unified-diffs.html) code editing format.
|
||||
- Using aider's older SEARCH/REPLACE block editing format, the new January model outperforms the older November model. But it still performs worse than both models using unified diffs.
|
||||
|
||||
## Related reports
|
||||
|
||||
This is one in a series of reports
|
||||
that use the aider benchmarking suite to assess and compare the code
|
||||
editing capabilities of OpenAI's GPT models.
|
||||
You can review the other reports
|
||||
for additional information:
|
||||
|
||||
- [GPT code editing benchmarks](https://aider.chat/docs/benchmarks.html) evaluates the March and June versions of GPT-3.5 and GPT-4.
|
||||
- [Code editing benchmarks for OpenAI's "1106" models](https://aider.chat/docs/benchmarks-1106.html).
|
||||
- [Aider's lazy coding benchmark](https://aider.chat/docs/unified-diffs.html).
|
||||
|
||||
|
||||
## Updates
|
||||
|
||||
Last updated 11/14/23.
|
||||
OpenAI has relaxed rate limits so these results are no longer considered preliminary.
|
Loading…
Add table
Add a link
Reference in a new issue