From 9033be74bf74ae70459013e54b2ae6a97c47c2e6 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Thu, 25 Jan 2024 12:59:15 -0800
Subject: [PATCH] Initial benchmark results for 0125

---
 assets/benchmarks-0125.svg | 1537 ++++++++++++++++++++++++++++++++++++
 benchmark/benchmark.py     |    9 +-
 docs/benchmarks-0125.md    |   40 +
 3 files changed, 1583 insertions(+), 3 deletions(-)
 create mode 100644 assets/benchmarks-0125.svg
 create mode 100644 docs/benchmarks-0125.md

diff --git a/assets/benchmarks-0125.svg b/assets/benchmarks-0125.svg
new file mode 100644
index 000000000..7352047d5
--- /dev/null
+++ b/assets/benchmarks-0125.svg
@@ -0,0 +1,1537 @@
[1,537 lines of SVG path data omitted: the benchmark results chart, generated by Matplotlib v3.8.2 (https://matplotlib.org/) on 2024-01-25T12:56:50]

diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index d091e813e..a8f97fa4a 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -303,6 +303,7 @@ def plot_refactoring(df):
     for grouped in tries:
         zorder += 1
         df = grouped.unstack()
+        df.sort_values(by=["model"], ascending=False, inplace=True)
         num_models, num_formats = df.shape
 
         pos = np.array(range(num_models))
@@ -311,13 +312,15 @@ def plot_refactoring(df):
         formats = df.columns
         models = df.index
 
+        dump(df)
+        dump(models)
         dump(formats)
 
         for i, fmt in enumerate(formats):
             hatch = ""
             if fmt == "diff":
                 color = "#b3e6a8"
-                label = "Baseline (search/replace blocks)"
+                label = "Search/replace blocks"
             elif fmt == "udiff":
                 color = "#b3d1e6"
                 label = "Unified diffs"
@@ -353,7 +356,7 @@ def plot_refactoring(df):
             if zorder == 2:
                 ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)
 
-    ax.set_xticks([p + 1.0 * width for p in pos])
+    ax.set_xticks([p + 0.5 * width for p in pos])
     ax.set_xticklabels(models)
 
     ax.set_ylabel("Percent of exercises completed successfully")
@@ -794,7 +797,7 @@ def run_test(
             default_headers={
                 "HTTP-Referer": "http://aider.chat",
                 "X-Title": "Aider",
-            }
+            },
         )
     else:
         client = openai.OpenAI(
diff --git a/docs/benchmarks-0125.md b/docs/benchmarks-0125.md
new file mode 100644
index 000000000..4cc30ef1e
--- /dev/null
+++ b/docs/benchmarks-0125.md
@@ -0,0 +1,40 @@
# Code editing benchmarks for OpenAI's "0125" models

[![benchmark results](../assets/benchmarks-0125.svg)](https://aider.chat/assets/benchmarks-0125.svg)

[OpenAI just released a new version of GPT-4 Turbo](https://openai.com/blog/new-embedding-models-and-api-updates).
This new model is intended to reduce the "lazy coding" that has been widely observed with the previous `gpt-4-1106-preview` model:

> Today, we are releasing an updated GPT-4 Turbo preview model, gpt-4-0125-preview. This model completes tasks like code generation more thoroughly than the previous preview model and is intended to reduce cases of “laziness” where the model doesn’t complete a task.

With that in mind, I've been benchmarking the new model using
aider's existing
[lazy coding benchmark](https://aider.chat/docs/unified-diffs.html).

## Benchmark results

**These results are currently preliminary, and will be updated as additional benchmark runs complete.**

The new `gpt-4-0125-preview` model produces mixed results on the
lazy coding benchmark compared to the November `gpt-4-1106-preview` model:

- It performs much worse when using the [unified diffs](https://aider.chat/docs/unified-diffs.html) code editing format.
- Using aider's older SEARCH/REPLACE block editing format, the new January model outperforms the older November model. But it still performs worse than either model does when using unified diffs.

(Both editing formats are sketched in the appendix at the end of this report.)

## Related reports

This is one in a series of reports
that use the aider benchmarking suite to assess and compare the code
editing capabilities of OpenAI's GPT models.
You can review the other reports
for additional information:

- [GPT code editing benchmarks](https://aider.chat/docs/benchmarks.html) evaluates the March and June versions of GPT-3.5 and GPT-4.
- [Code editing benchmarks for OpenAI's "1106" models](https://aider.chat/docs/benchmarks-1106.html).
- [Aider's lazy coding benchmark](https://aider.chat/docs/unified-diffs.html).

## Updates

Last updated 1/25/24.
These results are preliminary and will be updated as additional benchmark runs complete.
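
## Appendix: the two editing formats

For readers who haven't seen them, here is a rough sketch of the two editing formats compared above. These are simplified, hypothetical examples (the file name and code are invented for illustration), not the exact output from the benchmark runs; see the [lazy coding benchmark](https://aider.chat/docs/unified-diffs.html) report for the real formats.

A SEARCH/REPLACE block names the file being edited, quotes the existing text to find, and gives the text to replace it with:

```
greeting.py
<<<<<<< SEARCH
def greet():
    print("hello")
=======
def greet(name):
    print(f"hello {name}")
>>>>>>> REPLACE
```

A unified diff expresses the same edit as removed and added lines, roughly in the style of `diff -U0` output:

```diff
--- greeting.py
+++ greeting.py
@@ ... @@
-def greet():
-    print("hello")
+def greet(name):
+    print(f"hello {name}")
```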