From 96587f5f46ce1644dabfaca232a0d88b07d237d2 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Thu, 12 Sep 2024 14:07:06 -0700 Subject: [PATCH] o1-mini blog article --- aider/website/_config.yml | 6 +- aider/website/_data/o1_results.yml | 94 +++++++++++++++++++++++++++ aider/website/_posts/2024-09-12-o1.md | 82 +++++++++++++++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 aider/website/_data/o1_results.yml create mode 100644 aider/website/_posts/2024-09-12-o1.md diff --git a/aider/website/_config.yml b/aider/website/_config.yml index b95a2a158..d8fff9c5a 100644 --- a/aider/website/_config.yml +++ b/aider/website/_config.yml @@ -41,4 +41,8 @@ repository: paul-gauthier/aider callouts: tip: title: Tip - color: green \ No newline at end of file + color: green + note: + title: Note + color: yellow + \ No newline at end of file diff --git a/aider/website/_data/o1_results.yml b/aider/website/_data/o1_results.yml new file mode 100644 index 000000000..90eff18f2 --- /dev/null +++ b/aider/website/_data/o1_results.yml @@ -0,0 +1,94 @@ +- dirname: 2024-07-18-18-57-46--gpt-4o-mini-whole + test_cases: 133 + model: gpt-4o-mini + edit_format: whole + commit_hash: d31eef3-dirty + pass_rate_1: 40.6 + pass_rate_2: 55.6 + released: 2024-07-18 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 1 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model gpt-4o-mini + date: 2024-07-18 + versions: 0.44.1-dev + seconds_per_case: 7.8 + total_cost: 0.0916 + +- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue + test_cases: 133 + model: claude-3.5-sonnet + edit_format: diff + commit_hash: 35f21b5 + pass_rate_1: 57.1 + pass_rate_2: 77.4 + percent_cases_well_formed: 99.2 + error_outputs: 23 + released: 2024-06-20 + num_malformed_responses: 4 + num_with_malformed_responses: 1 + user_asks: 2 + 
lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --sonnet + date: 2024-07-04 + versions: 0.42.1-dev + seconds_per_case: 17.6 + total_cost: 3.6346 + +- dirname: 2024-08-06-18-28-39--gpt-4o-2024-08-06-diff-again + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: diff + commit_hash: ed9ed89 + pass_rate_1: 57.1 + pass_rate_2: 71.4 + percent_cases_well_formed: 98.5 + error_outputs: 18 + num_malformed_responses: 2 + num_with_malformed_responses: 2 + user_asks: 10 + lazy_comments: 0 + syntax_errors: 6 + indentation_errors: 2 + exhausted_context_windows: 0 + test_timeouts: 5 + released: 2024-08-06 + command: aider --model openai/gpt-4o-2024-08-06 + date: 2024-08-06 + versions: 0.48.1-dev + seconds_per_case: 6.5 + total_cost: 0.0000 + +- dirname: 2024-09-12-19-57-35--o1-mini-whole + test_cases: 133 + model: o1-mini + edit_format: whole + commit_hash: 36fa773-dirty, 291b456 + pass_rate_1: 49.6 + pass_rate_2: 70.7 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 17 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model openai/o1-mini + date: 2024-09-12 + versions: 0.56.1.dev + seconds_per_case: 103.0 + total_cost: 5.3725 \ No newline at end of file diff --git a/aider/website/_posts/2024-09-12-o1.md b/aider/website/_posts/2024-09-12-o1.md new file mode 100644 index 000000000..89e8d1f90 --- /dev/null +++ b/aider/website/_posts/2024-09-12-o1.md @@ -0,0 +1,82 @@ +--- +title: Benchmark results for OpenAI o1-mini +excerpt: Preliminary benchmark results for the new OpenAI o1-mini model. +nav_exclude: true +--- +{% if page.date %} +

{{ page.date | date: "%B %d, %Y" }}

+{% endif %} + +# Benchmark results for OpenAI o1-mini + +OpenAI o1-mini is priced similarly to GPT-4o and Claude 3.5 Sonnet. +o1-mini scored below those models +when using the simple "whole" editing format. +It was close enough to GPT-4o to be within the margin of error. + +The o1-mini model had trouble following the very simple whole editing format. +It's possible that it would get a better score if aider prompted with +more examples or was adapted to parse o1-mini's favorite way to mangle +the response format. + +Note that o1-mini's "whole" score is compared against GPT-4o and Sonnet +"diff" results. +Using diff is more challenging for GPT-4o and Sonnet, +but it allows them to return search/replace blocks to +efficiently edit the source code. +The whole format requires o1-mini to return a fresh copy of the entire file, +increasing costs and latency. + + +{: .note } +> These are *preliminary* benchmark results, which will be updated as +> additional benchmark runs complete and rate limits open up. + + + + + + + + + + + + + {% assign edit_sorted = site.data.o1_results | sort: 'pass_rate_2' | reverse %} + {% for row in edit_sorted %} + + + + + + + + {% endfor %} + +
ModelPercent completed correctlyPercent using correct edit formatCommandEdit format
{{ row.model }}{{ row.pass_rate_2 }}%{{ row.percent_cases_well_formed }}%{{ row.command }}{{ row.edit_format }}
+ + + +{% include leaderboard_graph.html + chart_id="editChart" + data=edit_sorted + row_prefix="edit-row" + pass_rate_key="pass_rate_2" +%} + +