diff --git a/aider/website/_data/edit_leaderboard.yml b/aider/website/_data/edit_leaderboard.yml index 8ed43927d..95d07f2b0 100644 --- a/aider/website/_data/edit_leaderboard.yml +++ b/aider/website/_data/edit_leaderboard.yml @@ -1089,7 +1089,7 @@ - dirname: 2024-09-12-19-57-35--o1-mini-whole test_cases: 133 - model: o1-mini + model: o1-mini (whole) edit_format: whole commit_hash: 36fa773-dirty, 291b456 pass_rate_1: 49.6 @@ -1108,4 +1108,28 @@ date: 2024-09-12 versions: 0.56.1.dev seconds_per_case: 103.0 - total_cost: 5.3725 \ No newline at end of file + total_cost: 5.3725 + +- dirname: 2024-09-12-20-56-22--o1-mini-diff + test_cases: 133 + model: o1-mini (diff) + edit_format: diff + commit_hash: 4598a37-dirty, 291b456, 752e823-dirty + pass_rate_1: 45.1 + pass_rate_2: 62.4 + percent_cases_well_formed: 85.7 + error_outputs: 26 + num_malformed_responses: 26 + num_with_malformed_responses: 19 + user_asks: 2 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model o1-mini --edit-format diff + date: 2024-09-12 + versions: 0.56.1.dev + seconds_per_case: 177.7 + total_cost: 11.1071 + \ No newline at end of file diff --git a/aider/website/_data/o1_results.yml b/aider/website/_data/o1_results.yml index 96d2ac1d6..292e258a2 100644 --- a/aider/website/_data/o1_results.yml +++ b/aider/website/_data/o1_results.yml @@ -91,4 +91,28 @@ date: 2024-09-12 versions: 0.56.1.dev seconds_per_case: 103.0 - total_cost: 5.3725 \ No newline at end of file + total_cost: 5.3725 + +- dirname: 2024-09-12-20-56-22--o1-mini-diff + test_cases: 133 + model: o1-mini (diff) + edit_format: diff + commit_hash: 4598a37-dirty, 291b456, 752e823-dirty + pass_rate_1: 45.1 + pass_rate_2: 62.4 + percent_cases_well_formed: 85.7 + error_outputs: 26 + num_malformed_responses: 26 + num_with_malformed_responses: 19 + user_asks: 2 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 
1 + command: aider --model o1-mini --edit-format diff + date: 2024-09-12 + versions: 0.56.1.dev + seconds_per_case: 177.7 + total_cost: 11.1071 + \ No newline at end of file diff --git a/aider/website/_posts/2024-09-12-o1.md b/aider/website/_posts/2024-09-12-o1.md index e9d087a91..96b58c097 100644 --- a/aider/website/_posts/2024-09-12-o1.md +++ b/aider/website/_posts/2024-09-12-o1.md @@ -10,23 +10,26 @@ nav_exclude: true # Benchmark results for OpenAI o1-mini OpenAI o1-mini is priced similarly to GPT-4o and Claude 3.5 Sonnet, -but scored below those models -when using the "whole" editing format. -It was close enough to GPT-4o to be within the margin of error. +but scored below those models. -The o1-mini model had trouble following the very simple whole editing format. -It's possible that it would get a better score if aider prompted with -more examples or was adapted to parse o1-mini's favorite way to mangle -the response format. +It works best with the +["whole" edit format](/docs/leaderboards/#notes-on-the-edit-format), +where it returns a full copy of the source code file with changes. +Other frontier models like GPT-4o and Sonnet are able to achieve +high benchmark scores using the +["diff" edit format](/docs/leaderboards/#notes-on-the-edit-format). +This allows them to return search/replace blocks to +efficiently edit the source code, saving time and token costs. -Note that o1-mini's "whole" score is compared against GPT-4o and Sonnet -"diff" results. -Using diff is more challenging, -but allows the model to return search/replace blocks to -efficiently edit the source code. -The whole format requires the o1-mini to return a fresh copy of the entire file, -increasing costs and latency. +The o1-mini model had trouble conforming to both the whole and diff edit formats. +Aider is extremely permissive and tries hard to accept anything close +to the correct formats. 
+It's possible that o1-mini would get better scores if aider prompted with +more examples or was adapted to parse o1-mini's favorite ways to mangle +the response formats. +Over time it may be possible to better harness o1-mini's capabilities through +different prompting and editing formats. ## Using aider with o1-mini and o1-preview