From 565c305aa65a0fae8e7ca6e3530e337bd852650d Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Sat, 21 Sep 2024 10:27:50 -0700 Subject: [PATCH] update o1-preview leaderboard to diff only --- HISTORY.md | 2 + aider/website/_data/edit_leaderboard.yml | 53 +++++++----------------- 2 files changed, 17 insertions(+), 38 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ee0c74679..2033bc240 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,8 @@ - Support for OpenAI o1 models: - `aider --model o1-mini` - `aider --model o1-preview` + - o1-preview now works well with diff edit format. + - o1-preview with diff now matches SOTA leaderboard result with whole edit format. - On Windows, `/run` correctly uses PowerShell or cmd.exe. - Support for new 08-2024 Cohere models, by @jalammar. - Can now recursively add directories with `/read-only`. diff --git a/aider/website/_data/edit_leaderboard.yml b/aider/website/_data/edit_leaderboard.yml index 4b538e798..5d85799df 100644 --- a/aider/website/_data/edit_leaderboard.yml +++ b/aider/website/_data/edit_leaderboard.yml @@ -1133,52 +1133,29 @@ seconds_per_case: 177.7 total_cost: 11.1071 -- dirname: 2024-09-12-22-44-14--o1-preview-diff +- dirname: 2024-09-21-16-45-11--o1-preview-flex-sr-markers test_cases: 133 - model: o1-preview (diff) + model: o1-preview edit_format: diff - commit_hash: 72f52bd - pass_rate_1: 56.4 - pass_rate_2: 75.2 - percent_cases_well_formed: 84.2 - error_outputs: 27 - num_malformed_responses: 27 - num_with_malformed_responses: 21 - user_asks: 8 - lazy_comments: 0 - syntax_errors: 7 - indentation_errors: 3 - exhausted_context_windows: 0 - test_timeouts: 3 - command: aider --model o1-preview - date: 2024-09-12 - versions: 0.56.1.dev - seconds_per_case: 95.8 - total_cost: 71.7927 - -- dirname: 2024-09-13-02-13-59--o1-preview-whole - test_cases: 133 - model: o1-preview (whole) - edit_format: whole - commit_hash: 72f52bd-dirty - pass_rate_1: 58.6 + commit_hash: 5493654-dirty + pass_rate_1: 57.9 pass_rate_2: 79.7 - percent_cases_well_formed: 100.0 - error_outputs: 0 - num_malformed_responses: 0 - num_with_malformed_responses: 0 - user_asks: 2 + percent_cases_well_formed: 93.2 + error_outputs: 11 + num_malformed_responses: 11 + num_with_malformed_responses: 9 + user_asks: 3 lazy_comments: 0 - syntax_errors: 1 + syntax_errors: 10 indentation_errors: 0 exhausted_context_windows: 0 - test_timeouts: 2 + test_timeouts: 1 command: aider --model o1-preview - date: 2024-09-13 + date: 2024-09-21 versions: 0.56.1.dev - seconds_per_case: 47.4 - total_cost: 38.0612 - + seconds_per_case: 80.9 + total_cost: 63.9190 + - dirname: 2024-09-19-16-58-29--qwen2.5-coder:7b-instruct-q8_0 test_cases: 133 model: qwen2.5-coder:7b-instruct-q8_0