update o1-preview leaderboard to diff only

This commit is contained in:
Paul Gauthier 2024-09-21 10:27:50 -07:00
parent b27738d39a
commit 565c305aa6
2 changed files with 17 additions and 38 deletions

View file

@ -6,6 +6,8 @@
- Support for OpenAI o1 models:
- `aider --model o1-mini`
- `aider --model o1-preview`
- o1-preview now works well with the diff edit format.
- o1-preview with the diff edit format now matches the SOTA leaderboard result previously achieved with the whole edit format.
- On Windows, `/run` correctly uses PowerShell or cmd.exe.
- Support for new 08-2024 Cohere models, by @jalammar.
- Can now recursively add directories with `/read-only`.

View file

@ -1133,52 +1133,29 @@
seconds_per_case: 177.7
total_cost: 11.1071
- dirname: 2024-09-12-22-44-14--o1-preview-diff
- dirname: 2024-09-21-16-45-11--o1-preview-flex-sr-markers
test_cases: 133
model: o1-preview (diff)
model: o1-preview
edit_format: diff
commit_hash: 72f52bd
pass_rate_1: 56.4
pass_rate_2: 75.2
percent_cases_well_formed: 84.2
error_outputs: 27
num_malformed_responses: 27
num_with_malformed_responses: 21
user_asks: 8
lazy_comments: 0
syntax_errors: 7
indentation_errors: 3
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model o1-preview
date: 2024-09-12
versions: 0.56.1.dev
seconds_per_case: 95.8
total_cost: 71.7927
- dirname: 2024-09-13-02-13-59--o1-preview-whole
test_cases: 133
model: o1-preview (whole)
edit_format: whole
commit_hash: 72f52bd-dirty
pass_rate_1: 58.6
commit_hash: 5493654-dirty
pass_rate_1: 57.9
pass_rate_2: 79.7
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 2
percent_cases_well_formed: 93.2
error_outputs: 11
num_malformed_responses: 11
num_with_malformed_responses: 9
user_asks: 3
lazy_comments: 0
syntax_errors: 1
syntax_errors: 10
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
test_timeouts: 1
command: aider --model o1-preview
date: 2024-09-13
date: 2024-09-21
versions: 0.56.1.dev
seconds_per_case: 47.4
total_cost: 38.0612
seconds_per_case: 80.9
total_cost: 63.9190
- dirname: 2024-09-19-16-58-29--qwen2.5-coder:7b-instruct-q8_0
test_cases: 133
model: qwen2.5-coder:7b-instruct-q8_0