mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-02 18:54:59 +00:00
update o1-preview leaderboard to diff only
This commit is contained in:
parent
b27738d39a
commit
565c305aa6
2 changed files with 17 additions and 38 deletions
|
@ -6,6 +6,8 @@
|
||||||
- Support for OpenAI o1 models:
|
- Support for OpenAI o1 models:
|
||||||
- `aider --model o1-mini`
|
- `aider --model o1-mini`
|
||||||
- `aider --model o1-preview`
|
- `aider --model o1-preview`
|
||||||
|
- o1-preview now works well with the diff edit format.
|
||||||
|
- o1-preview with the diff edit format now matches the SOTA leaderboard result previously achieved with the whole edit format.
|
||||||
- On Windows, `/run` correctly uses PowerShell or cmd.exe.
|
- On Windows, `/run` correctly uses PowerShell or cmd.exe.
|
||||||
- Support for new 08-2024 Cohere models, by @jalammar.
|
- Support for new 08-2024 Cohere models, by @jalammar.
|
||||||
- Can now recursively add directories with `/read-only`.
|
- Can now recursively add directories with `/read-only`.
|
||||||
|
|
|
@ -1133,52 +1133,29 @@
|
||||||
seconds_per_case: 177.7
|
seconds_per_case: 177.7
|
||||||
total_cost: 11.1071
|
total_cost: 11.1071
|
||||||
|
|
||||||
- dirname: 2024-09-12-22-44-14--o1-preview-diff
|
- dirname: 2024-09-21-16-45-11--o1-preview-flex-sr-markers
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: o1-preview (diff)
|
model: o1-preview
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: 72f52bd
|
commit_hash: 5493654-dirty
|
||||||
pass_rate_1: 56.4
|
pass_rate_1: 57.9
|
||||||
pass_rate_2: 75.2
|
|
||||||
percent_cases_well_formed: 84.2
|
|
||||||
error_outputs: 27
|
|
||||||
num_malformed_responses: 27
|
|
||||||
num_with_malformed_responses: 21
|
|
||||||
user_asks: 8
|
|
||||||
lazy_comments: 0
|
|
||||||
syntax_errors: 7
|
|
||||||
indentation_errors: 3
|
|
||||||
exhausted_context_windows: 0
|
|
||||||
test_timeouts: 3
|
|
||||||
command: aider --model o1-preview
|
|
||||||
date: 2024-09-12
|
|
||||||
versions: 0.56.1.dev
|
|
||||||
seconds_per_case: 95.8
|
|
||||||
total_cost: 71.7927
|
|
||||||
|
|
||||||
- dirname: 2024-09-13-02-13-59--o1-preview-whole
|
|
||||||
test_cases: 133
|
|
||||||
model: o1-preview (whole)
|
|
||||||
edit_format: whole
|
|
||||||
commit_hash: 72f52bd-dirty
|
|
||||||
pass_rate_1: 58.6
|
|
||||||
pass_rate_2: 79.7
|
pass_rate_2: 79.7
|
||||||
percent_cases_well_formed: 100.0
|
percent_cases_well_formed: 93.2
|
||||||
error_outputs: 0
|
error_outputs: 11
|
||||||
num_malformed_responses: 0
|
num_malformed_responses: 11
|
||||||
num_with_malformed_responses: 0
|
num_with_malformed_responses: 9
|
||||||
user_asks: 2
|
user_asks: 3
|
||||||
lazy_comments: 0
|
lazy_comments: 0
|
||||||
syntax_errors: 1
|
syntax_errors: 10
|
||||||
indentation_errors: 0
|
indentation_errors: 0
|
||||||
exhausted_context_windows: 0
|
exhausted_context_windows: 0
|
||||||
test_timeouts: 2
|
test_timeouts: 1
|
||||||
command: aider --model o1-preview
|
command: aider --model o1-preview
|
||||||
date: 2024-09-13
|
date: 2024-09-21
|
||||||
versions: 0.56.1.dev
|
versions: 0.56.1.dev
|
||||||
seconds_per_case: 47.4
|
seconds_per_case: 80.9
|
||||||
total_cost: 38.0612
|
total_cost: 63.9190
|
||||||
|
|
||||||
- dirname: 2024-09-19-16-58-29--qwen2.5-coder:7b-instruct-q8_0
|
- dirname: 2024-09-19-16-58-29--qwen2.5-coder:7b-instruct-q8_0
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: qwen2.5-coder:7b-instruct-q8_0
|
model: qwen2.5-coder:7b-instruct-q8_0
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue