mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-28 00:05:01 +00:00
copy
This commit is contained in:
parent
9767759033
commit
dd9b2a872c
2 changed files with 267 additions and 1 deletions
259
aider/website/_data/o1_polyglot_leaderboard.yml
Normal file
259
aider/website/_data/o1_polyglot_leaderboard.yml
Normal file
|
@ -0,0 +1,259 @@
|
|||
- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
|
||||
test_cases: 225
|
||||
model: gpt-4o-mini-2024-07-18
|
||||
edit_format: whole
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 0.9
|
||||
pass_rate_2: 3.6
|
||||
pass_num_1: 2
|
||||
pass_num_2: 8
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 36
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
total_tests: 225
|
||||
command: aider --model gpt-4o-mini-2024-07-18
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 17.3
|
||||
total_cost: 0.3236
|
||||
|
||||
- dirname: 2024-12-21-18-44-28--polyglot-sonnet
|
||||
test_cases: 225
|
||||
model: claude-3-5-sonnet-20241022
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 18.7
|
||||
pass_rate_2: 45.3
|
||||
pass_num_1: 42
|
||||
pass_num_2: 102
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 14
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 12
|
||||
total_tests: 225
|
||||
command: aider --model claude-3-5-sonnet-20241022
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 30.8
|
||||
total_cost: 13.4847
|
||||
|
||||
- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff
|
||||
test_cases: 225
|
||||
model: gpt-4o-2024-11-20
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 4.9
|
||||
pass_rate_2: 15.1
|
||||
pass_num_1: 11
|
||||
pass_num_2: 34
|
||||
percent_cases_well_formed: 96.0
|
||||
error_outputs: 12
|
||||
num_malformed_responses: 11
|
||||
num_with_malformed_responses: 9
|
||||
user_asks: 34
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 19
|
||||
total_tests: 225
|
||||
command: aider --model gpt-4o-2024-11-20
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 22.2
|
||||
total_cost: 7.1835
|
||||
|
||||
- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
|
||||
test_cases: 224
|
||||
model: o1-2024-12-17 (high)
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 23.7
|
||||
pass_rate_2: 61.7
|
||||
pass_num_1: 53
|
||||
pass_num_2: 139
|
||||
percent_cases_well_formed: 91.5
|
||||
error_outputs: 25
|
||||
num_malformed_responses: 24
|
||||
num_with_malformed_responses: 19
|
||||
user_asks: 16
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
total_tests: 225
|
||||
command: aider --model openrouter/openai/o1
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 133.2
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
|
||||
test_cases: 225
|
||||
model: deepseek-chat
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 5.3
|
||||
pass_rate_2: 17.8
|
||||
pass_num_1: 12
|
||||
pass_num_2: 40
|
||||
percent_cases_well_formed: 92.9
|
||||
error_outputs: 42
|
||||
num_malformed_responses: 37
|
||||
num_with_malformed_responses: 16
|
||||
user_asks: 23
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 5
|
||||
test_timeouts: 5
|
||||
total_tests: 225
|
||||
command: aider --model deepseek/deepseek-chat
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 184.0
|
||||
total_cost: 0.5101
|
||||
|
||||
- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
|
||||
test_cases: 225
|
||||
model: claude-3-5-haiku-20241022
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 7.1
|
||||
pass_rate_2: 28.0
|
||||
pass_num_1: 16
|
||||
pass_num_2: 63
|
||||
percent_cases_well_formed: 91.1
|
||||
error_outputs: 31
|
||||
num_malformed_responses: 30
|
||||
num_with_malformed_responses: 20
|
||||
user_asks: 13
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 9
|
||||
total_tests: 225
|
||||
command: aider --model claude-3-5-haiku-20241022
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 31.8
|
||||
total_cost: 6.0583
|
||||
|
||||
- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff
|
||||
test_cases: 225
|
||||
model: Qwen2.5-Coder-32B-Instruct
|
||||
edit_format: diff
|
||||
commit_hash: 6d7e8be-dirty
|
||||
pass_rate_1: 4.4
|
||||
pass_rate_2: 8.0
|
||||
pass_num_1: 10
|
||||
pass_num_2: 18
|
||||
percent_cases_well_formed: 71.6
|
||||
error_outputs: 158
|
||||
num_malformed_responses: 148
|
||||
num_with_malformed_responses: 64
|
||||
user_asks: 132
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 2
|
||||
total_tests: 225
|
||||
command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic"
|
||||
date: 2024-12-22
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 84.4
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole
|
||||
test_cases: 225
|
||||
model: o1-mini-2024-09-12
|
||||
edit_format: whole
|
||||
commit_hash: 37df899
|
||||
pass_rate_1: 5.8
|
||||
pass_rate_2: 32.9
|
||||
pass_num_1: 13
|
||||
pass_num_2: 74
|
||||
percent_cases_well_formed: 96.9
|
||||
error_outputs: 8
|
||||
num_malformed_responses: 8
|
||||
num_with_malformed_responses: 7
|
||||
user_asks: 27
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
total_tests: 225
|
||||
command: aider --model o1-mini
|
||||
date: 2024-12-22
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 34.7
|
||||
total_cost: 18.5770
|
||||
|
||||
- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2
|
||||
test_cases: 225
|
||||
model: gemini-exp-1206
|
||||
edit_format: whole
|
||||
commit_hash: b1bc2f8
|
||||
pass_rate_1: 19.6
|
||||
pass_rate_2: 38.2
|
||||
pass_num_1: 44
|
||||
pass_num_2: 86
|
||||
percent_cases_well_formed: 98.2
|
||||
error_outputs: 8
|
||||
num_malformed_responses: 8
|
||||
num_with_malformed_responses: 4
|
||||
user_asks: 32
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 9
|
||||
total_tests: 225
|
||||
command: aider --model gemini/gemini-exp-1206
|
||||
date: 2024-12-22
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 45.5
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole
|
||||
test_cases: 225
|
||||
model: gemini-2.0-flash-exp
|
||||
edit_format: whole
|
||||
commit_hash: b1bc2f8
|
||||
pass_rate_1: 11.6
|
||||
pass_rate_2: 22.2
|
||||
pass_num_1: 26
|
||||
pass_num_2: 50
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 9
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 8
|
||||
total_tests: 225
|
||||
command: aider --model gemini/gemini-2.0-flash-exp
|
||||
date: 2024-12-22
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 12.2
|
||||
total_cost: 0.0000
|
|
@ -28,6 +28,13 @@ the performance of
|
|||
today's strongest coding models and
|
||||
leaves headroom for future LLMs.
|
||||
|
||||
{: .note :}
|
||||
See the main
|
||||
[aider leaderboard](https://aider.chat/docs/leaderboards/)
|
||||
for benchmark results from more models.
|
||||
This article only contains a snapshot
|
||||
of results at the time of publication.
|
||||
|
||||
## The polyglot benchmark
|
||||
|
||||
Like aider's original code editing benchmark,
|
||||
|
@ -171,7 +178,7 @@ on GitHub.
|
|||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
|
||||
{% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
|
||||
{% for row in edit_sorted %}
|
||||
<tr style="border-bottom: 1px solid #ddd;">
|
||||
<td style="padding: 8px;">{{ row.model }}</td>
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue