mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-31 01:35:00 +00:00
Merge branch 'main' into patch-1
This commit is contained in:
commit
f1e623ec5a
6 changed files with 304 additions and 4 deletions
259
aider/website/_data/o1_polyglot_leaderboard.yml
Normal file
259
aider/website/_data/o1_polyglot_leaderboard.yml
Normal file
|
@ -0,0 +1,259 @@
|
|||
- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
|
||||
test_cases: 225
|
||||
model: gpt-4o-mini-2024-07-18
|
||||
edit_format: whole
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 0.9
|
||||
pass_rate_2: 3.6
|
||||
pass_num_1: 2
|
||||
pass_num_2: 8
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 36
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
total_tests: 225
|
||||
command: aider --model gpt-4o-mini-2024-07-18
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 17.3
|
||||
total_cost: 0.3236
|
||||
|
||||
- dirname: 2024-12-21-18-44-28--polyglot-sonnet
|
||||
test_cases: 225
|
||||
model: claude-3-5-sonnet-20241022
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 18.7
|
||||
pass_rate_2: 45.3
|
||||
pass_num_1: 42
|
||||
pass_num_2: 102
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 14
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 12
|
||||
total_tests: 225
|
||||
command: aider --model claude-3-5-sonnet-20241022
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 30.8
|
||||
total_cost: 13.4847
|
||||
|
||||
- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff
|
||||
test_cases: 225
|
||||
model: gpt-4o-2024-11-20
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 4.9
|
||||
pass_rate_2: 15.1
|
||||
pass_num_1: 11
|
||||
pass_num_2: 34
|
||||
percent_cases_well_formed: 96.0
|
||||
error_outputs: 12
|
||||
num_malformed_responses: 11
|
||||
num_with_malformed_responses: 9
|
||||
user_asks: 34
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 19
|
||||
total_tests: 225
|
||||
command: aider --model gpt-4o-2024-11-20
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 22.2
|
||||
total_cost: 7.1835
|
||||
|
||||
- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
|
||||
test_cases: 224
|
||||
model: o1-2024-12-17 (high)
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 23.7
|
||||
pass_rate_2: 61.7
|
||||
pass_num_1: 53
|
||||
pass_num_2: 139
|
||||
percent_cases_well_formed: 91.5
|
||||
error_outputs: 25
|
||||
num_malformed_responses: 24
|
||||
num_with_malformed_responses: 19
|
||||
user_asks: 16
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
total_tests: 225
|
||||
command: aider --model openrouter/openai/o1
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 133.2
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
|
||||
test_cases: 225
|
||||
model: DeepSeek Chat V2.5
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 5.3
|
||||
pass_rate_2: 17.8
|
||||
pass_num_1: 12
|
||||
pass_num_2: 40
|
||||
percent_cases_well_formed: 92.9
|
||||
error_outputs: 42
|
||||
num_malformed_responses: 37
|
||||
num_with_malformed_responses: 16
|
||||
user_asks: 23
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 5
|
||||
test_timeouts: 5
|
||||
total_tests: 225
|
||||
command: aider --model deepseek/deepseek-chat
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 184.0
|
||||
total_cost: 0.5101
|
||||
|
||||
- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
|
||||
test_cases: 225
|
||||
model: claude-3-5-haiku-20241022
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 7.1
|
||||
pass_rate_2: 28.0
|
||||
pass_num_1: 16
|
||||
pass_num_2: 63
|
||||
percent_cases_well_formed: 91.1
|
||||
error_outputs: 31
|
||||
num_malformed_responses: 30
|
||||
num_with_malformed_responses: 20
|
||||
user_asks: 13
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 9
|
||||
total_tests: 225
|
||||
command: aider --model claude-3-5-haiku-20241022
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 31.8
|
||||
total_cost: 6.0583
|
||||
|
||||
- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff
|
||||
test_cases: 225
|
||||
model: Qwen2.5-Coder-32B-Instruct
|
||||
edit_format: diff
|
||||
commit_hash: 6d7e8be-dirty
|
||||
pass_rate_1: 4.4
|
||||
pass_rate_2: 8.0
|
||||
pass_num_1: 10
|
||||
pass_num_2: 18
|
||||
percent_cases_well_formed: 71.6
|
||||
error_outputs: 158
|
||||
num_malformed_responses: 148
|
||||
num_with_malformed_responses: 64
|
||||
user_asks: 132
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 2
|
||||
total_tests: 225
|
||||
command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic"
|
||||
date: 2024-12-22
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 84.4
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole
|
||||
test_cases: 225
|
||||
model: o1-mini-2024-09-12
|
||||
edit_format: whole
|
||||
commit_hash: 37df899
|
||||
pass_rate_1: 5.8
|
||||
pass_rate_2: 32.9
|
||||
pass_num_1: 13
|
||||
pass_num_2: 74
|
||||
percent_cases_well_formed: 96.9
|
||||
error_outputs: 8
|
||||
num_malformed_responses: 8
|
||||
num_with_malformed_responses: 7
|
||||
user_asks: 27
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
total_tests: 225
|
||||
command: aider --model o1-mini
|
||||
date: 2024-12-22
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 34.7
|
||||
total_cost: 18.5770
|
||||
|
||||
- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2
|
||||
test_cases: 225
|
||||
model: gemini-exp-1206
|
||||
edit_format: whole
|
||||
commit_hash: b1bc2f8
|
||||
pass_rate_1: 19.6
|
||||
pass_rate_2: 38.2
|
||||
pass_num_1: 44
|
||||
pass_num_2: 86
|
||||
percent_cases_well_formed: 98.2
|
||||
error_outputs: 8
|
||||
num_malformed_responses: 8
|
||||
num_with_malformed_responses: 4
|
||||
user_asks: 32
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 9
|
||||
total_tests: 225
|
||||
command: aider --model gemini/gemini-exp-1206
|
||||
date: 2024-12-22
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 45.5
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole
|
||||
test_cases: 225
|
||||
model: gemini-2.0-flash-exp
|
||||
edit_format: whole
|
||||
commit_hash: b1bc2f8
|
||||
pass_rate_1: 11.6
|
||||
pass_rate_2: 22.2
|
||||
pass_num_1: 26
|
||||
pass_num_2: 50
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 9
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 8
|
||||
total_tests: 225
|
||||
command: aider --model gemini/gemini-2.0-flash-exp
|
||||
date: 2024-12-22
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 12.2
|
||||
total_cost: 0.0000
|
|
@ -104,7 +104,7 @@
|
|||
|
||||
- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
|
||||
test_cases: 225
|
||||
model: deepseek-chat
|
||||
model: DeepSeek Chat V2.5
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 5.3
|
||||
|
@ -283,3 +283,30 @@
|
|||
versions: 0.69.2.dev
|
||||
seconds_per_case: 146.7
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
|
||||
test_cases: 225
|
||||
model: DeepSeek Chat V3 Preview
|
||||
edit_format: diff
|
||||
commit_hash: 0a23c4a-dirty
|
||||
pass_rate_1: 22.7
|
||||
pass_rate_2: 48.4
|
||||
pass_num_1: 51
|
||||
pass_num_2: 109
|
||||
percent_cases_well_formed: 98.7
|
||||
error_outputs: 7
|
||||
num_malformed_responses: 7
|
||||
num_with_malformed_responses: 3
|
||||
user_asks: 19
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 8
|
||||
total_tests: 225
|
||||
command: aider --model deepseek/deepseek-chat
|
||||
date: 2024-12-25
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 34.8
|
||||
total_cost: 0.3369
|
||||
|
||||
|
|
|
@ -21,13 +21,20 @@ new
|
|||
other top LLMs.
|
||||
The new polyglot benchmark uses many popular coding languages
|
||||
and was designed to be
|
||||
*much more challenging* than aider's old
|
||||
*much more challenging* than aider's original
|
||||
[code editing benchmark](/docs/leaderboards/edit.html).
|
||||
This more clearly distinguishes
|
||||
the performance of
|
||||
today's strongest coding models and
|
||||
leaves headroom for future LLMs.
|
||||
|
||||
{: .note :}
|
||||
See the main
|
||||
[aider leaderboard](https://aider.chat/docs/leaderboards/)
|
||||
for benchmark results from more models.
|
||||
This article only contains a snapshot
|
||||
of results at the time of publication.
|
||||
|
||||
## The polyglot benchmark
|
||||
|
||||
Like aider's original code editing benchmark,
|
||||
|
@ -171,7 +178,7 @@ on GitHub.
|
|||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
|
||||
{% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
|
||||
{% for row in edit_sorted %}
|
||||
<tr style="border-bottom: 1px solid #ddd;">
|
||||
<td style="padding: 8px;">{{ row.model }}</td>
|
||||
|
|
|
@ -68,12 +68,15 @@ The model also has to successfully apply all its changes to the source file with
|
|||
</tbody>
|
||||
</table>
|
||||
|
||||
### Aider polyglot benchmark results
|
||||
|
||||
<canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas>
|
||||
<script src="https://unpkg.com/patternomaly/dist/patternomaly.js"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script>
|
||||
{% assign data_source = edit_sorted %}
|
||||
{% assign pass_rate_field = "pass_rate_2" %}
|
||||
{% assign highlight_model = "xxxxxxxxxxx" %}
|
||||
{% include leaderboard.js %}
|
||||
</script>
|
||||
<style>
|
||||
|
|
|
@ -7,6 +7,10 @@ docker run \
|
|||
-v `pwd`/tmp.benchmarks/.:/benchmarks \
|
||||
-e OPENAI_API_KEY=$OPENAI_API_KEY \
|
||||
-e HISTFILE=/aider/.bash_history \
|
||||
-e PROMPT_COMMAND='history -a' \
|
||||
-e HISTCONTROL=ignoredups \
|
||||
-e HISTSIZE=10000 \
|
||||
-e HISTFILESIZE=20000 \
|
||||
-e AIDER_DOCKER=1 \
|
||||
-e AIDER_BENCHMARK_DIR=/benchmarks \
|
||||
aider-benchmark \
|
||||
|
|
|
@ -25,7 +25,7 @@ rsync -avz --delete \
|
|||
"$REPO_ROOT/" \
|
||||
"$DEST:~/aider/"
|
||||
|
||||
rsync -a .env .bash_history .gitignore "$DEST:~/aider/."
|
||||
rsync -a .env .gitignore "$DEST:~/aider/."
|
||||
|
||||
rsync -a ~/dotfiles/screenrc "$DEST:.screenrc"
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue