Merge branch 'main' into patch-1

This commit is contained in:
paul-gauthier 2024-12-26 07:05:33 -05:00 committed by GitHub
commit f1e623ec5a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 304 additions and 4 deletions

View file

@ -0,0 +1,259 @@
- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
test_cases: 225
model: gpt-4o-mini-2024-07-18
edit_format: whole
commit_hash: a755079-dirty
pass_rate_1: 0.9
pass_rate_2: 3.6
pass_num_1: 2
pass_num_2: 8
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 36
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model gpt-4o-mini-2024-07-18
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 17.3
total_cost: 0.3236
- dirname: 2024-12-21-18-44-28--polyglot-sonnet
test_cases: 225
model: claude-3-5-sonnet-20241022
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 18.7
pass_rate_2: 45.3
pass_num_1: 42
pass_num_2: 102
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 14
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 12
total_tests: 225
command: aider --model claude-3-5-sonnet-20241022
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 30.8
total_cost: 13.4847
- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff
test_cases: 225
model: gpt-4o-2024-11-20
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 4.9
pass_rate_2: 15.1
pass_num_1: 11
pass_num_2: 34
percent_cases_well_formed: 96.0
error_outputs: 12
num_malformed_responses: 11
num_with_malformed_responses: 9
user_asks: 34
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 19
total_tests: 225
command: aider --model gpt-4o-2024-11-20
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 22.2
total_cost: 7.1835
- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
test_cases: 224
model: o1-2024-12-17 (high)
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 23.7
pass_rate_2: 61.7
pass_num_1: 53
pass_num_2: 139
percent_cases_well_formed: 91.5
error_outputs: 25
num_malformed_responses: 24
num_with_malformed_responses: 19
user_asks: 16
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model openrouter/openai/o1
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 133.2
total_cost: 0.0000
- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
test_cases: 225
model: DeepSeek Chat V2.5
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 5.3
pass_rate_2: 17.8
pass_num_1: 12
pass_num_2: 40
percent_cases_well_formed: 92.9
error_outputs: 42
num_malformed_responses: 37
num_with_malformed_responses: 16
user_asks: 23
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 5
test_timeouts: 5
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 184.0
total_cost: 0.5101
- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
test_cases: 225
model: claude-3-5-haiku-20241022
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 7.1
pass_rate_2: 28.0
pass_num_1: 16
pass_num_2: 63
percent_cases_well_formed: 91.1
error_outputs: 31
num_malformed_responses: 30
num_with_malformed_responses: 20
user_asks: 13
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 9
total_tests: 225
command: aider --model claude-3-5-haiku-20241022
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 31.8
total_cost: 6.0583
- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff
test_cases: 225
model: Qwen2.5-Coder-32B-Instruct
edit_format: diff
commit_hash: 6d7e8be-dirty
pass_rate_1: 4.4
pass_rate_2: 8.0
pass_num_1: 10
pass_num_2: 18
percent_cases_well_formed: 71.6
error_outputs: 158
num_malformed_responses: 148
num_with_malformed_responses: 64
user_asks: 132
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 2
total_tests: 225
command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic"
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 84.4
total_cost: 0.0000
- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole
test_cases: 225
model: o1-mini-2024-09-12
edit_format: whole
commit_hash: 37df899
pass_rate_1: 5.8
pass_rate_2: 32.9
pass_num_1: 13
pass_num_2: 74
percent_cases_well_formed: 96.9
error_outputs: 8
num_malformed_responses: 8
num_with_malformed_responses: 7
user_asks: 27
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model o1-mini
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 34.7
total_cost: 18.5770
- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2
test_cases: 225
model: gemini-exp-1206
edit_format: whole
commit_hash: b1bc2f8
pass_rate_1: 19.6
pass_rate_2: 38.2
pass_num_1: 44
pass_num_2: 86
percent_cases_well_formed: 98.2
error_outputs: 8
num_malformed_responses: 8
num_with_malformed_responses: 4
user_asks: 32
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 9
total_tests: 225
command: aider --model gemini/gemini-exp-1206
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 45.5
total_cost: 0.0000
- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole
test_cases: 225
model: gemini-2.0-flash-exp
edit_format: whole
commit_hash: b1bc2f8
pass_rate_1: 11.6
pass_rate_2: 22.2
pass_num_1: 26
pass_num_2: 50
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 9
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 8
total_tests: 225
command: aider --model gemini/gemini-2.0-flash-exp
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 12.2
total_cost: 0.0000

View file

@ -104,7 +104,7 @@
- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
test_cases: 225
model: deepseek-chat
model: DeepSeek Chat V2.5
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 5.3
@ -283,3 +283,30 @@
versions: 0.69.2.dev
seconds_per_case: 146.7
total_cost: 0.0000
- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
test_cases: 225
model: DeepSeek Chat V3 Preview
edit_format: diff
commit_hash: 0a23c4a-dirty
pass_rate_1: 22.7
pass_rate_2: 48.4
pass_num_1: 51
pass_num_2: 109
percent_cases_well_formed: 98.7
error_outputs: 7
num_malformed_responses: 7
num_with_malformed_responses: 3
user_asks: 19
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 8
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2024-12-25
versions: 0.69.2.dev
seconds_per_case: 34.8
total_cost: 0.3369

View file

@ -21,13 +21,20 @@ new
other top LLMs.
The new polyglot benchmark uses many popular coding languages
and was designed to be
*much more challenging* than aider's old
*much more challenging* than aider's original
[code editing benchmark](/docs/leaderboards/edit.html).
This more clearly distinguishes
the performance of
today's strongest coding models and
leaves headroom for future LLMs.
{: .note :}
See the main
[aider leaderboard](https://aider.chat/docs/leaderboards/)
for benchmark results from more models.
This article only contains a snapshot
of results at the time of publication.
## The polyglot benchmark
Like aider's original code editing benchmark,
@ -171,7 +178,7 @@ on GitHub.
</tr>
</thead>
<tbody>
{% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
{% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
{% for row in edit_sorted %}
<tr style="border-bottom: 1px solid #ddd;">
<td style="padding: 8px;">{{ row.model }}</td>

View file

@ -68,12 +68,15 @@ The model also has to successfully apply all its changes to the source file with
</tbody>
</table>
### Aider polyglot benchmark results
<canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas>
<script src="https://unpkg.com/patternomaly/dist/patternomaly.js"></script>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
{% assign data_source = edit_sorted %}
{% assign pass_rate_field = "pass_rate_2" %}
{% assign highlight_model = "xxxxxxxxxxx" %}
{% include leaderboard.js %}
</script>
<style>

View file

@ -7,6 +7,10 @@ docker run \
-v `pwd`/tmp.benchmarks/.:/benchmarks \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e HISTFILE=/aider/.bash_history \
-e PROMPT_COMMAND='history -a' \
-e HISTCONTROL=ignoredups \
-e HISTSIZE=10000 \
-e HISTFILESIZE=20000 \
-e AIDER_DOCKER=1 \
-e AIDER_BENCHMARK_DIR=/benchmarks \
aider-benchmark \

View file

@ -25,7 +25,7 @@ rsync -avz --delete \
"$REPO_ROOT/" \
"$DEST:~/aider/"
rsync -a .env .bash_history .gitignore "$DEST:~/aider/."
rsync -a .env .gitignore "$DEST:~/aider/."
rsync -a ~/dotfiles/screenrc "$DEST:.screenrc"