Merge branch 'main' into patch-1

This commit is contained in:
paul-gauthier 2024-12-26 07:05:33 -05:00 committed by GitHub
commit f1e623ec5a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 304 additions and 4 deletions

View file

@ -0,0 +1,259 @@
- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
test_cases: 225
model: gpt-4o-mini-2024-07-18
edit_format: whole
commit_hash: a755079-dirty
pass_rate_1: 0.9
pass_rate_2: 3.6
pass_num_1: 2
pass_num_2: 8
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 36
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model gpt-4o-mini-2024-07-18
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 17.3
total_cost: 0.3236
- dirname: 2024-12-21-18-44-28--polyglot-sonnet
test_cases: 225
model: claude-3-5-sonnet-20241022
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 18.7
pass_rate_2: 45.3
pass_num_1: 42
pass_num_2: 102
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 14
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 12
total_tests: 225
command: aider --model claude-3-5-sonnet-20241022
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 30.8
total_cost: 13.4847
- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff
test_cases: 225
model: gpt-4o-2024-11-20
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 4.9
pass_rate_2: 15.1
pass_num_1: 11
pass_num_2: 34
percent_cases_well_formed: 96.0
error_outputs: 12
num_malformed_responses: 11
num_with_malformed_responses: 9
user_asks: 34
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 19
total_tests: 225
command: aider --model gpt-4o-2024-11-20
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 22.2
total_cost: 7.1835
- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
test_cases: 224
model: o1-2024-12-17 (high)
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 23.7
pass_rate_2: 61.7
pass_num_1: 53
pass_num_2: 139
percent_cases_well_formed: 91.5
error_outputs: 25
num_malformed_responses: 24
num_with_malformed_responses: 19
user_asks: 16
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model openrouter/openai/o1
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 133.2
total_cost: 0.0000
- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
test_cases: 225
model: DeepSeek Chat V2.5
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 5.3
pass_rate_2: 17.8
pass_num_1: 12
pass_num_2: 40
percent_cases_well_formed: 92.9
error_outputs: 42
num_malformed_responses: 37
num_with_malformed_responses: 16
user_asks: 23
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 5
test_timeouts: 5
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 184.0
total_cost: 0.5101
- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
test_cases: 225
model: claude-3-5-haiku-20241022
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 7.1
pass_rate_2: 28.0
pass_num_1: 16
pass_num_2: 63
percent_cases_well_formed: 91.1
error_outputs: 31
num_malformed_responses: 30
num_with_malformed_responses: 20
user_asks: 13
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 9
total_tests: 225
command: aider --model claude-3-5-haiku-20241022
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 31.8
total_cost: 6.0583
- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff
test_cases: 225
model: Qwen2.5-Coder-32B-Instruct
edit_format: diff
commit_hash: 6d7e8be-dirty
pass_rate_1: 4.4
pass_rate_2: 8.0
pass_num_1: 10
pass_num_2: 18
percent_cases_well_formed: 71.6
error_outputs: 158
num_malformed_responses: 148
num_with_malformed_responses: 64
user_asks: 132
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 2
total_tests: 225
command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic"
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 84.4
total_cost: 0.0000
- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole
test_cases: 225
model: o1-mini-2024-09-12
edit_format: whole
commit_hash: 37df899
pass_rate_1: 5.8
pass_rate_2: 32.9
pass_num_1: 13
pass_num_2: 74
percent_cases_well_formed: 96.9
error_outputs: 8
num_malformed_responses: 8
num_with_malformed_responses: 7
user_asks: 27
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model o1-mini
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 34.7
total_cost: 18.5770
- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2
test_cases: 225
model: gemini-exp-1206
edit_format: whole
commit_hash: b1bc2f8
pass_rate_1: 19.6
pass_rate_2: 38.2
pass_num_1: 44
pass_num_2: 86
percent_cases_well_formed: 98.2
error_outputs: 8
num_malformed_responses: 8
num_with_malformed_responses: 4
user_asks: 32
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 9
total_tests: 225
command: aider --model gemini/gemini-exp-1206
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 45.5
total_cost: 0.0000
- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole
test_cases: 225
model: gemini-2.0-flash-exp
edit_format: whole
commit_hash: b1bc2f8
pass_rate_1: 11.6
pass_rate_2: 22.2
pass_num_1: 26
pass_num_2: 50
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 9
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 8
total_tests: 225
command: aider --model gemini/gemini-2.0-flash-exp
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 12.2
total_cost: 0.0000

View file

@ -104,7 +104,7 @@
- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff - dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
test_cases: 225 test_cases: 225
model: deepseek-chat model: DeepSeek Chat V2.5
edit_format: diff edit_format: diff
commit_hash: a755079-dirty commit_hash: a755079-dirty
pass_rate_1: 5.3 pass_rate_1: 5.3
@ -283,3 +283,30 @@
versions: 0.69.2.dev versions: 0.69.2.dev
seconds_per_case: 146.7 seconds_per_case: 146.7
total_cost: 0.0000 total_cost: 0.0000
- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
test_cases: 225
model: DeepSeek Chat V3 Preview
edit_format: diff
commit_hash: 0a23c4a-dirty
pass_rate_1: 22.7
pass_rate_2: 48.4
pass_num_1: 51
pass_num_2: 109
percent_cases_well_formed: 98.7
error_outputs: 7
num_malformed_responses: 7
num_with_malformed_responses: 3
user_asks: 19
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 8
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2024-12-25
versions: 0.69.2.dev
seconds_per_case: 34.8
total_cost: 0.3369

View file

@ -21,13 +21,20 @@ new
other top LLMs. other top LLMs.
The new polyglot benchmark uses many popular coding languages The new polyglot benchmark uses many popular coding languages
and was designed to be and was designed to be
*much more challenging* than aider's old *much more challenging* than aider's original
[code editing benchmark](/docs/leaderboards/edit.html). [code editing benchmark](/docs/leaderboards/edit.html).
This more clearly distinguishes This more clearly distinguishes
the performance of the performance of
today's strongest coding models and today's strongest coding models and
leaves headroom for future LLMs. leaves headroom for future LLMs.
{: .note :}
See the main
[aider leaderboard](https://aider.chat/docs/leaderboards/)
for benchmark results from more models.
This article only contains a snapshot
of results at the time of publication.
## The polyglot benchmark ## The polyglot benchmark
Like aider's original code editing benchmark, Like aider's original code editing benchmark,
@ -171,7 +178,7 @@ on GitHub.
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %} {% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
{% for row in edit_sorted %} {% for row in edit_sorted %}
<tr style="border-bottom: 1px solid #ddd;"> <tr style="border-bottom: 1px solid #ddd;">
<td style="padding: 8px;">{{ row.model }}</td> <td style="padding: 8px;">{{ row.model }}</td>

View file

@ -68,12 +68,15 @@ The model also has to successfully apply all its changes to the source file with
</tbody> </tbody>
</table> </table>
### Aider polyglot benchmark results
<canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas> <canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas>
<script src="https://unpkg.com/patternomaly/dist/patternomaly.js"></script> <script src="https://unpkg.com/patternomaly/dist/patternomaly.js"></script>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script> <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script> <script>
{% assign data_source = edit_sorted %} {% assign data_source = edit_sorted %}
{% assign pass_rate_field = "pass_rate_2" %} {% assign pass_rate_field = "pass_rate_2" %}
{% assign highlight_model = "xxxxxxxxxxx" %}
{% include leaderboard.js %} {% include leaderboard.js %}
</script> </script>
<style> <style>

View file

@ -7,6 +7,10 @@ docker run \
-v `pwd`/tmp.benchmarks/.:/benchmarks \ -v `pwd`/tmp.benchmarks/.:/benchmarks \
-e OPENAI_API_KEY=$OPENAI_API_KEY \ -e OPENAI_API_KEY=$OPENAI_API_KEY \
-e HISTFILE=/aider/.bash_history \ -e HISTFILE=/aider/.bash_history \
-e PROMPT_COMMAND='history -a' \
-e HISTCONTROL=ignoredups \
-e HISTSIZE=10000 \
-e HISTFILESIZE=20000 \
-e AIDER_DOCKER=1 \ -e AIDER_DOCKER=1 \
-e AIDER_BENCHMARK_DIR=/benchmarks \ -e AIDER_BENCHMARK_DIR=/benchmarks \
aider-benchmark \ aider-benchmark \

View file

@ -25,7 +25,7 @@ rsync -avz --delete \
"$REPO_ROOT/" \ "$REPO_ROOT/" \
"$DEST:~/aider/" "$DEST:~/aider/"
rsync -a .env .bash_history .gitignore "$DEST:~/aider/." rsync -a .env .gitignore "$DEST:~/aider/."
rsync -a ~/dotfiles/screenrc "$DEST:.screenrc" rsync -a ~/dotfiles/screenrc "$DEST:.screenrc"