mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 10:14:59 +00:00
moved edit results to yaml
This commit is contained in:
parent
fc3a43ef41
commit
17b5dbe804
3 changed files with 320 additions and 20 deletions
|
@ -1,16 +0,0 @@
|
|||
model,second,first,format,command,version,commits,date
|
||||
claude-3-opus-20240229,68.4,53.4,diff,aider --opus,0.30.1,f4b1797,5/2/24
|
||||
claude-3-sonnet-20240229,54.9,43.6,whole,aider --sonnet,0.25.0,a5f8076,3/6/24
|
||||
Command-R+,29.3,22.6,whole,aider --model command-r-plus,0.28.0,a06c927,4/20/24
|
||||
Deepseek Coder,54.5,47,whole,aider --model openai/deepseek-coder,0.30.1,c07f793,4/29/24
|
||||
gemini-1.5-pro-latest,57.1,45.9,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.32.0,5d32dd7,5/3/24
|
||||
gpt-3.5-turbo-0125,49.6,39.8,whole,aider -3,0.22.0,da14474,2/2/24
|
||||
gpt-3.5-turbo-0301,57.9,50.4,whole,aider --model gpt-3.5-turbo-0301,0.16.4-dev,44388db-dirty,11/6/23
|
||||
gpt-3.5-turbo-0613,50.4,38.3,whole,aider --model gpt-3.5-turbo-0613,0.16.4-dev,93aa497-dirty,11/7/23
|
||||
gpt-3.5-turbo-1106,56.1,45.5,whole,aider --model gpt-3.5-turbo-1106,0.30.1,7b14d77,4/30/24
|
||||
gpt-4-0125-preview,66.2,55.6,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,edcf9b1,1/25/24
|
||||
gpt-4-0314,66.2,50.4,diff,aider --model gpt-4-0314,0.31.2-dev,0d43468,5/4/24
|
||||
gpt-4-0613,67.7,46.6,diff,aider -4,0.18.1,3aa17c4,12/16/23
|
||||
gpt-4-1106-preview,63.2,57.1,udiff,aider,0.31.2-dev,1981105-dirty,5/4/24
|
||||
gpt-4-turbo-2024-04-09,64.4,49.2,diff,aider --gpt-4-turbo,0.30.1,e610e5b,5/1/24
|
||||
Llama3 70B,49.2,38.6,diff,aider --model groq/llama3-70b-8192,0.32.0,b5bb453,5/3/24
|
|
316
_data/edit_leaderboard.yml
Normal file
316
_data/edit_leaderboard.yml
Normal file
|
@ -0,0 +1,316 @@
|
|||
- dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence
|
||||
test_cases: 133
|
||||
model: openrouter/anthropic/claude-3-opus, claude-3-opus-20240229
|
||||
edit_format: diff
|
||||
commit_hash: f4b1797-dirty, f4b1797
|
||||
pass_rate_1: 53.4
|
||||
pass_rate_2: 68.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --opus
|
||||
date: 2024-05-01
|
||||
versions: 0.30.2-dev
|
||||
seconds_per_case: 32.4
|
||||
total_cost: 13.8395
|
||||
- dirname: 2024-03-06-16-42-00--claude3-sonnet-whole
|
||||
test_cases: 133
|
||||
model: claude-3-sonnet-20240229
|
||||
edit_format: whole
|
||||
commit_hash: a5f8076-dirty
|
||||
pass_rate_1: 43.6
|
||||
pass_rate_2: 54.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
user_asks: 1
|
||||
lazy_comments: 1
|
||||
syntax_errors: 2
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 7
|
||||
command: aider --sonnet
|
||||
date: 2024-03-06
|
||||
versions: 0.25.1-dev
|
||||
seconds_per_case: 23.1
|
||||
total_cost: 0.0000
|
||||
- dirname: 2024-04-29-19-17-28--deepseek-coder-whole
|
||||
test_cases: 132
|
||||
model: openai/deepseek-coder
|
||||
edit_format: whole
|
||||
commit_hash: c07f793-dirty
|
||||
pass_rate_1: 47.0
|
||||
pass_rate_2: 54.5
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 2
|
||||
syntax_errors: 13
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model openai/deepseek-coder
|
||||
date: 2024-04-29
|
||||
versions: 0.30.2-dev
|
||||
seconds_per_case: 26.7
|
||||
total_cost: 0.0000
|
||||
- dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced
|
||||
test_cases: 133
|
||||
model: gemini/gemini-1.5-pro-latest
|
||||
edit_format: diff-fenced
|
||||
commit_hash: 3a48dfb, 5d32dd7
|
||||
pass_rate_1: 45.9
|
||||
pass_rate_2: 57.1
|
||||
percent_cases_well_formed: 87.2
|
||||
error_outputs: 60
|
||||
num_malformed_responses: 17
|
||||
user_asks: 3
|
||||
lazy_comments: 0
|
||||
syntax_errors: 8
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model gemini/gemini-1.5-pro-latest
|
||||
date: 2024-05-03
|
||||
versions: 0.31.2-dev
|
||||
seconds_per_case: 21.3
|
||||
total_cost: 0.0000
|
||||
- dirname: 2024-02-02-02-07-28--exercism-gpt-3.5-turbo-0125-whole
|
||||
test_cases: 133
|
||||
model: gpt-3.5-turbo-0125
|
||||
edit_format: whole
|
||||
commit_hash: da14474
|
||||
pass_rate_1: 39.8
|
||||
pass_rate_2: 49.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 3
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 4
|
||||
command: aider --model gpt-3.5-turbo-0125
|
||||
date: 2024-02-02
|
||||
versions: 0.22.1-dev
|
||||
seconds_per_case: 3.2
|
||||
total_cost: 0.4701
|
||||
- dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301
|
||||
test_cases: 133
|
||||
model: gpt-3.5-turbo-0301
|
||||
edit_format: whole
|
||||
commit_hash: 44388db-dirty
|
||||
pass_rate_1: 50.4
|
||||
pass_rate_2: 57.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
user_asks: 1
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 8
|
||||
command: aider --model gpt-3.5-turbo-0301
|
||||
date: 2023-11-06
|
||||
versions: 0.16.4-dev
|
||||
seconds_per_case: 6.5
|
||||
total_cost: 0.4822
|
||||
- dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613
|
||||
test_cases: 133
|
||||
model: gpt-3.5-turbo-0613
|
||||
edit_format: whole
|
||||
commit_hash: 93aa497-dirty
|
||||
pass_rate_1: 38.3
|
||||
pass_rate_2: 50.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
user_asks: 1
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 5
|
||||
command: aider --model gpt-3.5-turbo-0613
|
||||
date: 2023-11-07
|
||||
versions: 0.16.4-dev
|
||||
seconds_per_case: 18.0
|
||||
total_cost: 0.5366
|
||||
- dirname: 2024-04-30-21-40-51--litellm-gpt-3.5-turbo-1106-again
|
||||
test_cases: 132
|
||||
model: gpt-3.5-turbo-1106
|
||||
edit_format: whole
|
||||
commit_hash: 7b14d77
|
||||
pass_rate_1: 45.5
|
||||
pass_rate_2: 56.1
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
user_asks: 1
|
||||
lazy_comments: 0
|
||||
syntax_errors: 19
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-3.5-turbo-1106
|
||||
date: 2024-04-30
|
||||
versions: 0.30.2-dev
|
||||
seconds_per_case: 5.3
|
||||
total_cost: 0.3261
|
||||
- dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff
|
||||
test_cases: 133
|
||||
model: gpt-4-0125-preview
|
||||
edit_format: udiff
|
||||
commit_hash: edcf9b1
|
||||
pass_rate_1: 55.6
|
||||
pass_rate_2: 66.2
|
||||
percent_cases_well_formed: 97.7
|
||||
error_outputs: 6
|
||||
num_malformed_responses: 3
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 3
|
||||
indentation_errors: 7
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 4
|
||||
command: aider --model gpt-4-0125-preview
|
||||
date: 2024-01-25
|
||||
versions: 0.22.1-dev
|
||||
seconds_per_case: 44.8
|
||||
total_cost: 14.6428
|
||||
- dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules
|
||||
test_cases: 133
|
||||
model: gpt-4-0314
|
||||
edit_format: diff
|
||||
commit_hash: 0d43468
|
||||
pass_rate_1: 50.4
|
||||
pass_rate_2: 66.2
|
||||
percent_cases_well_formed: 93.2
|
||||
error_outputs: 28
|
||||
num_malformed_responses: 9
|
||||
user_asks: 1
|
||||
lazy_comments: 3
|
||||
syntax_errors: 9
|
||||
indentation_errors: 7
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model gpt-4-0314
|
||||
date: 2024-05-04
|
||||
versions: 0.31.2-dev
|
||||
seconds_per_case: 19.8
|
||||
total_cost: 16.2689
|
||||
- dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main
|
||||
test_cases: 133
|
||||
model: gpt-4-0613
|
||||
edit_format: diff
|
||||
commit_hash: 3aa17c4
|
||||
pass_rate_1: 46.6
|
||||
pass_rate_2: 67.7
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 14
|
||||
num_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model gpt-4-0613
|
||||
date: 2023-12-16
|
||||
versions: 0.18.2-dev
|
||||
seconds_per_case: 33.6
|
||||
total_cost: 17.4657
|
||||
- dirname: 2024-05-04-14-33-15--redo-gpt-4-1106-preview-udiff5
|
||||
test_cases: 133
|
||||
model: gpt-4-1106-preview
|
||||
edit_format: udiff
|
||||
commit_hash: 1981105-dirty
|
||||
pass_rate_1: 57.1
|
||||
pass_rate_2: 63.2
|
||||
percent_cases_well_formed: 94.0
|
||||
error_outputs: 24
|
||||
num_malformed_responses: 8
|
||||
user_asks: 0
|
||||
lazy_comments: 7
|
||||
syntax_errors: 3
|
||||
indentation_errors: 5
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model gpt-4-1106-preview
|
||||
date: 2024-05-04
|
||||
versions: 0.31.2-dev
|
||||
seconds_per_case: 15.6
|
||||
total_cost: 5.9468
|
||||
- dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples
|
||||
test_cases: 133
|
||||
model: gpt-4-turbo
|
||||
edit_format: udiff
|
||||
commit_hash: e610e5b-dirty
|
||||
pass_rate_1: 48.1
|
||||
pass_rate_2: 63.9
|
||||
percent_cases_well_formed: 97.0
|
||||
error_outputs: 12
|
||||
num_malformed_responses: 4
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 4
|
||||
indentation_errors: 2
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model gpt-4-turbo
|
||||
date: 2024-05-01
|
||||
versions: 0.30.2-dev
|
||||
seconds_per_case: 22.8
|
||||
total_cost: 6.3337
|
||||
- dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg
|
||||
test_cases: 132
|
||||
model: openrouter/meta-llama/llama-3-70b-instruct
|
||||
edit_format: diff
|
||||
commit_hash: b5bb453
|
||||
pass_rate_1: 38.6
|
||||
pass_rate_2: 49.2
|
||||
percent_cases_well_formed: 73.5
|
||||
error_outputs: 105
|
||||
num_malformed_responses: 35
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
indentation_errors: 2
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model openrouter/meta-llama/llama-3-70b-instruct
|
||||
date: 2024-05-03
|
||||
versions: 0.31.2-dev
|
||||
seconds_per_case: 14.5
|
||||
total_cost: 0.4311
|
||||
- dirname: 2024-05-06-18-31-08--command-r-plus-whole-final
|
||||
test_cases: 133
|
||||
model: command-r-plus
|
||||
edit_format: whole
|
||||
commit_hash: fc3a43e-dirty
|
||||
pass_rate_1: 21.8
|
||||
pass_rate_2: 31.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 1
|
||||
syntax_errors: 5
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 7
|
||||
command: aider --model command-r-plus
|
||||
date: 2024-05-06
|
||||
versions: 0.31.2-dev
|
||||
seconds_per_case: 22.9
|
||||
total_cost: 2.7494
|
||||
|
|
@ -31,13 +31,13 @@ it will work best with models that score well on the benchmarks.
|
|||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% assign edit_sorted = site.data.edit_leaderboard | sort: 'second' | reverse %}
|
||||
{% assign edit_sorted = site.data.edit_leaderboard | sort: 'pass_rate_2' | reverse %}
|
||||
{% for row in edit_sorted %}
|
||||
<tr style="border-bottom: 1px solid #ddd;">
|
||||
<td style="padding: 8px;">{{ row.model }}</td>
|
||||
<td style="padding: 8px; text-align: center;">{{ row.second }}%</td>
|
||||
<td style="padding: 8px; text-align: center;">{{ row.pass_rate_2 }}%</td>
|
||||
<td style="padding: 8px;"><code>{{ row.command }}</code></td>
|
||||
<td style="padding: 8px; text-align: center;">{{ row.format }}</td>
|
||||
<td style="padding: 8px; text-align: center;">{{ row.edit_format }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
|
@ -61,7 +61,7 @@ it will work best with models that score well on the benchmarks.
|
|||
|
||||
{% for row in edit_sorted %}
|
||||
leaderboardData.labels.push('{{ row.model }}');
|
||||
leaderboardData.datasets[0].data.push({{ row.second }});
|
||||
leaderboardData.datasets[0].data.push({{ row.pass_rate_2 }});
|
||||
{% endfor %}
|
||||
|
||||
var leaderboardChart = new Chart(ctx, {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue