mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 10:14:59 +00:00
better
This commit is contained in:
parent
d375103b64
commit
b3e3a5a401
2 changed files with 420 additions and 40 deletions
418
aider/website/_data/senior.yml
Normal file
418
aider/website/_data/senior.yml
Normal file
|
@ -0,0 +1,418 @@
|
|||
- dirname: 2024-09-25-21-17-19--senior-sonnet-sonnet-diff
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
junior_model: claude-3.5-sonnet
|
||||
junior_edit_format: diff
|
||||
edit_format: senior
|
||||
commit_hash: c18d6a8-dirty
|
||||
pass_rate_1: 62.4
|
||||
pass_rate_2: 80.5
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 3
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 183
|
||||
lazy_comments: 6
|
||||
syntax_errors: 9
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model openrouter/anthropic/claude-3.5-sonnet
|
||||
date: 2024-09-25
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 25.1
|
||||
total_cost: 4.9502
|
||||
|
||||
- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: diff
|
||||
commit_hash: 35f21b5
|
||||
pass_rate_1: 57.1
|
||||
pass_rate_2: 77.4
|
||||
percent_cases_well_formed: 99.2
|
||||
error_outputs: 23
|
||||
released: 2024-06-20
|
||||
num_malformed_responses: 4
|
||||
num_with_malformed_responses: 1
|
||||
user_asks: 2
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --sonnet
|
||||
date: 2024-07-04
|
||||
versions: 0.42.1-dev
|
||||
seconds_per_case: 17.6
|
||||
total_cost: 3.6346
|
||||
|
||||
- dirname: 2024-09-25-21-25-01--senior-o1mini-4o-jr-diff
|
||||
test_cases: 133
|
||||
model: o1-mini
|
||||
junior_model: gpt-4o
|
||||
junior_edit_format: diff
|
||||
edit_format: senior
|
||||
commit_hash: 3f682ed-dirty, 25e833b
|
||||
pass_rate_1: 51.1
|
||||
pass_rate_2: 70.7
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 12
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 214
|
||||
lazy_comments: 6
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model o1-mini
|
||||
date: 2024-09-25
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 23.7
|
||||
total_cost: 9.3158
|
||||
|
||||
- dirname: 2024-09-26-15-05-58--senior-o1mini-deep-jr-whole
|
||||
test_cases: 133
|
||||
model: o1-mini
|
||||
edit_format: senior
|
||||
commit_hash: 1676653-dirty
|
||||
junior_model: deepseek
|
||||
junior_edit_format: whole
|
||||
pass_rate_1: 51.9
|
||||
pass_rate_2: 71.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 199
|
||||
lazy_comments: 11
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model o1-mini
|
||||
date: 2024-09-26
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 48.2
|
||||
total_cost: 5.6069
|
||||
|
||||
- dirname: 2024-09-25-21-33-40--senior-4o-4o-jr-diff
|
||||
test_cases: 133
|
||||
model: gpt-4o
|
||||
junior_model: gpt-4o
|
||||
junior_edit_format: diff
|
||||
edit_format: senior
|
||||
commit_hash: 9f3cd92
|
||||
pass_rate_1: 56.4
|
||||
pass_rate_2: 75.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 13
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 207
|
||||
lazy_comments: 8
|
||||
syntax_errors: 1
|
||||
indentation_errors: 1
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model gpt-4o
|
||||
date: 2024-09-25
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 18.2
|
||||
total_cost: 6.0918
|
||||
|
||||
- dirname: 2024-09-21-16-45-11--o1-preview-flex-sr-markers
|
||||
test_cases: 133
|
||||
model: o1-preview
|
||||
edit_format: diff
|
||||
commit_hash: 5493654-dirty
|
||||
pass_rate_1: 57.9
|
||||
pass_rate_2: 79.7
|
||||
percent_cases_well_formed: 93.2
|
||||
error_outputs: 11
|
||||
num_malformed_responses: 11
|
||||
num_with_malformed_responses: 9
|
||||
user_asks: 3
|
||||
lazy_comments: 0
|
||||
syntax_errors: 10
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model o1-preview
|
||||
date: 2024-09-21
|
||||
versions: 0.56.1.dev
|
||||
seconds_per_case: 80.9
|
||||
total_cost: 63.9190
|
||||
|
||||
- dirname: 2024-09-25-21-39-05--senior-o1preview-4o-jr-diff
|
||||
test_cases: 133
|
||||
model: o1-preview
|
||||
junior_model: gpt-4o
|
||||
junior_edit_format: diff
|
||||
edit_format: senior
|
||||
commit_hash: 9f3cd92
|
||||
pass_rate_1: 63.2
|
||||
pass_rate_2: 80.5
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 23
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 191
|
||||
lazy_comments: 2
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 4
|
||||
command: aider --model o1-preview
|
||||
date: 2024-09-25
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 42.3
|
||||
total_cost: 39.3766
|
||||
|
||||
- dirname: 2024-09-25-21-52-42--senior-o1preview-sonnet-jr-diff
|
||||
test_cases: 133
|
||||
model: o1-preview
|
||||
junior_model: claude-3.5-sonnet
|
||||
junior_edit_format: diff
|
||||
edit_format: senior
|
||||
commit_hash: 9f3cd92
|
||||
# junior_model: claude-3-5-sonnet  (duplicate key; this entry already sets junior_model: claude-3.5-sonnet above)
|
||||
pass_rate_1: 60.9
|
||||
pass_rate_2: 82.7
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 180
|
||||
lazy_comments: 3
|
||||
syntax_errors: 9
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model o1-preview
|
||||
date: 2024-09-25
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 44.9
|
||||
total_cost: 37.6192
|
||||
|
||||
- dirname: 2024-09-21-16-40-56--o1-mini-flex-sr-markers
|
||||
test_cases: 36
|
||||
model: o1-mini
|
||||
edit_format: diff
|
||||
commit_hash: 5493654
|
||||
pass_rate_1: 50.0
|
||||
pass_rate_2: 61.1
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 3
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 1
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model o1-mini
|
||||
date: 2024-09-21
|
||||
versions: 0.56.1.dev
|
||||
seconds_per_case: 26.7
|
||||
total_cost: 2.4226
|
||||
|
||||
- dirname: 2024-09-25-23-12-14--senior-o1mini-deep-jr-diff
|
||||
test_cases: 133
|
||||
model: o1-mini
|
||||
edit_format: senior
|
||||
commit_hash: 9f3cd92-dirty
|
||||
junior_model: deepseek
|
||||
junior_edit_format: diff
|
||||
pass_rate_1: 48.9
|
||||
pass_rate_2: 69.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 202
|
||||
lazy_comments: 12
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model o1-mini
|
||||
date: 2024-09-25
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 52.2
|
||||
total_cost: 5.7927
|
||||
|
||||
- dirname: 2024-09-25-23-18-16--senior-o1preview-deep-jr-diff
|
||||
test_cases: 133
|
||||
model: o1-preview
|
||||
edit_format: senior
|
||||
commit_hash: 9f3cd92-dirty
|
||||
junior_model: deepseek
|
||||
junior_edit_format: diff
|
||||
pass_rate_1: 64.7
|
||||
pass_rate_2: 80.5
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 5
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 180
|
||||
lazy_comments: 2
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model o1-preview
|
||||
date: 2024-09-25
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 73.2
|
||||
total_cost: 35.7887
|
||||
|
||||
- dirname: 2024-09-25-23-30-36--senior-o1preview-deep-jr-whole
|
||||
test_cases: 133
|
||||
model: o1-preview
|
||||
edit_format: senior
|
||||
commit_hash: 9f3cd92-dirty
|
||||
junior_model: deepseek
|
||||
junior_edit_format: whole
|
||||
pass_rate_1: 63.9
|
||||
pass_rate_2: 85.0
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 181
|
||||
lazy_comments: 12
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model o1-preview
|
||||
date: 2024-09-25
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 67.4
|
||||
total_cost: 35.3152
|
||||
|
||||
- dirname: 2024-09-26-15-15-17--senior-sonnet-deep-jr-whole
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: senior
|
||||
commit_hash: bc1559f-dirty
|
||||
junior_model: deepseek
|
||||
junior_edit_format: whole
|
||||
pass_rate_1: 61.7
|
||||
pass_rate_2: 78.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 184
|
||||
lazy_comments: 5
|
||||
syntax_errors: 9
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model openrouter/anthropic/claude-3.5-sonnet
|
||||
date: 2024-09-26
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 37.2
|
||||
total_cost: 2.1510
|
||||
|
||||
- dirname: 2024-09-26-15-33-28--costs-gpt4o-diff
|
||||
test_cases: 133
|
||||
model: gpt-4o
|
||||
edit_format: diff
|
||||
commit_hash: 89aa385-dirty
|
||||
pass_rate_1: 55.6
|
||||
pass_rate_2: 71.4
|
||||
percent_cases_well_formed: 97.7
|
||||
error_outputs: 5
|
||||
num_malformed_responses: 5
|
||||
num_with_malformed_responses: 3
|
||||
user_asks: 10
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 1
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4o
|
||||
date: 2024-09-26
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 9.7
|
||||
total_cost: 3.8088
|
||||
|
||||
- dirname: 2024-09-26-15-41-08--senior-4o-deep-jr-whole
|
||||
test_cases: 133
|
||||
model: gpt-4o
|
||||
edit_format: senior
|
||||
commit_hash: 89aa385-dirty
|
||||
junior_model: deepseek
|
||||
junior_edit_format: whole
|
||||
pass_rate_1: 60.9
|
||||
pass_rate_2: 73.7
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 187
|
||||
lazy_comments: 12
|
||||
syntax_errors: 5
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o
|
||||
date: 2024-09-26
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 38.0
|
||||
total_cost: 2.4737
|
||||
|
||||
- dirname: 2024-09-26-15-54-08--senior-4o-deep-jr-diff
|
||||
test_cases: 133
|
||||
model: gpt-4o
|
||||
edit_format: senior
|
||||
commit_hash: 89aa385-dirty
|
||||
junior_model: deepseek
|
||||
junior_edit_format: diff
|
||||
pass_rate_1: 57.1
|
||||
pass_rate_2: 74.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 4
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 192
|
||||
lazy_comments: 6
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model gpt-4o
|
||||
date: 2024-09-26
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 44.0
|
||||
total_cost: 2.5498
|
||||
|
||||
- dirname: 2024-09-26-16-06-39--senior-sonnet-deep-jr-diff
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: senior
|
||||
commit_hash: 89aa385-dirty
|
||||
junior_model: deepseek
|
||||
junior_edit_format: diff
|
||||
pass_rate_1: 61.7
|
||||
pass_rate_2: 78.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 184
|
||||
lazy_comments: 2
|
||||
syntax_errors: 9
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model openrouter/anthropic/claude-3.5-sonnet
|
||||
date: 2024-09-26
|
||||
versions: 0.57.2.dev
|
||||
seconds_per_case: 43.2
|
||||
total_cost: 2.1488
|
|
@ -18,7 +18,7 @@ Aider now has experimental support for using two models to complete each coding
|
|||
|
||||
Splitting up "code reasoning" and "code editing" has produced SOTA results on
|
||||
[aider's code editing benchmark](/docs/benchmarks.html#the-benchmark).
|
||||
Both Sonnet and o1-preview exceed the preivous SOTA when using this
|
||||
Both Sonnet and o1-preview exceed the previous SOTA when using this
|
||||
new Senior/Junior approach.
|
||||
The best result was obtained with
|
||||
o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 85%!
|
||||
|
@ -70,48 +70,10 @@ o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 8
|
|||
{% for item in group.items %}
|
||||
labels.push("{{ item.junior_model | default: "(No Junior)" }} {{ item.junior_edit_format | default: item.edit_format }}");
|
||||
data.push({{ item.pass_rate_2 }});
|
||||
var bgColor = colorMapping["{{ item.model }}"];
|
||||
if ("{{ item.junior_model }}" === "deepseek") {
|
||||
if ("{{ item.junior_edit_format }}" === "whole") {
|
||||
bgColor = createStripedPattern(bgColor);
|
||||
} else if ("{{ item.junior_edit_format }}" === "diff") {
|
||||
bgColor = createPolkaDotPattern(bgColor);
|
||||
}
|
||||
}
|
||||
backgroundColors.push(bgColor);
|
||||
backgroundColors.push(colorMapping["{{ item.model }}"]);
|
||||
borderColors.push(borderColorMapping["{{ item.model }}"]);
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
|
||||
/**
 * Build a repeating canvas fill: a 10x10 tile painted in `color` with a
 * white diagonal stroke across it.
 *
 * @param {string} color - Value assigned to the tile's background `fillStyle`.
 * @returns {CanvasPattern|null} Result of `createPattern(tile, 'repeat')`.
 */
function createStripedPattern(color) {
  const TILE_SIZE = 10;
  const tile = document.createElement('canvas');
  const pen = tile.getContext('2d');
  tile.width = TILE_SIZE;
  tile.height = TILE_SIZE;

  // Solid background in the base color.
  pen.fillStyle = color;
  pen.fillRect(0, 0, TILE_SIZE, TILE_SIZE);

  // White stroke from the top-left corner to the bottom-right corner.
  pen.strokeStyle = 'white';
  pen.lineWidth = 2;
  pen.beginPath();
  pen.moveTo(0, 0);
  pen.lineTo(TILE_SIZE, TILE_SIZE);
  pen.stroke();

  return pen.createPattern(tile, 'repeat');
}
|
||||
|
||||
/**
 * Build a repeating canvas fill: a 10x10 tile painted in `color` with a
 * white dot of radius 2 at its center.
 *
 * @param {string} color - Value assigned to the tile's background `fillStyle`.
 * @returns {CanvasPattern|null} Result of `createPattern(tile, 'repeat')`.
 */
function createPolkaDotPattern(color) {
  const TILE_SIZE = 10;
  const DOT_RADIUS = 2;
  const tile = document.createElement('canvas');
  const pen = tile.getContext('2d');
  tile.width = TILE_SIZE;
  tile.height = TILE_SIZE;

  // Solid background in the base color.
  pen.fillStyle = color;
  pen.fillRect(0, 0, TILE_SIZE, TILE_SIZE);

  // Filled white circle centered at (5, 5).
  pen.fillStyle = 'white';
  pen.beginPath();
  pen.arc(5, 5, DOT_RADIUS, 0, Math.PI * 2);
  pen.fill();

  return pen.createPattern(tile, 'repeat');
}
|
||||
new Chart(ctx, {
|
||||
type: 'bar',
|
||||
data: {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue