This commit is contained in:
Paul Gauthier 2024-09-26 11:21:35 -07:00
parent d375103b64
commit b3e3a5a401
2 changed files with 420 additions and 40 deletions

View file

@ -0,0 +1,418 @@
- dirname: 2024-09-25-21-17-19--senior-sonnet-sonnet-diff
test_cases: 133
model: claude-3.5-sonnet
junior_model: claude-3.5-sonnet
junior_edit_format: diff
edit_format: senior
commit_hash: c18d6a8-dirty
pass_rate_1: 62.4
pass_rate_2: 80.5
percent_cases_well_formed: 100.0
error_outputs: 3
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 183
lazy_comments: 6
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model openrouter/anthropic/claude-3.5-sonnet
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 25.1
total_cost: 4.9502
- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue
test_cases: 133
model: claude-3.5-sonnet
edit_format: diff
commit_hash: 35f21b5
pass_rate_1: 57.1
pass_rate_2: 77.4
percent_cases_well_formed: 99.2
error_outputs: 23
released: 2024-06-20
num_malformed_responses: 4
num_with_malformed_responses: 1
user_asks: 2
lazy_comments: 0
syntax_errors: 1
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --sonnet
date: 2024-07-04
versions: 0.42.1-dev
seconds_per_case: 17.6
total_cost: 3.6346
- dirname: 2024-09-25-21-25-01--senior-o1mini-4o-jr-diff
test_cases: 133
model: o1-mini
junior_model: gpt-4o
junior_edit_format: diff
edit_format: senior
commit_hash: 3f682ed-dirty, 25e833b
pass_rate_1: 51.1
pass_rate_2: 70.7
percent_cases_well_formed: 100.0
error_outputs: 12
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 214
lazy_comments: 6
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model o1-mini
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 23.7
total_cost: 9.3158
- dirname: 2024-09-26-15-05-58--senior-o1mini-deep-jr-whole
test_cases: 133
model: o1-mini
edit_format: senior
commit_hash: 1676653-dirty
junior_model: deepseek
junior_edit_format: whole
pass_rate_1: 51.9
pass_rate_2: 71.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 199
lazy_comments: 11
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model o1-mini
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 48.2
total_cost: 5.6069
- dirname: 2024-09-25-21-33-40--senior-4o-4o-jr-diff
test_cases: 133
model: gpt-4o
junior_model: gpt-4o
junior_edit_format: diff
edit_format: senior
commit_hash: 9f3cd92
pass_rate_1: 56.4
pass_rate_2: 75.2
percent_cases_well_formed: 100.0
error_outputs: 13
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 207
lazy_comments: 8
syntax_errors: 1
indentation_errors: 1
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model gpt-4o
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 18.2
total_cost: 6.0918
- dirname: 2024-09-21-16-45-11--o1-preview-flex-sr-markers
test_cases: 133
model: o1-preview
edit_format: diff
commit_hash: 5493654-dirty
pass_rate_1: 57.9
pass_rate_2: 79.7
percent_cases_well_formed: 93.2
error_outputs: 11
num_malformed_responses: 11
num_with_malformed_responses: 9
user_asks: 3
lazy_comments: 0
syntax_errors: 10
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model o1-preview
date: 2024-09-21
versions: 0.56.1.dev
seconds_per_case: 80.9
total_cost: 63.9190
- dirname: 2024-09-25-21-39-05--senior-o1preview-4o-jr-diff
test_cases: 133
model: o1-preview
junior_model: gpt-4o
junior_edit_format: diff
edit_format: senior
commit_hash: 9f3cd92
pass_rate_1: 63.2
pass_rate_2: 80.5
percent_cases_well_formed: 100.0
error_outputs: 23
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 191
lazy_comments: 2
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 4
command: aider --model o1-preview
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 42.3
total_cost: 39.3766
- dirname: 2024-09-25-21-52-42--senior-o1preview-sonnet-jr-diff
test_cases: 133
model: o1-preview
junior_model: claude-3.5-sonnet
junior_edit_format: diff
edit_format: senior
commit_hash: 9f3cd92
junior_model: claude-3-5-sonnet
pass_rate_1: 60.9
pass_rate_2: 82.7
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 180
lazy_comments: 3
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model o1-preview
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 44.9
total_cost: 37.6192
- dirname: 2024-09-21-16-40-56--o1-mini-flex-sr-markers
test_cases: 36
model: o1-mini
edit_format: diff
commit_hash: 5493654
pass_rate_1: 50.0
pass_rate_2: 61.1
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 3
lazy_comments: 0
syntax_errors: 0
indentation_errors: 1
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model o1-mini
date: 2024-09-21
versions: 0.56.1.dev
seconds_per_case: 26.7
total_cost: 2.4226
- dirname: 2024-09-25-23-12-14--senior-o1mini-deep-jr-diff
test_cases: 133
model: o1-mini
edit_format: senior
commit_hash: 9f3cd92-dirty
junior_model: deepseek
junior_edit_format: diff
pass_rate_1: 48.9
pass_rate_2: 69.2
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 202
lazy_comments: 12
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model o1-mini
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 52.2
total_cost: 5.7927
- dirname: 2024-09-25-23-18-16--senior-o1preview-deep-jr-diff
test_cases: 133
model: o1-preview
edit_format: senior
commit_hash: 9f3cd92-dirty
junior_model: deepseek
junior_edit_format: diff
pass_rate_1: 64.7
pass_rate_2: 80.5
percent_cases_well_formed: 100.0
error_outputs: 5
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 180
lazy_comments: 2
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model o1-preview
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 73.2
total_cost: 35.7887
- dirname: 2024-09-25-23-30-36--senior-o1preview-deep-jr-whole
test_cases: 133
model: o1-preview
edit_format: senior
commit_hash: 9f3cd92-dirty
junior_model: deepseek
junior_edit_format: whole
pass_rate_1: 63.9
pass_rate_2: 85.0
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 181
lazy_comments: 12
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model o1-preview
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 67.4
total_cost: 35.3152
- dirname: 2024-09-26-15-15-17--senior-sonnet-deep-jr-whole
test_cases: 133
model: claude-3.5-sonnet
edit_format: senior
commit_hash: bc1559f-dirty
junior_model: deepseek
junior_edit_format: whole
pass_rate_1: 61.7
pass_rate_2: 78.9
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 184
lazy_comments: 5
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model openrouter/anthropic/claude-3.5-sonnet
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 37.2
total_cost: 2.1510
- dirname: 2024-09-26-15-33-28--costs-gpt4o-diff
test_cases: 133
model: gpt-4o
edit_format: diff
commit_hash: 89aa385-dirty
pass_rate_1: 55.6
pass_rate_2: 71.4
percent_cases_well_formed: 97.7
error_outputs: 5
num_malformed_responses: 5
num_with_malformed_responses: 3
user_asks: 10
lazy_comments: 0
syntax_errors: 0
indentation_errors: 1
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4o
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 9.7
total_cost: 3.8088
- dirname: 2024-09-26-15-41-08--senior-4o-deep-jr-whole
test_cases: 133
model: gpt-4o
edit_format: senior
commit_hash: 89aa385-dirty
junior_model: deepseek
junior_edit_format: whole
pass_rate_1: 60.9
pass_rate_2: 73.7
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 187
lazy_comments: 12
syntax_errors: 5
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 38.0
total_cost: 2.4737
- dirname: 2024-09-26-15-54-08--senior-4o-deep-jr-diff
test_cases: 133
model: gpt-4o
edit_format: senior
commit_hash: 89aa385-dirty
junior_model: deepseek
junior_edit_format: diff
pass_rate_1: 57.1
pass_rate_2: 74.4
percent_cases_well_formed: 100.0
error_outputs: 4
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 192
lazy_comments: 6
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model gpt-4o
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 44.0
total_cost: 2.5498
- dirname: 2024-09-26-16-06-39--senior-sonnet-deep-jr-diff
test_cases: 133
model: claude-3.5-sonnet
edit_format: senior
commit_hash: 89aa385-dirty
junior_model: deepseek
junior_edit_format: diff
pass_rate_1: 61.7
pass_rate_2: 78.9
percent_cases_well_formed: 100.0
error_outputs: 2
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 184
lazy_comments: 2
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model openrouter/anthropic/claude-3.5-sonnet
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 43.2
total_cost: 2.1488

View file

@ -18,7 +18,7 @@ Aider now has experimental support for using two models to complete each coding
Splitting up "code reasoning" and "code editing" has produced SOTA results on Splitting up "code reasoning" and "code editing" has produced SOTA results on
[aider's code editing benchmark](/docs/benchmarks.html#the-benchmark). [aider's code editing benchmark](/docs/benchmarks.html#the-benchmark).
Both Sonnet and o1-preview exceed the preivous SOTA when using this Both Sonnet and o1-preview exceed the previous SOTA when using this
new Senior/Junior approach. new Senior/Junior approach.
The best result was obtained with The best result was obtained with
o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 85%! o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 85%!
@ -70,48 +70,10 @@ o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 8
{% for item in group.items %} {% for item in group.items %}
labels.push("{{ item.junior_model | default: "(No Junior)" }} {{ item.junior_edit_format | default: item.edit_format }}"); labels.push("{{ item.junior_model | default: "(No Junior)" }} {{ item.junior_edit_format | default: item.edit_format }}");
data.push({{ item.pass_rate_2 }}); data.push({{ item.pass_rate_2 }});
var bgColor = colorMapping["{{ item.model }}"]; backgroundColors.push(colorMapping["{{ item.model }}"]);
if ("{{ item.junior_model }}" === "deepseek") {
if ("{{ item.junior_edit_format }}" === "whole") {
bgColor = createStripedPattern(bgColor);
} else if ("{{ item.junior_edit_format }}" === "diff") {
bgColor = createPolkaDotPattern(bgColor);
}
}
backgroundColors.push(bgColor);
borderColors.push(borderColorMapping["{{ item.model }}"]); borderColors.push(borderColorMapping["{{ item.model }}"]);
{% endfor %} {% endfor %}
{% endfor %} {% endfor %}
function createStripedPattern(color) {
var canvas = document.createElement('canvas');
var ctx = canvas.getContext('2d');
canvas.width = 10;
canvas.height = 10;
ctx.fillStyle = color;
ctx.fillRect(0, 0, 10, 10);
ctx.strokeStyle = 'white';
ctx.lineWidth = 2;
ctx.beginPath();
ctx.moveTo(0, 0);
ctx.lineTo(10, 10);
ctx.stroke();
return ctx.createPattern(canvas, 'repeat');
}
function createPolkaDotPattern(color) {
var canvas = document.createElement('canvas');
var ctx = canvas.getContext('2d');
canvas.width = 10;
canvas.height = 10;
ctx.fillStyle = color;
ctx.fillRect(0, 0, 10, 10);
ctx.fillStyle = 'white';
ctx.beginPath();
ctx.arc(5, 5, 2, 0, Math.PI * 2);
ctx.fill();
return ctx.createPattern(canvas, 'repeat');
}
new Chart(ctx, { new Chart(ctx, {
type: 'bar', type: 'bar',
data: { data: {