This commit is contained in:
Paul Gauthier 2024-09-26 11:21:35 -07:00
parent d375103b64
commit b3e3a5a401
2 changed files with 420 additions and 40 deletions

View file

@ -0,0 +1,418 @@
- dirname: 2024-09-25-21-17-19--senior-sonnet-sonnet-diff
test_cases: 133
model: claude-3.5-sonnet
junior_model: claude-3.5-sonnet
junior_edit_format: diff
edit_format: senior
commit_hash: c18d6a8-dirty
pass_rate_1: 62.4
pass_rate_2: 80.5
percent_cases_well_formed: 100.0
error_outputs: 3
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 183
lazy_comments: 6
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model openrouter/anthropic/claude-3.5-sonnet
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 25.1
total_cost: 4.9502
- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue
test_cases: 133
model: claude-3.5-sonnet
edit_format: diff
commit_hash: 35f21b5
pass_rate_1: 57.1
pass_rate_2: 77.4
percent_cases_well_formed: 99.2
error_outputs: 23
released: 2024-06-20
num_malformed_responses: 4
num_with_malformed_responses: 1
user_asks: 2
lazy_comments: 0
syntax_errors: 1
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --sonnet
date: 2024-07-04
versions: 0.42.1-dev
seconds_per_case: 17.6
total_cost: 3.6346
- dirname: 2024-09-25-21-25-01--senior-o1mini-4o-jr-diff
test_cases: 133
model: o1-mini
junior_model: gpt-4o
junior_edit_format: diff
edit_format: senior
commit_hash: 3f682ed-dirty, 25e833b
pass_rate_1: 51.1
pass_rate_2: 70.7
percent_cases_well_formed: 100.0
error_outputs: 12
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 214
lazy_comments: 6
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model o1-mini
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 23.7
total_cost: 9.3158
- dirname: 2024-09-26-15-05-58--senior-o1mini-deep-jr-whole
test_cases: 133
model: o1-mini
edit_format: senior
commit_hash: 1676653-dirty
junior_model: deepseek
junior_edit_format: whole
pass_rate_1: 51.9
pass_rate_2: 71.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 199
lazy_comments: 11
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model o1-mini
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 48.2
total_cost: 5.6069
- dirname: 2024-09-25-21-33-40--senior-4o-4o-jr-diff
test_cases: 133
model: gpt-4o
junior_model: gpt-4o
junior_edit_format: diff
edit_format: senior
commit_hash: 9f3cd92
pass_rate_1: 56.4
pass_rate_2: 75.2
percent_cases_well_formed: 100.0
error_outputs: 13
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 207
lazy_comments: 8
syntax_errors: 1
indentation_errors: 1
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model gpt-4o
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 18.2
total_cost: 6.0918
- dirname: 2024-09-21-16-45-11--o1-preview-flex-sr-markers
test_cases: 133
model: o1-preview
edit_format: diff
commit_hash: 5493654-dirty
pass_rate_1: 57.9
pass_rate_2: 79.7
percent_cases_well_formed: 93.2
error_outputs: 11
num_malformed_responses: 11
num_with_malformed_responses: 9
user_asks: 3
lazy_comments: 0
syntax_errors: 10
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model o1-preview
date: 2024-09-21
versions: 0.56.1.dev
seconds_per_case: 80.9
total_cost: 63.9190
- dirname: 2024-09-25-21-39-05--senior-o1preview-4o-jr-diff
test_cases: 133
model: o1-preview
junior_model: gpt-4o
junior_edit_format: diff
edit_format: senior
commit_hash: 9f3cd92
pass_rate_1: 63.2
pass_rate_2: 80.5
percent_cases_well_formed: 100.0
error_outputs: 23
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 191
lazy_comments: 2
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 4
command: aider --model o1-preview
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 42.3
total_cost: 39.3766
- dirname: 2024-09-25-21-52-42--senior-o1preview-sonnet-jr-diff
test_cases: 133
model: o1-preview
junior_model: claude-3.5-sonnet
junior_edit_format: diff
edit_format: senior
commit_hash: 9f3cd92
junior_model: claude-3-5-sonnet
pass_rate_1: 60.9
pass_rate_2: 82.7
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 180
lazy_comments: 3
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model o1-preview
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 44.9
total_cost: 37.6192
- dirname: 2024-09-21-16-40-56--o1-mini-flex-sr-markers
test_cases: 36
model: o1-mini
edit_format: diff
commit_hash: 5493654
pass_rate_1: 50.0
pass_rate_2: 61.1
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 3
lazy_comments: 0
syntax_errors: 0
indentation_errors: 1
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model o1-mini
date: 2024-09-21
versions: 0.56.1.dev
seconds_per_case: 26.7
total_cost: 2.4226
- dirname: 2024-09-25-23-12-14--senior-o1mini-deep-jr-diff
test_cases: 133
model: o1-mini
edit_format: senior
commit_hash: 9f3cd92-dirty
junior_model: deepseek
junior_edit_format: diff
pass_rate_1: 48.9
pass_rate_2: 69.2
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 202
lazy_comments: 12
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model o1-mini
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 52.2
total_cost: 5.7927
- dirname: 2024-09-25-23-18-16--senior-o1preview-deep-jr-diff
test_cases: 133
model: o1-preview
edit_format: senior
commit_hash: 9f3cd92-dirty
junior_model: deepseek
junior_edit_format: diff
pass_rate_1: 64.7
pass_rate_2: 80.5
percent_cases_well_formed: 100.0
error_outputs: 5
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 180
lazy_comments: 2
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model o1-preview
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 73.2
total_cost: 35.7887
- dirname: 2024-09-25-23-30-36--senior-o1preview-deep-jr-whole
test_cases: 133
model: o1-preview
edit_format: senior
commit_hash: 9f3cd92-dirty
junior_model: deepseek
junior_edit_format: whole
pass_rate_1: 63.9
pass_rate_2: 85.0
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 181
lazy_comments: 12
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model o1-preview
date: 2024-09-25
versions: 0.57.2.dev
seconds_per_case: 67.4
total_cost: 35.3152
- dirname: 2024-09-26-15-15-17--senior-sonnet-deep-jr-whole
test_cases: 133
model: claude-3.5-sonnet
edit_format: senior
commit_hash: bc1559f-dirty
junior_model: deepseek
junior_edit_format: whole
pass_rate_1: 61.7
pass_rate_2: 78.9
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 184
lazy_comments: 5
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model openrouter/anthropic/claude-3.5-sonnet
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 37.2
total_cost: 2.1510
- dirname: 2024-09-26-15-33-28--costs-gpt4o-diff
test_cases: 133
model: gpt-4o
edit_format: diff
commit_hash: 89aa385-dirty
pass_rate_1: 55.6
pass_rate_2: 71.4
percent_cases_well_formed: 97.7
error_outputs: 5
num_malformed_responses: 5
num_with_malformed_responses: 3
user_asks: 10
lazy_comments: 0
syntax_errors: 0
indentation_errors: 1
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4o
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 9.7
total_cost: 3.8088
- dirname: 2024-09-26-15-41-08--senior-4o-deep-jr-whole
test_cases: 133
model: gpt-4o
edit_format: senior
commit_hash: 89aa385-dirty
junior_model: deepseek
junior_edit_format: whole
pass_rate_1: 60.9
pass_rate_2: 73.7
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 187
lazy_comments: 12
syntax_errors: 5
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 38.0
total_cost: 2.4737
- dirname: 2024-09-26-15-54-08--senior-4o-deep-jr-diff
test_cases: 133
model: gpt-4o
edit_format: senior
commit_hash: 89aa385-dirty
junior_model: deepseek
junior_edit_format: diff
pass_rate_1: 57.1
pass_rate_2: 74.4
percent_cases_well_formed: 100.0
error_outputs: 4
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 192
lazy_comments: 6
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model gpt-4o
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 44.0
total_cost: 2.5498
- dirname: 2024-09-26-16-06-39--senior-sonnet-deep-jr-diff
test_cases: 133
model: claude-3.5-sonnet
edit_format: senior
commit_hash: 89aa385-dirty
junior_model: deepseek
junior_edit_format: diff
pass_rate_1: 61.7
pass_rate_2: 78.9
percent_cases_well_formed: 100.0
error_outputs: 2
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 184
lazy_comments: 2
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model openrouter/anthropic/claude-3.5-sonnet
date: 2024-09-26
versions: 0.57.2.dev
seconds_per_case: 43.2
total_cost: 2.1488

View file

@ -18,7 +18,7 @@ Aider now has experimental support for using two models to complete each coding
Splitting up "code reasoning" and "code editing" has produced SOTA results on
[aider's code editing benchmark](/docs/benchmarks.html#the-benchmark).
Both Sonnet and o1-preview exceed the preivous SOTA when using this
Both Sonnet and o1-preview exceed the previous SOTA when using this
new Senior/Junior approach.
The best result was obtained with
o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 85%!
@ -70,48 +70,10 @@ o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 8
{% for item in group.items %}
labels.push("{{ item.junior_model | default: "(No Junior)" }} {{ item.junior_edit_format | default: item.edit_format }}");
data.push({{ item.pass_rate_2 }});
var bgColor = colorMapping["{{ item.model }}"];
if ("{{ item.junior_model }}" === "deepseek") {
if ("{{ item.junior_edit_format }}" === "whole") {
bgColor = createStripedPattern(bgColor);
} else if ("{{ item.junior_edit_format }}" === "diff") {
bgColor = createPolkaDotPattern(bgColor);
}
}
backgroundColors.push(bgColor);
backgroundColors.push(colorMapping["{{ item.model }}"]);
borderColors.push(borderColorMapping["{{ item.model }}"]);
{% endfor %}
{% endfor %}
function createStripedPattern(color) {
var canvas = document.createElement('canvas');
var ctx = canvas.getContext('2d');
canvas.width = 10;
canvas.height = 10;
ctx.fillStyle = color;
ctx.fillRect(0, 0, 10, 10);
ctx.strokeStyle = 'white';
ctx.lineWidth = 2;
ctx.beginPath();
ctx.moveTo(0, 0);
ctx.lineTo(10, 10);
ctx.stroke();
return ctx.createPattern(canvas, 'repeat');
}
function createPolkaDotPattern(color) {
var canvas = document.createElement('canvas');
var ctx = canvas.getContext('2d');
canvas.width = 10;
canvas.height = 10;
ctx.fillStyle = color;
ctx.fillRect(0, 0, 10, 10);
ctx.fillStyle = 'white';
ctx.beginPath();
ctx.arc(5, 5, 2, 0, Math.PI * 2);
ctx.fill();
return ctx.createPattern(canvas, 'repeat');
}
new Chart(ctx, {
type: 'bar',
data: {