diff --git a/aider/website/_data/senior.yml b/aider/website/_data/senior.yml new file mode 100644 index 000000000..253790657 --- /dev/null +++ b/aider/website/_data/senior.yml @@ -0,0 +1,418 @@ +- dirname: 2024-09-25-21-17-19--senior-sonnet-sonnet-diff + test_cases: 133 + model: claude-3.5-sonnet + junior_model: claude-3.5-sonnet + junior_edit_format: diff + edit_format: senior + commit_hash: c18d6a8-dirty + pass_rate_1: 62.4 + pass_rate_2: 80.5 + percent_cases_well_formed: 100.0 + error_outputs: 3 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 183 + lazy_comments: 6 + syntax_errors: 9 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model openrouter/anthropic/claude-3.5-sonnet + date: 2024-09-25 + versions: 0.57.2.dev + seconds_per_case: 25.1 + total_cost: 4.9502 + +- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue + test_cases: 133 + model: claude-3.5-sonnet + edit_format: diff + commit_hash: 35f21b5 + pass_rate_1: 57.1 + pass_rate_2: 77.4 + percent_cases_well_formed: 99.2 + error_outputs: 23 + released: 2024-06-20 + num_malformed_responses: 4 + num_with_malformed_responses: 1 + user_asks: 2 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --sonnet + date: 2024-07-04 + versions: 0.42.1-dev + seconds_per_case: 17.6 + total_cost: 3.6346 + +- dirname: 2024-09-25-21-25-01--senior-o1mini-4o-jr-diff + test_cases: 133 + model: o1-mini + junior_model: gpt-4o + junior_edit_format: diff + edit_format: senior + commit_hash: 3f682ed-dirty, 25e833b + pass_rate_1: 51.1 + pass_rate_2: 70.7 + percent_cases_well_formed: 100.0 + error_outputs: 12 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 214 + lazy_comments: 6 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model o1-mini + date: 2024-09-25 + versions: 0.57.2.dev + 
seconds_per_case: 23.7 + total_cost: 9.3158 + +- dirname: 2024-09-26-15-05-58--senior-o1mini-deep-jr-whole + test_cases: 133 + model: o1-mini + edit_format: senior + commit_hash: 1676653-dirty + junior_model: deepseek + junior_edit_format: whole + pass_rate_1: 51.9 + pass_rate_2: 71.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 199 + lazy_comments: 11 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model o1-mini + date: 2024-09-26 + versions: 0.57.2.dev + seconds_per_case: 48.2 + total_cost: 5.6069 + +- dirname: 2024-09-25-21-33-40--senior-4o-4o-jr-diff + test_cases: 133 + model: gpt-4o + junior_model: gpt-4o + junior_edit_format: diff + edit_format: senior + commit_hash: 9f3cd92 + pass_rate_1: 56.4 + pass_rate_2: 75.2 + percent_cases_well_formed: 100.0 + error_outputs: 13 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 207 + lazy_comments: 8 + syntax_errors: 1 + indentation_errors: 1 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model gpt-4o + date: 2024-09-25 + versions: 0.57.2.dev + seconds_per_case: 18.2 + total_cost: 6.0918 + +- dirname: 2024-09-21-16-45-11--o1-preview-flex-sr-markers + test_cases: 133 + model: o1-preview + edit_format: diff + commit_hash: 5493654-dirty + pass_rate_1: 57.9 + pass_rate_2: 79.7 + percent_cases_well_formed: 93.2 + error_outputs: 11 + num_malformed_responses: 11 + num_with_malformed_responses: 9 + user_asks: 3 + lazy_comments: 0 + syntax_errors: 10 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model o1-preview + date: 2024-09-21 + versions: 0.56.1.dev + seconds_per_case: 80.9 + total_cost: 63.9190 + +- dirname: 2024-09-25-21-39-05--senior-o1preview-4o-jr-diff + test_cases: 133 + model: o1-preview + junior_model: gpt-4o + junior_edit_format: diff + edit_format: senior + commit_hash: 
9f3cd92 + pass_rate_1: 63.2 + pass_rate_2: 80.5 + percent_cases_well_formed: 100.0 + error_outputs: 23 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 191 + lazy_comments: 2 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 4 + command: aider --model o1-preview + date: 2024-09-25 + versions: 0.57.2.dev + seconds_per_case: 42.3 + total_cost: 39.3766 + +- dirname: 2024-09-25-21-52-42--senior-o1preview-sonnet-jr-diff + test_cases: 133 + model: o1-preview + junior_model: claude-3.5-sonnet + junior_edit_format: diff + edit_format: senior + commit_hash: 9f3cd92 + junior_model: claude-3.5-sonnet + pass_rate_1: 60.9 + pass_rate_2: 82.7 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 180 + lazy_comments: 3 + syntax_errors: 9 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model o1-preview + date: 2024-09-25 + versions: 0.57.2.dev + seconds_per_case: 44.9 + total_cost: 37.6192 + +- dirname: 2024-09-21-16-40-56--o1-mini-flex-sr-markers + test_cases: 36 + model: o1-mini + edit_format: diff + commit_hash: 5493654 + pass_rate_1: 50.0 + pass_rate_2: 61.1 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 3 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 1 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model o1-mini + date: 2024-09-21 + versions: 0.56.1.dev + seconds_per_case: 26.7 + total_cost: 2.4226 + +- dirname: 2024-09-25-23-12-14--senior-o1mini-deep-jr-diff + test_cases: 133 + model: o1-mini + edit_format: senior + commit_hash: 9f3cd92-dirty + junior_model: deepseek + junior_edit_format: diff + pass_rate_1: 48.9 + pass_rate_2: 69.2 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 202 + 
lazy_comments: 12 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model o1-mini + date: 2024-09-25 + versions: 0.57.2.dev + seconds_per_case: 52.2 + total_cost: 5.7927 + +- dirname: 2024-09-25-23-18-16--senior-o1preview-deep-jr-diff + test_cases: 133 + model: o1-preview + edit_format: senior + commit_hash: 9f3cd92-dirty + junior_model: deepseek + junior_edit_format: diff + pass_rate_1: 64.7 + pass_rate_2: 80.5 + percent_cases_well_formed: 100.0 + error_outputs: 5 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 180 + lazy_comments: 2 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model o1-preview + date: 2024-09-25 + versions: 0.57.2.dev + seconds_per_case: 73.2 + total_cost: 35.7887 + +- dirname: 2024-09-25-23-30-36--senior-o1preview-deep-jr-whole + test_cases: 133 + model: o1-preview + edit_format: senior + commit_hash: 9f3cd92-dirty + junior_model: deepseek + junior_edit_format: whole + pass_rate_1: 63.9 + pass_rate_2: 85.0 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 181 + lazy_comments: 12 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model o1-preview + date: 2024-09-25 + versions: 0.57.2.dev + seconds_per_case: 67.4 + total_cost: 35.3152 + +- dirname: 2024-09-26-15-15-17--senior-sonnet-deep-jr-whole + test_cases: 133 + model: claude-3.5-sonnet + edit_format: senior + commit_hash: bc1559f-dirty + junior_model: deepseek + junior_edit_format: whole + pass_rate_1: 61.7 + pass_rate_2: 78.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 184 + lazy_comments: 5 + syntax_errors: 9 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model 
openrouter/anthropic/claude-3.5-sonnet + date: 2024-09-26 + versions: 0.57.2.dev + seconds_per_case: 37.2 + total_cost: 2.1510 + +- dirname: 2024-09-26-15-33-28--costs-gpt4o-diff + test_cases: 133 + model: gpt-4o + edit_format: diff + commit_hash: 89aa385-dirty + pass_rate_1: 55.6 + pass_rate_2: 71.4 + percent_cases_well_formed: 97.7 + error_outputs: 5 + num_malformed_responses: 5 + num_with_malformed_responses: 3 + user_asks: 10 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 1 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4o + date: 2024-09-26 + versions: 0.57.2.dev + seconds_per_case: 9.7 + total_cost: 3.8088 + +- dirname: 2024-09-26-15-41-08--senior-4o-deep-jr-whole + test_cases: 133 + model: gpt-4o + edit_format: senior + commit_hash: 89aa385-dirty + junior_model: deepseek + junior_edit_format: whole + pass_rate_1: 60.9 + pass_rate_2: 73.7 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 187 + lazy_comments: 12 + syntax_errors: 5 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o + date: 2024-09-26 + versions: 0.57.2.dev + seconds_per_case: 38.0 + total_cost: 2.4737 + +- dirname: 2024-09-26-15-54-08--senior-4o-deep-jr-diff + test_cases: 133 + model: gpt-4o + edit_format: senior + commit_hash: 89aa385-dirty + junior_model: deepseek + junior_edit_format: diff + pass_rate_1: 57.1 + pass_rate_2: 74.4 + percent_cases_well_formed: 100.0 + error_outputs: 4 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 192 + lazy_comments: 6 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model gpt-4o + date: 2024-09-26 + versions: 0.57.2.dev + seconds_per_case: 44.0 + total_cost: 2.5498 + +- dirname: 2024-09-26-16-06-39--senior-sonnet-deep-jr-diff + test_cases: 133 + model: claude-3.5-sonnet + edit_format: senior 
+ commit_hash: 89aa385-dirty + junior_model: deepseek + junior_edit_format: diff + pass_rate_1: 61.7 + pass_rate_2: 78.9 + percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 184 + lazy_comments: 2 + syntax_errors: 9 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model openrouter/anthropic/claude-3.5-sonnet + date: 2024-09-26 + versions: 0.57.2.dev + seconds_per_case: 43.2 + total_cost: 2.1488 \ No newline at end of file diff --git a/aider/website/_posts/2024-09-26-senior-junior.md b/aider/website/_posts/2024-09-26-senior-junior.md index b34ed8456..abebcbfdf 100644 --- a/aider/website/_posts/2024-09-26-senior-junior.md +++ b/aider/website/_posts/2024-09-26-senior-junior.md @@ -18,7 +18,7 @@ Aider now has experimental support for using two models to complete each coding Splitting up "code reasoning" and "code editing" has produced SOTA results on [aider's code editing benchmark](/docs/benchmarks.html#the-benchmark). -Both Sonnet and o1-preview exceed the preivous SOTA when using this +Both Sonnet and o1-preview exceed the previous SOTA when using this new Senior/Junior approach. The best result was obtained with o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 85%! 
@@ -70,48 +70,10 @@ o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 8 {% for item in group.items %} labels.push("{{ item.junior_model | default: "(No Junior)" }} {{ item.junior_edit_format | default: item.edit_format }}"); data.push({{ item.pass_rate_2 }}); - var bgColor = colorMapping["{{ item.model }}"]; - if ("{{ item.junior_model }}" === "deepseek") { - if ("{{ item.junior_edit_format }}" === "whole") { - bgColor = createStripedPattern(bgColor); - } else if ("{{ item.junior_edit_format }}" === "diff") { - bgColor = createPolkaDotPattern(bgColor); - } - } - backgroundColors.push(bgColor); + backgroundColors.push(colorMapping["{{ item.model }}"]); borderColors.push(borderColorMapping["{{ item.model }}"]); {% endfor %} {% endfor %} - - function createStripedPattern(color) { - var canvas = document.createElement('canvas'); - var ctx = canvas.getContext('2d'); - canvas.width = 10; - canvas.height = 10; - ctx.fillStyle = color; - ctx.fillRect(0, 0, 10, 10); - ctx.strokeStyle = 'white'; - ctx.lineWidth = 2; - ctx.beginPath(); - ctx.moveTo(0, 0); - ctx.lineTo(10, 10); - ctx.stroke(); - return ctx.createPattern(canvas, 'repeat'); - } - - function createPolkaDotPattern(color) { - var canvas = document.createElement('canvas'); - var ctx = canvas.getContext('2d'); - canvas.width = 10; - canvas.height = 10; - ctx.fillStyle = color; - ctx.fillRect(0, 0, 10, 10); - ctx.fillStyle = 'white'; - ctx.beginPath(); - ctx.arc(5, 5, 2, 0, Math.PI * 2); - ctx.fill(); - return ctx.createPattern(canvas, 'repeat'); - } new Chart(ctx, { type: 'bar', data: {