better

2025-06-01 10:14:59 +00:00 · 2024-09-26 11:21:35 -07:00 · 2024-09-26 11:21:35 -07:00 · b3e3a5a401
commit b3e3a5a401
parent d375103b64
2 changed files with 420 additions and 40 deletions
--- a/aider/website/_data/senior.yml
+++ b/aider/website/_data/senior.yml
@ -0,0 +1,418 @@
+- dirname: 2024-09-25-21-17-19--senior-sonnet-sonnet-diff
+  test_cases: 133
+  model: claude-3.5-sonnet
+  junior_model: claude-3.5-sonnet
+  junior_edit_format: diff
+  edit_format: senior
+  commit_hash: c18d6a8-dirty
+  pass_rate_1: 62.4
+  pass_rate_2: 80.5
+  percent_cases_well_formed: 100.0
+  error_outputs: 3
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 183
+  lazy_comments: 6
+  syntax_errors: 9
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  command: aider --model openrouter/anthropic/claude-3.5-sonnet
+  date: 2024-09-25
+  versions: 0.57.2.dev
+  seconds_per_case: 25.1
+  total_cost: 4.9502
+  
+- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue
+  test_cases: 133
+  model: claude-3.5-sonnet
+  edit_format: diff
+  commit_hash: 35f21b5
+  pass_rate_1: 57.1
+  pass_rate_2: 77.4
+  percent_cases_well_formed: 99.2
+  error_outputs: 23
+  released: 2024-06-20
+  num_malformed_responses: 4
+  num_with_malformed_responses: 1
+  user_asks: 2
+  lazy_comments: 0
+  syntax_errors: 1
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 1
+  command: aider --sonnet
+  date: 2024-07-04
+  versions: 0.42.1-dev
+  seconds_per_case: 17.6
+  total_cost: 3.6346
+    
+- dirname: 2024-09-25-21-25-01--senior-o1mini-4o-jr-diff
+  test_cases: 133
+  model: o1-mini
+  junior_model: gpt-4o
+  junior_edit_format: diff
+  edit_format: senior
+  commit_hash: 3f682ed-dirty, 25e833b
+  pass_rate_1: 51.1
+  pass_rate_2: 70.7
+  percent_cases_well_formed: 100.0
+  error_outputs: 12
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 214
+  lazy_comments: 6
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 1
+  command: aider --model o1-mini
+  date: 2024-09-25
+  versions: 0.57.2.dev
+  seconds_per_case: 23.7
+  total_cost: 9.3158
+
+- dirname: 2024-09-26-15-05-58--senior-o1mini-deep-jr-whole
+  test_cases: 133
+  model: o1-mini
+  edit_format: senior
+  commit_hash: 1676653-dirty
+  junior_model: deepseek
+  junior_edit_format: whole
+  pass_rate_1: 51.9
+  pass_rate_2: 71.4
+  percent_cases_well_formed: 100.0
+  error_outputs: 0
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 199
+  lazy_comments: 11
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  command: aider --model o1-mini
+  date: 2024-09-26
+  versions: 0.57.2.dev
+  seconds_per_case: 48.2
+  total_cost: 5.6069
+                                                
+- dirname: 2024-09-25-21-33-40--senior-4o-4o-jr-diff
+  test_cases: 133
+  model: gpt-4o
+  junior_model: gpt-4o
+  junior_edit_format: diff
+  edit_format: senior
+  commit_hash: 9f3cd92
+  pass_rate_1: 56.4
+  pass_rate_2: 75.2
+  percent_cases_well_formed: 100.0
+  error_outputs: 13
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 207
+  lazy_comments: 8
+  syntax_errors: 1
+  indentation_errors: 1
+  exhausted_context_windows: 0
+  test_timeouts: 3
+  command: aider --model gpt-4o
+  date: 2024-09-25
+  versions: 0.57.2.dev
+  seconds_per_case: 18.2
+  total_cost: 6.0918
+  
+- dirname: 2024-09-21-16-45-11--o1-preview-flex-sr-markers
+  test_cases: 133
+  model: o1-preview
+  edit_format: diff
+  commit_hash: 5493654-dirty
+  pass_rate_1: 57.9
+  pass_rate_2: 79.7
+  percent_cases_well_formed: 93.2
+  error_outputs: 11
+  num_malformed_responses: 11
+  num_with_malformed_responses: 9
+  user_asks: 3
+  lazy_comments: 0
+  syntax_errors: 10
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 1
+  command: aider --model o1-preview
+  date: 2024-09-21
+  versions: 0.56.1.dev
+  seconds_per_case: 80.9
+  total_cost: 63.9190
+  
+- dirname: 2024-09-25-21-39-05--senior-o1preview-4o-jr-diff
+  test_cases: 133
+  model: o1-preview
+  junior_model: gpt-4o
+  junior_edit_format: diff
+  edit_format: senior
+  commit_hash: 9f3cd92
+  pass_rate_1: 63.2
+  pass_rate_2: 80.5
+  percent_cases_well_formed: 100.0
+  error_outputs: 23
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 191
+  lazy_comments: 2
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 4
+  command: aider --model o1-preview
+  date: 2024-09-25
+  versions: 0.57.2.dev
+  seconds_per_case: 42.3
+  total_cost: 39.3766
+  
+- dirname: 2024-09-25-21-52-42--senior-o1preview-sonnet-jr-diff
+  test_cases: 133
+  model: o1-preview
+  junior_model: claude-3.5-sonnet
+  junior_edit_format: diff
+  edit_format: senior
+  commit_hash: 9f3cd92
+  # junior_model: claude-3-5-sonnet  (duplicate key; the junior_model value set earlier in this record, claude-3.5-sonnet, is kept)
+  pass_rate_1: 60.9
+  pass_rate_2: 82.7
+  percent_cases_well_formed: 100.0
+  error_outputs: 1
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 180
+  lazy_comments: 3
+  syntax_errors: 9
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 3
+  command: aider --model o1-preview
+  date: 2024-09-25
+  versions: 0.57.2.dev
+  seconds_per_case: 44.9
+  total_cost: 37.6192
+  
+- dirname: 2024-09-21-16-40-56--o1-mini-flex-sr-markers
+  test_cases: 36
+  model: o1-mini
+  edit_format: diff
+  commit_hash: 5493654
+  pass_rate_1: 50.0
+  pass_rate_2: 61.1
+  percent_cases_well_formed: 100.0
+  error_outputs: 0
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 3
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 1
+  exhausted_context_windows: 0
+  test_timeouts: 0
+  command: aider --model o1-mini
+  date: 2024-09-21
+  versions: 0.56.1.dev
+  seconds_per_case: 26.7
+  total_cost: 2.4226
+                                            
+- dirname: 2024-09-25-23-12-14--senior-o1mini-deep-jr-diff
+  test_cases: 133
+  model: o1-mini
+  edit_format: senior
+  commit_hash: 9f3cd92-dirty
+  junior_model: deepseek
+  junior_edit_format: diff
+  pass_rate_1: 48.9
+  pass_rate_2: 69.2
+  percent_cases_well_formed: 100.0
+  error_outputs: 1
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 202
+  lazy_comments: 12
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  command: aider --model o1-mini
+  date: 2024-09-25
+  versions: 0.57.2.dev
+  seconds_per_case: 52.2
+  total_cost: 5.7927
+  
+- dirname: 2024-09-25-23-18-16--senior-o1preview-deep-jr-diff
+  test_cases: 133
+  model: o1-preview
+  edit_format: senior
+  commit_hash: 9f3cd92-dirty
+  junior_model: deepseek
+  junior_edit_format: diff
+  pass_rate_1: 64.7
+  pass_rate_2: 80.5
+  percent_cases_well_formed: 100.0
+  error_outputs: 5
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 180
+  lazy_comments: 2
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 1
+  command: aider --model o1-preview
+  date: 2024-09-25
+  versions: 0.57.2.dev
+  seconds_per_case: 73.2
+  total_cost: 35.7887
+  
+- dirname: 2024-09-25-23-30-36--senior-o1preview-deep-jr-whole
+  test_cases: 133
+  model: o1-preview
+  edit_format: senior
+  commit_hash: 9f3cd92-dirty
+  junior_model: deepseek
+  junior_edit_format: whole
+  pass_rate_1: 63.9
+  pass_rate_2: 85.0
+  percent_cases_well_formed: 100.0
+  error_outputs: 0
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 181
+  lazy_comments: 12
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  command: aider --model o1-preview
+  date: 2024-09-25
+  versions: 0.57.2.dev
+  seconds_per_case: 67.4
+  total_cost: 35.3152
+
+- dirname: 2024-09-26-15-15-17--senior-sonnet-deep-jr-whole
+  test_cases: 133
+  model: claude-3.5-sonnet
+  edit_format: senior
+  commit_hash: bc1559f-dirty
+  junior_model: deepseek
+  junior_edit_format: whole
+  pass_rate_1: 61.7
+  pass_rate_2: 78.9
+  percent_cases_well_formed: 100.0
+  error_outputs: 0
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 184
+  lazy_comments: 5
+  syntax_errors: 9
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 3
+  command: aider --model openrouter/anthropic/claude-3.5-sonnet
+  date: 2024-09-26
+  versions: 0.57.2.dev
+  seconds_per_case: 37.2
+  total_cost: 2.1510
+
+- dirname: 2024-09-26-15-33-28--costs-gpt4o-diff
+  test_cases: 133
+  model: gpt-4o
+  edit_format: diff
+  commit_hash: 89aa385-dirty
+  pass_rate_1: 55.6
+  pass_rate_2: 71.4
+  percent_cases_well_formed: 97.7
+  error_outputs: 5
+  num_malformed_responses: 5
+  num_with_malformed_responses: 3
+  user_asks: 10
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 1
+  exhausted_context_windows: 0
+  test_timeouts: 0
+  command: aider --model gpt-4o
+  date: 2024-09-26
+  versions: 0.57.2.dev
+  seconds_per_case: 9.7
+  total_cost: 3.8088
+
+- dirname: 2024-09-26-15-41-08--senior-4o-deep-jr-whole
+  test_cases: 133
+  model: gpt-4o
+  edit_format: senior
+  commit_hash: 89aa385-dirty
+  junior_model: deepseek
+  junior_edit_format: whole
+  pass_rate_1: 60.9
+  pass_rate_2: 73.7
+  percent_cases_well_formed: 100.0
+  error_outputs: 0
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 187
+  lazy_comments: 12
+  syntax_errors: 5
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 1
+  command: aider --model gpt-4o
+  date: 2024-09-26
+  versions: 0.57.2.dev
+  seconds_per_case: 38.0
+  total_cost: 2.4737
+
+- dirname: 2024-09-26-15-54-08--senior-4o-deep-jr-diff
+  test_cases: 133
+  model: gpt-4o
+  edit_format: senior
+  commit_hash: 89aa385-dirty
+  junior_model: deepseek
+  junior_edit_format: diff
+  pass_rate_1: 57.1
+  pass_rate_2: 74.4
+  percent_cases_well_formed: 100.0
+  error_outputs: 4
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 192
+  lazy_comments: 6
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  command: aider --model gpt-4o
+  date: 2024-09-26
+  versions: 0.57.2.dev
+  seconds_per_case: 44.0
+  total_cost: 2.5498
+
+- dirname: 2024-09-26-16-06-39--senior-sonnet-deep-jr-diff
+  test_cases: 133
+  model: claude-3.5-sonnet
+  edit_format: senior
+  commit_hash: 89aa385-dirty
+  junior_model: deepseek
+  junior_edit_format: diff
+  pass_rate_1: 61.7
+  pass_rate_2: 78.9
+  percent_cases_well_formed: 100.0
+  error_outputs: 2
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 184
+  lazy_comments: 2
+  syntax_errors: 9
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  command: aider --model openrouter/anthropic/claude-3.5-sonnet
+  date: 2024-09-26
+  versions: 0.57.2.dev
+  seconds_per_case: 43.2
+  total_cost: 2.1488
--- a/aider/website/_posts/2024-09-26-senior-junior.md
+++ b/aider/website/_posts/2024-09-26-senior-junior.md
@ -18,7 +18,7 @@ Aider now has experimental support for using two models to complete each coding

 Splitting up "code reasoning" and "code editing" has produced SOTA results on
 [aider's code editing benchmark](/docs/benchmarks.html#the-benchmark).
-Both Sonnet and o1-preview exceed the preivous SOTA when using this
+Both Sonnet and o1-preview exceed the previous SOTA when using this
 new Senior/Junior approach.
 The best result was obtained with
 o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 85%!
@ -70,48 +70,10 @@ o1-preview as Senior and Deepseek as Junior, raising the SOTA from 79.7% up to 8
      {% for item in group.items %}
        labels.push("{{ item.junior_model | default: "(No Junior)" }} {{ item.junior_edit_format | default: item.edit_format }}");
        data.push({{ item.pass_rate_2 }});
-        var bgColor = colorMapping["{{ item.model }}"];
-        if ("{{ item.junior_model }}" === "deepseek") {
-          if ("{{ item.junior_edit_format }}" === "whole") {
-            bgColor = createStripedPattern(bgColor);
-          } else if ("{{ item.junior_edit_format }}" === "diff") {
-            bgColor = createPolkaDotPattern(bgColor);
-          }
-        }
-        backgroundColors.push(bgColor);
+        backgroundColors.push(colorMapping["{{ item.model }}"]);
        borderColors.push(borderColorMapping["{{ item.model }}"]);
      {% endfor %}
    {% endfor %}
-
-    function createStripedPattern(color) {
-      var canvas = document.createElement('canvas');
-      var ctx = canvas.getContext('2d');
-      canvas.width = 10;
-      canvas.height = 10;
-      ctx.fillStyle = color;
-      ctx.fillRect(0, 0, 10, 10);
-      ctx.strokeStyle = 'white';
-      ctx.lineWidth = 2;
-      ctx.beginPath();
-      ctx.moveTo(0, 0);
-      ctx.lineTo(10, 10);
-      ctx.stroke();
-      return ctx.createPattern(canvas, 'repeat');
-    }
-
-    function createPolkaDotPattern(color) {
-      var canvas = document.createElement('canvas');
-      var ctx = canvas.getContext('2d');
-      canvas.width = 10;
-      canvas.height = 10;
-      ctx.fillStyle = color;
-      ctx.fillRect(0, 0, 10, 10);
-      ctx.fillStyle = 'white';
-      ctx.beginPath();
-      ctx.arc(5, 5, 2, 0, Math.PI * 2);
-      ctx.fill();
-      return ctx.createPattern(canvas, 'repeat');
-    }
    new Chart(ctx, {
      type: 'bar',
      data: {