copy

2025-05-29 08:44:59 +00:00 · 2025-01-24 08:22:13 -08:00 · 2025-01-24 08:22:13 -08:00 · d7bb80468b
commit d7bb80468b
parent 9d6a692054
3 changed files with 153 additions and 3 deletions
--- a/aider/models.py
+++ b/aider/models.py
@ -841,7 +841,7 @@ MODEL_SETTINGS = [
        use_repo_map=True,
        streaming=False,
        use_temperature=False,
-        # extra_params=dict(extra_body=dict(reasoning_effort="high")),
+        extra_params=dict(extra_body=dict(reasoning_effort="high")),
    ),
    ModelSettings(
        "openrouter/qwen/qwen-2.5-coder-32b-instruct",
--- a/aider/website/_data/r1_architect.yml
+++ b/aider/website/_data/r1_architect.yml
@ -0,0 +1,138 @@
+
+
+
+- dirname: 2025-01-23-19-14-48--r1-architect-sonnet
+  test_cases: 225
+  model: R1+Sonnet
+  edit_format: architect
+  commit_hash: 05a77c7
+  editor_model: claude-3-5-sonnet-20241022
+  editor_edit_format: editor-diff
+  pass_rate_1: 27.1
+  pass_rate_2: 64.0
+  pass_num_1: 61
+  pass_num_2: 144
+  percent_cases_well_formed: 100.0
+  error_outputs: 2
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 392
+  lazy_comments: 6
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 5
+  total_tests: 225
+  command: aider --architect --model deepseek/deepseek-reasoner --editor-model claude-3-5-sonnet-20241022
+  date: 2025-01-23
+  versions: 0.72.3.dev
+  seconds_per_case: 251.6
+  total_cost: 13.2933
+
+- dirname: 2025-01-20-19-11-38--ds-turns-upd-cur-msgs-fix-with-summarizer
+  test_cases: 225
+  model: R1
+  edit_format: diff
+  commit_hash: 5650697-dirty
+  pass_rate_1: 26.7
+  pass_rate_2: 56.9
+  pass_num_1: 60
+  pass_num_2: 128
+  percent_cases_well_formed: 96.9
+  error_outputs: 8
+  num_malformed_responses: 7
+  num_with_malformed_responses: 7
+  user_asks: 15
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 5
+  total_tests: 225
+  command: aider --model deepseek/deepseek-reasoner
+  date: 2025-01-20
+  versions: 0.71.2.dev
+  seconds_per_case: 113.7
+  total_cost: 5.4193
+
+
+- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
+  test_cases: 224
+  model: o1
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 23.7
+  pass_rate_2: 61.7
+  pass_num_1: 53
+  pass_num_2: 139
+  percent_cases_well_formed: 91.5
+  error_outputs: 25
+  num_malformed_responses: 24
+  num_with_malformed_responses: 19
+  user_asks: 16
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  total_tests: 225
+  command: aider --model openrouter/openai/o1
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 133.2
+  total_cost: 186.4958
+
+
+- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
+  test_cases: 225
+  model: DeepSeek V3
+  edit_format: diff
+  commit_hash: 0a23c4a-dirty
+  pass_rate_1: 22.7
+  pass_rate_2: 48.4
+  pass_num_1: 51
+  pass_num_2: 109
+  percent_cases_well_formed: 98.7
+  error_outputs: 7
+  num_malformed_responses: 7
+  num_with_malformed_responses: 3
+  user_asks: 19
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 8
+  total_tests: 225
+  command: aider --model deepseek/deepseek-chat
+  date: 2024-12-25
+  versions: 0.69.2.dev
+  seconds_per_case: 34.8
+  total_cost: 0.3369
+
+
+
+- dirname: 2025-01-17-19-44-33--sonnet-baseline-jan-17
+  test_cases: 225
+  model: Sonnet
+  edit_format: diff
+  commit_hash: 6451d59
+  pass_rate_1: 22.2
+  pass_rate_2: 51.6
+  pass_num_1: 50
+  pass_num_2: 116
+  percent_cases_well_formed: 99.6
+  error_outputs: 2
+  num_malformed_responses: 1
+  num_with_malformed_responses: 1
+  user_asks: 11
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 8
+  total_tests: 225
+  command: aider --model claude-3-5-sonnet-20241022
+  date: 2025-01-17
+  versions: 0.71.2.dev
+  seconds_per_case: 21.4
+  total_cost: 14.4063
--- a/aider/website/_posts/2025-01-23-r1.md
+++ b/aider/website/_posts/2025-01-23-r1.md
@ -1,5 +1,5 @@
 ---
-title: r1 tops aider's polyglot leaderboard
+title: R1+Sonnet set SOTA on aider's polyglot benchmark
 #excerpt: o1 scores the top result on aider's new multi-language, more challenging coding benchmark.
 #highlight_image: /assets/o1-polyglot.jpg
 draft: false
@ -9,12 +9,24 @@ nav_exclude: true
 <p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
 {% endif %}

-# r1 tops aider's polyglot leaderboard
+# R1+Sonnet set SOTA on aider's polyglot benchmark
 {: .no_toc }

 <canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas>

+Aider supports using a pair of models for coding:

+- An Architect model is asked to describe how to solve the coding problem. Thinking/reasoning models often work well in this role.
+- An Editor model is given the Architect's solution and asked to produce specific code editing instructions to apply those changes to existing source files.
+
+**R1 as architect with Sonnet as editor has set a new SOTA of 64.0%** on the 
+[aider polyglot benchmark](/2024/12/21/polyglot.html).
+They achieve this at **14X lower cost** than the previous o1 SOTA result.
+
+Using o1 or R1 as architect with various other editor models didn't produce significantly
+better results than using them alone.
+This is in contrast to the first wave of thinking models like o1-preview and o1-mini,
+which improved when paired with many different editor models.


 ## Results