diff --git a/aider/models.py b/aider/models.py index 97575993b..6e91d6aab 100644 --- a/aider/models.py +++ b/aider/models.py @@ -841,7 +841,7 @@ MODEL_SETTINGS = [ use_repo_map=True, streaming=False, use_temperature=False, - # extra_params=dict(extra_body=dict(reasoning_effort="high")), + extra_params=dict(extra_body=dict(reasoning_effort="high")), ), ModelSettings( "openrouter/qwen/qwen-2.5-coder-32b-instruct", diff --git a/aider/website/_data/r1_architect.yml b/aider/website/_data/r1_architect.yml new file mode 100644 index 000000000..c036c7de8 --- /dev/null +++ b/aider/website/_data/r1_architect.yml @@ -0,0 +1,138 @@ + + + +- dirname: 2025-01-23-19-14-48--r1-architect-sonnet + test_cases: 225 + model: R1+Sonnet + edit_format: architect + commit_hash: 05a77c7 + editor_model: claude-3-5-sonnet-20241022 + editor_edit_format: editor-diff + pass_rate_1: 27.1 + pass_rate_2: 64.0 + pass_num_1: 61 + pass_num_2: 144 + percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 392 + lazy_comments: 6 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 5 + total_tests: 225 + command: aider --model deepseek/deepseek-reasoner + date: 2025-01-23 + versions: 0.72.3.dev + seconds_per_case: 251.6 + total_cost: 13.2933 + +- dirname: 2025-01-20-19-11-38--ds-turns-upd-cur-msgs-fix-with-summarizer + test_cases: 225 + model: R1 + edit_format: diff + commit_hash: 5650697-dirty + pass_rate_1: 26.7 + pass_rate_2: 56.9 + pass_num_1: 60 + pass_num_2: 128 + percent_cases_well_formed: 96.9 + error_outputs: 8 + num_malformed_responses: 7 + num_with_malformed_responses: 7 + user_asks: 15 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 5 + total_tests: 225 + command: aider --model deepseek/deepseek-reasoner + date: 2025-01-20 + versions: 0.71.2.dev + seconds_per_case: 113.7 + total_cost: 5.4193 + + +- dirname: 
2024-12-21-19-23-03--polyglot-o1-hard-diff + test_cases: 224 + model: o1 + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 23.7 + pass_rate_2: 61.7 + pass_num_1: 53 + pass_num_2: 139 + percent_cases_well_formed: 91.5 + error_outputs: 25 + num_malformed_responses: 24 + num_with_malformed_responses: 19 + user_asks: 16 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + total_tests: 225 + command: aider --model openrouter/openai/o1 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 133.2 + total_cost: 186.4958 + + +- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2 + test_cases: 225 + model: DeepSeek V3 + edit_format: diff + commit_hash: 0a23c4a-dirty + pass_rate_1: 22.7 + pass_rate_2: 48.4 + pass_num_1: 51 + pass_num_2: 109 + percent_cases_well_formed: 98.7 + error_outputs: 7 + num_malformed_responses: 7 + num_with_malformed_responses: 3 + user_asks: 19 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 8 + total_tests: 225 + command: aider --model deepseek/deepseek-chat + date: 2024-12-25 + versions: 0.69.2.dev + seconds_per_case: 34.8 + total_cost: 0.3369 + + + +- dirname: 2025-01-17-19-44-33--sonnet-baseline-jan-17 + test_cases: 225 + model: Sonnet + edit_format: diff + commit_hash: 6451d59 + pass_rate_1: 22.2 + pass_rate_2: 51.6 + pass_num_1: 50 + pass_num_2: 116 + percent_cases_well_formed: 99.6 + error_outputs: 2 + num_malformed_responses: 1 + num_with_malformed_responses: 1 + user_asks: 11 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 8 + total_tests: 225 + command: aider --model claude-3-5-sonnet-20241022 + date: 2025-01-17 + versions: 0.71.2.dev + seconds_per_case: 21.4 + total_cost: 14.4063 diff --git a/aider/website/_posts/2025-01-23-r1.md b/aider/website/_posts/2025-01-23-r1.md index 1198cc8c5..264887e33 100644 --- 
a/aider/website/_posts/2025-01-23-r1.md +++ b/aider/website/_posts/2025-01-23-r1.md @@ -1,5 +1,5 @@ --- -title: r1 tops aider's polyglot leaderboard +title: R1+Sonnet set SOTA on aider's polyglot benchmark #excerpt: o1 scores the top result on aider's new multi-language, more challenging coding benchmark. #highlight_image: /assets/o1-polyglot.jpg draft: false @@ -9,12 +9,24 @@ nav_exclude: true

{{ page.date | date: "%B %d, %Y" }}

{% endif %} -# r1 tops aider's polyglot leaderboard +# R1+Sonnet set SOTA on aider's polyglot benchmark {: .no_toc } +Aider supports using a pair of models for coding: +- An Architect model is asked to describe how to solve the coding problem. Thinking/reasoning models often work well in this role. +- An Editor model is given the Architect's solution and asked to produce specific code editing instructions to apply that solution to existing source files. + +**R1 as architect with Sonnet as editor has set a new SOTA of 64.0%** on the +[aider polyglot benchmark](/2024/12/21/polyglot.html). +They achieve this at **14X lower cost** than the previous o1 SOTA result. + +Using o1 or R1 as architect with various other editor models didn't produce significantly +better results than using them alone. +This is in contrast to the first wave of thinking models like o1-preview and o1-mini, +which improved when paired with many different editor models. ## Results