This commit is contained in:
Paul Gauthier 2025-01-24 08:22:13 -08:00
parent 9d6a692054
commit d7bb80468b
3 changed files with 153 additions and 3 deletions

View file

@ -841,7 +841,7 @@ MODEL_SETTINGS = [
use_repo_map=True,
streaming=False,
use_temperature=False,
# extra_params=dict(extra_body=dict(reasoning_effort="high")),
extra_params=dict(extra_body=dict(reasoning_effort="high")),
),
ModelSettings(
"openrouter/qwen/qwen-2.5-coder-32b-instruct",

View file

@ -0,0 +1,138 @@
- dirname: 2025-01-23-19-14-48--r1-architect-sonnet
test_cases: 225
model: R1+Sonnet
edit_format: architect
commit_hash: 05a77c7
editor_model: claude-3-5-sonnet-20241022
editor_edit_format: editor-diff
pass_rate_1: 27.1
pass_rate_2: 64.0
pass_num_1: 61
pass_num_2: 144
percent_cases_well_formed: 100.0
error_outputs: 2
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 392
lazy_comments: 6
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 5
total_tests: 225
command: aider --architect --model deepseek/deepseek-reasoner --editor-model claude-3-5-sonnet-20241022
date: 2025-01-23
versions: 0.72.3.dev
seconds_per_case: 251.6
total_cost: 13.2933
- dirname: 2025-01-20-19-11-38--ds-turns-upd-cur-msgs-fix-with-summarizer
test_cases: 225
model: R1
edit_format: diff
commit_hash: 5650697-dirty
pass_rate_1: 26.7
pass_rate_2: 56.9
pass_num_1: 60
pass_num_2: 128
percent_cases_well_formed: 96.9
error_outputs: 8
num_malformed_responses: 7
num_with_malformed_responses: 7
user_asks: 15
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 5
total_tests: 225
command: aider --model deepseek/deepseek-reasoner
date: 2025-01-20
versions: 0.71.2.dev
seconds_per_case: 113.7
total_cost: 5.4193
- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
test_cases: 224
model: o1
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 23.7
pass_rate_2: 61.7
pass_num_1: 53
pass_num_2: 139
percent_cases_well_formed: 91.5
error_outputs: 25
num_malformed_responses: 24
num_with_malformed_responses: 19
user_asks: 16
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model openrouter/openai/o1
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 133.2
total_cost: 186.4958
- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
test_cases: 225
model: DeepSeek V3
edit_format: diff
commit_hash: 0a23c4a-dirty
pass_rate_1: 22.7
pass_rate_2: 48.4
pass_num_1: 51
pass_num_2: 109
percent_cases_well_formed: 98.7
error_outputs: 7
num_malformed_responses: 7
num_with_malformed_responses: 3
user_asks: 19
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 8
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2024-12-25
versions: 0.69.2.dev
seconds_per_case: 34.8
total_cost: 0.3369
- dirname: 2025-01-17-19-44-33--sonnet-baseline-jan-17
test_cases: 225
model: Sonnet
edit_format: diff
commit_hash: 6451d59
pass_rate_1: 22.2
pass_rate_2: 51.6
pass_num_1: 50
pass_num_2: 116
percent_cases_well_formed: 99.6
error_outputs: 2
num_malformed_responses: 1
num_with_malformed_responses: 1
user_asks: 11
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 8
total_tests: 225
command: aider --model claude-3-5-sonnet-20241022
date: 2025-01-17
versions: 0.71.2.dev
seconds_per_case: 21.4
total_cost: 14.4063

View file

@ -1,5 +1,5 @@
---
title: r1 tops aider's polyglot leaderboard
title: R1+Sonnet set SOTA on aider's polyglot benchmark
#excerpt: o1 scores the top result on aider's new multi-language, more challenging coding benchmark.
#highlight_image: /assets/o1-polyglot.jpg
draft: false
@ -9,12 +9,24 @@ nav_exclude: true
<p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
{% endif %}
# r1 tops aider's polyglot leaderboard
# R1+Sonnet set SOTA on aider's polyglot benchmark
{: .no_toc }
<canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas>
Aider supports using a pair of models for coding:
- An Architect model is asked to describe how to solve the coding problem. Thinking/reasoning models often work well in this role.
- An Editor model is given the Architect's solution and asked to produce specific code editing instructions to apply those changes to existing source files.
**R1 as architect with Sonnet as editor has set a new SOTA of 64.0%** on the
[aider polyglot benchmark](/2024/12/21/polyglot.html).
They achieve this at **14X lower cost** than the previous o1 SOTA result.
Using o1 or R1 as architect with various other editor models didn't produce significantly
better results than using o1 or R1 alone.
This is in contrast to the first wave of thinking models like o1-preview and o1-mini,
which improved when paired with many different editor models.
## Results