mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-29 08:44:59 +00:00
copy
This commit is contained in:
parent
9d6a692054
commit
d7bb80468b
3 changed files with 153 additions and 3 deletions
|
@ -841,7 +841,7 @@ MODEL_SETTINGS = [
|
|||
use_repo_map=True,
|
||||
streaming=False,
|
||||
use_temperature=False,
|
||||
# extra_params=dict(extra_body=dict(reasoning_effort="high")),
|
||||
extra_params=dict(extra_body=dict(reasoning_effort="high")),
|
||||
),
|
||||
ModelSettings(
|
||||
"openrouter/qwen/qwen-2.5-coder-32b-instruct",
|
||||
|
|
138
aider/website/_data/r1_architect.yml
Normal file
138
aider/website/_data/r1_architect.yml
Normal file
|
@ -0,0 +1,138 @@
|
|||
|
||||
|
||||
|
||||
- dirname: 2025-01-23-19-14-48--r1-architect-sonnet
|
||||
test_cases: 225
|
||||
model: R1+Sonnet
|
||||
edit_format: architect
|
||||
commit_hash: 05a77c7
|
||||
editor_model: claude-3-5-sonnet-20241022
|
||||
editor_edit_format: editor-diff
|
||||
pass_rate_1: 27.1
|
||||
pass_rate_2: 64.0
|
||||
pass_num_1: 61
|
||||
pass_num_2: 144
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 392
|
||||
lazy_comments: 6
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 5
|
||||
total_tests: 225
|
||||
command: aider --model deepseek/deepseek-reasoner
|
||||
date: 2025-01-23
|
||||
versions: 0.72.3.dev
|
||||
seconds_per_case: 251.6
|
||||
total_cost: 13.2933
|
||||
|
||||
- dirname: 2025-01-20-19-11-38--ds-turns-upd-cur-msgs-fix-with-summarizer
|
||||
test_cases: 225
|
||||
model: R1
|
||||
edit_format: diff
|
||||
commit_hash: 5650697-dirty
|
||||
pass_rate_1: 26.7
|
||||
pass_rate_2: 56.9
|
||||
pass_num_1: 60
|
||||
pass_num_2: 128
|
||||
percent_cases_well_formed: 96.9
|
||||
error_outputs: 8
|
||||
num_malformed_responses: 7
|
||||
num_with_malformed_responses: 7
|
||||
user_asks: 15
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 5
|
||||
total_tests: 225
|
||||
command: aider --model deepseek/deepseek-reasoner
|
||||
date: 2025-01-20
|
||||
versions: 0.71.2.dev
|
||||
seconds_per_case: 113.7
|
||||
total_cost: 5.4193
|
||||
|
||||
|
||||
- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
|
||||
test_cases: 224
|
||||
model: o1
|
||||
edit_format: diff
|
||||
commit_hash: a755079-dirty
|
||||
pass_rate_1: 23.7
|
||||
pass_rate_2: 61.7
|
||||
pass_num_1: 53
|
||||
pass_num_2: 139
|
||||
percent_cases_well_formed: 91.5
|
||||
error_outputs: 25
|
||||
num_malformed_responses: 24
|
||||
num_with_malformed_responses: 19
|
||||
user_asks: 16
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
total_tests: 225
|
||||
command: aider --model openrouter/openai/o1
|
||||
date: 2024-12-21
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 133.2
|
||||
total_cost: 186.4958
|
||||
|
||||
|
||||
- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
|
||||
test_cases: 225
|
||||
model: DeepSeek V3
|
||||
edit_format: diff
|
||||
commit_hash: 0a23c4a-dirty
|
||||
pass_rate_1: 22.7
|
||||
pass_rate_2: 48.4
|
||||
pass_num_1: 51
|
||||
pass_num_2: 109
|
||||
percent_cases_well_formed: 98.7
|
||||
error_outputs: 7
|
||||
num_malformed_responses: 7
|
||||
num_with_malformed_responses: 3
|
||||
user_asks: 19
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 8
|
||||
total_tests: 225
|
||||
command: aider --model deepseek/deepseek-chat
|
||||
date: 2024-12-25
|
||||
versions: 0.69.2.dev
|
||||
seconds_per_case: 34.8
|
||||
total_cost: 0.3369
|
||||
|
||||
|
||||
|
||||
- dirname: 2025-01-17-19-44-33--sonnet-baseline-jan-17
|
||||
test_cases: 225
|
||||
model: Sonnet
|
||||
edit_format: diff
|
||||
commit_hash: 6451d59
|
||||
pass_rate_1: 22.2
|
||||
pass_rate_2: 51.6
|
||||
pass_num_1: 50
|
||||
pass_num_2: 116
|
||||
percent_cases_well_formed: 99.6
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 1
|
||||
num_with_malformed_responses: 1
|
||||
user_asks: 11
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 8
|
||||
total_tests: 225
|
||||
command: aider --model claude-3-5-sonnet-20241022
|
||||
date: 2025-01-17
|
||||
versions: 0.71.2.dev
|
||||
seconds_per_case: 21.4
|
||||
total_cost: 14.4063
|
|
@ -1,5 +1,5 @@
|
|||
---
|
||||
title: r1 tops aider's polyglot leaderboard
|
||||
title: R1+Sonnet set SOTA on aider's polyglot benchmark
|
||||
#excerpt: o1 scores the top result on aider's new multi-language, more challenging coding benchmark.
|
||||
#highlight_image: /assets/o1-polyglot.jpg
|
||||
draft: false
|
||||
|
@ -9,12 +9,24 @@ nav_exclude: true
|
|||
<p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
|
||||
{% endif %}
|
||||
|
||||
# r1 tops aider's polyglot leaderboard
|
||||
# R1+Sonnet set SOTA on aider's polyglot benchmark
|
||||
{: .no_toc }
|
||||
|
||||
<canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas>
|
||||
|
||||
Aider supports using a pair of models for coding:
|
||||
|
||||
- An Architect model is asked to describe how to solve the coding problem. Thinking/reasoning models often work well in this role.
|
||||
- An Editor model is given the Architect's solution and asked to produce specific code editing instructions to apply those changes to existing source files.
|
||||
|
||||
**R1 as architect with Sonnet as editor has set a new SOTA of 64.0%** on the
|
||||
[aider polyglot benchmark](/2024/12/21/polyglot.html).
|
||||
They achieve this at **14X lower cost** than the previous o1 SOTA result.
|
||||
|
||||
Using o1 or R1 as architect with various other editor models didn't produce significantly
|
||||
better results than using them alone.
|
||||
This is in contrast to the first wave of thinking models like o1-preview and o1-mini,
|
||||
which improved when paired with many different editor models.
|
||||
|
||||
|
||||
## Results
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue