mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-02 02:34:59 +00:00
copy
This commit is contained in:
parent
43dd9ef8a5
commit
8776830306
3 changed files with 241 additions and 28 deletions
|
@ -1412,27 +1412,27 @@
|
||||||
use_repo_map: true
|
use_repo_map: true
|
||||||
weak_model_name: openrouter/google/gemini-2.0-flash-001
|
weak_model_name: openrouter/google/gemini-2.0-flash-001
|
||||||
|
|
||||||
- name: openrouter/qwen/qwen3-235b-a22b
|
#- name: openrouter/qwen/qwen3-235b-a22b
|
||||||
system_prompt_prefix: "/no_think"
|
# system_prompt_prefix: "/no_think"
|
||||||
use_temperature: 0.7
|
# use_temperature: 0.7
|
||||||
extra_params:
|
# extra_params:
|
||||||
max_tokens: 24000
|
# max_tokens: 24000
|
||||||
top_p: 0.8
|
# top_p: 0.8
|
||||||
top_k: 20
|
# top_k: 20
|
||||||
min_p: 0.0
|
# min_p: 0.0
|
||||||
temperature: 0.7
|
# temperature: 0.7
|
||||||
extra_body:
|
# extra_body:
|
||||||
provider:
|
# provider:
|
||||||
order: ["Together"]
|
# order: ["Together"]
|
||||||
|
|
||||||
- name: together_ai/Qwen/Qwen3-235B-A22B-fp8-tput
|
#- name: together_ai/Qwen/Qwen3-235B-A22B-fp8-tput
|
||||||
system_prompt_prefix: "/no_think"
|
# system_prompt_prefix: "/no_think"
|
||||||
use_temperature: 0.7
|
# use_temperature: 0.7
|
||||||
reasoning_tag: think
|
# reasoning_tag: think
|
||||||
extra_params:
|
# extra_params:
|
||||||
max_tokens: 24000
|
# max_tokens: 24000
|
||||||
top_p: 0.8
|
# top_p: 0.8
|
||||||
top_k: 20
|
# top_k: 20
|
||||||
min_p: 0.0
|
# min_p: 0.0
|
||||||
temperature: 0.7
|
# temperature: 0.7
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
- dirname: 2025-05-08-03-20-24--qwen3-32b-default
|
- dirname: 2025-05-08-03-20-24--qwen3-32b-default
|
||||||
test_cases: 225
|
test_cases: 225
|
||||||
model: Qwen3 32B
|
model: Qwen3 32B on OpenRouter, all providers, default settings (thinking)
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: aaacee5-dirty, aeaf259
|
commit_hash: aaacee5-dirty, aeaf259
|
||||||
pass_rate_1: 14.2
|
pass_rate_1: 14.2
|
||||||
|
@ -28,7 +28,7 @@
|
||||||
|
|
||||||
- dirname: 2025-05-08-03-22-37--qwen3-235b-defaults
|
- dirname: 2025-05-08-03-22-37--qwen3-235b-defaults
|
||||||
test_cases: 225
|
test_cases: 225
|
||||||
model: Qwen3 235B A22B
|
model: Qwen3 235B A22B on OpenRouter, all providers, default settings (thinking)
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: aaacee5-dirty
|
commit_hash: aaacee5-dirty
|
||||||
pass_rate_1: 17.3
|
pass_rate_1: 17.3
|
||||||
|
@ -53,3 +53,137 @@
|
||||||
versions: 0.82.4.dev
|
versions: 0.82.4.dev
|
||||||
seconds_per_case: 428.1
|
seconds_per_case: 428.1
|
||||||
total_cost: 1.8037
|
total_cost: 1.8037
|
||||||
|
|
||||||
|
|
||||||
|
- dirname: 2025-05-08-17-39-14--qwen3-235b-or-together-only
|
||||||
|
test_cases: 225
|
||||||
|
model: Qwen3 235B A22B on OpenRouter only TogetherAI, recommended /no_think settings
|
||||||
|
edit_format: diff
|
||||||
|
commit_hash: 328584e
|
||||||
|
pass_rate_1: 28.0
|
||||||
|
pass_rate_2: 54.7
|
||||||
|
pass_num_1: 63
|
||||||
|
pass_num_2: 123
|
||||||
|
percent_cases_well_formed: 90.7
|
||||||
|
error_outputs: 39
|
||||||
|
num_malformed_responses: 32
|
||||||
|
num_with_malformed_responses: 21
|
||||||
|
user_asks: 106
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
prompt_tokens: 2816606
|
||||||
|
completion_tokens: 362346
|
||||||
|
test_timeouts: 2
|
||||||
|
total_tests: 225
|
||||||
|
command: aider --model openrouter/qwen/qwen3-235b-a22b
|
||||||
|
date: 2025-05-08
|
||||||
|
versions: 0.82.4.dev
|
||||||
|
seconds_per_case: 77.2
|
||||||
|
total_cost: 0.6399
|
||||||
|
|
||||||
|
|
||||||
|
- dirname: 2025-04-30-04-49-37--Qwen3-235B-A22B-whole-nothink
|
||||||
|
test_cases: 225
|
||||||
|
model: Qwen3-235B-A22B with VLLM, bfloat16, recommended /no_think settings
|
||||||
|
edit_format: whole
|
||||||
|
commit_hash: 0c383df-dirty
|
||||||
|
pass_rate_1: 28.0
|
||||||
|
pass_rate_2: 65.3
|
||||||
|
pass_num_1: 63
|
||||||
|
pass_num_2: 147
|
||||||
|
percent_cases_well_formed: 100.0
|
||||||
|
error_outputs: 3
|
||||||
|
num_malformed_responses: 0
|
||||||
|
num_with_malformed_responses: 0
|
||||||
|
user_asks: 166
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 3
|
||||||
|
test_timeouts: 0
|
||||||
|
total_tests: 225
|
||||||
|
command: aider --model openai/Qwen3-235B-A22B
|
||||||
|
date: 2025-04-30
|
||||||
|
versions: 0.81.4.dev
|
||||||
|
seconds_per_case: 166.0
|
||||||
|
total_cost: 0.0000
|
||||||
|
|
||||||
|
- dirname: 2025-04-30-04-49-50--Qwen3-235B-A22B-diff-nothink
|
||||||
|
test_cases: 225
|
||||||
|
model: Qwen3-235B-A22B with VLLM, bfloat16, recommended /no_think settings
|
||||||
|
edit_format: diff
|
||||||
|
commit_hash: 0c383df-dirty
|
||||||
|
pass_rate_1: 29.8
|
||||||
|
pass_rate_2: 61.3
|
||||||
|
pass_num_1: 67
|
||||||
|
pass_num_2: 138
|
||||||
|
percent_cases_well_formed: 94.7
|
||||||
|
error_outputs: 25
|
||||||
|
num_malformed_responses: 25
|
||||||
|
num_with_malformed_responses: 12
|
||||||
|
user_asks: 97
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 2
|
||||||
|
total_tests: 225
|
||||||
|
command: aider --model openai/Qwen3-235B-A22B
|
||||||
|
date: 2025-04-30
|
||||||
|
versions: 0.81.4.dev
|
||||||
|
seconds_per_case: 158.2
|
||||||
|
total_cost: 0.0000
|
||||||
|
|
||||||
|
- dirname: 2025-04-30-04-08-41--Qwen3-32B-whole-nothink
|
||||||
|
test_cases: 225
|
||||||
|
model: Qwen3-32B with VLLM, bfloat16, recommended /no_think settings
|
||||||
|
edit_format: whole
|
||||||
|
commit_hash: 0c383df-dirty
|
||||||
|
pass_rate_1: 20.4
|
||||||
|
pass_rate_2: 45.8
|
||||||
|
pass_num_1: 46
|
||||||
|
pass_num_2: 103
|
||||||
|
percent_cases_well_formed: 100.0
|
||||||
|
error_outputs: 3
|
||||||
|
num_malformed_responses: 0
|
||||||
|
num_with_malformed_responses: 0
|
||||||
|
user_asks: 94
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 3
|
||||||
|
test_timeouts: 5
|
||||||
|
total_tests: 225
|
||||||
|
command: aider --model openai/Qwen3-32B
|
||||||
|
date: 2025-04-30
|
||||||
|
versions: 0.81.4.dev
|
||||||
|
seconds_per_case: 48.1
|
||||||
|
total_cost: 0.0000
|
||||||
|
|
||||||
|
- dirname: 2025-04-30-04-08-51--Qwen3-32B-diff-nothink
|
||||||
|
test_cases: 225
|
||||||
|
model: Qwen3-32B with VLLM, bfloat16, recommended /no_think settings
|
||||||
|
edit_format: diff
|
||||||
|
commit_hash: 0c383df-dirty
|
||||||
|
pass_rate_1: 20.4
|
||||||
|
pass_rate_2: 41.3
|
||||||
|
pass_num_1: 46
|
||||||
|
pass_num_2: 93
|
||||||
|
percent_cases_well_formed: 94.2
|
||||||
|
error_outputs: 17
|
||||||
|
num_malformed_responses: 14
|
||||||
|
num_with_malformed_responses: 13
|
||||||
|
user_asks: 83
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 3
|
||||||
|
test_timeouts: 4
|
||||||
|
total_tests: 225
|
||||||
|
command: aider --model openai/Qwen3-32B
|
||||||
|
date: 2025-04-30
|
||||||
|
versions: 0.81.4.dev
|
||||||
|
seconds_per_case: 59.4
|
||||||
|
total_cost: 0.0000
|
|
@ -1,13 +1,29 @@
|
||||||
---
|
---
|
||||||
layout: post
|
layout: post
|
||||||
title: Qwen3 Benchmark Results
|
title: Qwen3 benchmark results
|
||||||
excerpt: "Benchmark results for Qwen3 models using the Aider polyglot coding benchmark."
|
excerpt: "Benchmark results for Qwen3 models using the Aider polyglot coding benchmark."
|
||||||
date: 2025-05-08
|
date: 2025-05-08
|
||||||
---
|
---
|
||||||
|
|
||||||
You can add some introductory text for your blog post here.
|
# Qwen3 results on the aider polyglot benchmark
|
||||||
|
|
||||||
<h2 id="leaderboard-title">Qwen3 polyglot coding leaderboard</h2>
|
As [previously discussed when Qwen2.5 was released](/2024/11/21/quantization.html),
|
||||||
|
details matter when working with open source models for AI coding.
|
||||||
|
Proprietary models are served by their creators or trusted providers with stable inference settings.
|
||||||
|
Open source models are wonderful because anyone can serve them,
|
||||||
|
but API providers can use very different inference settings, quantizations, etc.
|
||||||
|
|
||||||
|
Below is a collection of aider polyglot benchmark results for the new Qwen3 models.
|
||||||
|
Results are presented with various settings against various API providers,
|
||||||
|
with the hope of showcasing the strengths of these models and their providers.
|
||||||
|
|
||||||
|
{: .note }
|
||||||
|
This article is being updated as new results become available.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<h2 id="leaderboard-title">Qwen3 results on the aider polyglot benchmark</h2>
|
||||||
|
|
||||||
<div id="controls-container" style="display: flex; align-items: center; width: 100%; max-width: 800px; margin: 10px auto; gap: 10px; box-sizing: border-box; padding: 0 5px; position: relative;">
|
<div id="controls-container" style="display: flex; align-items: center; width: 100%; max-width: 800px; margin: 10px auto; gap: 10px; box-sizing: border-box; padding: 0 5px; position: relative;">
|
||||||
<input type="text" id="editSearchInput" placeholder="Search..." style="flex-grow: 1; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
|
<input type="text" id="editSearchInput" placeholder="Search..." style="flex-grow: 1; padding: 8px; border: 1px solid #ddd; border-radius: 4px;">
|
||||||
|
@ -252,6 +268,69 @@ You can add some introductory text for your blog post here.
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
const LEADERBOARD_CUSTOM_TITLE = "Aider polyglot coding benchmark results (selected)";
|
const LEADERBOARD_CUSTOM_TITLE = "Qwen3 results on the aider polyglot benchmark";
|
||||||
{% include leaderboard_table.js %}
|
{% include leaderboard_table.js %}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
|
||||||
|
## OpenRouter only TogetherAI, recommended /no_think settings
|
||||||
|
|
||||||
|
These results were obtained with the
|
||||||
|
[recommended](https://huggingface.co/Qwen/Qwen3-235B-A22B#best-practices)
|
||||||
|
non-thinking model settings in `.aider.model.settings.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: openrouter/qwen/qwen3-235b-a22b
|
||||||
|
system_prompt_prefix: "/no_think"
|
||||||
|
use_temperature: 0.7
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 24000
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 20
|
||||||
|
min_p: 0.0
|
||||||
|
temperature: 0.7
|
||||||
|
extra_body:
|
||||||
|
provider:
|
||||||
|
order: ["Together"]
|
||||||
|
```
|
||||||
|
|
||||||
|
And then running aider:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
aider --model openrouter/qwen/qwen3-235b-a22b
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## OpenRouter, all providers, default settings (thinking)
|
||||||
|
|
||||||
|
These results were obtained by simply running aider as shown below, without any model-specific settings.
|
||||||
|
This should have enabled thinking, assuming upstream API providers honor that convention for Qwen3.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
aider --model openrouter/qwen/qwen3-xxx
|
||||||
|
```
|
||||||
|
|
||||||
|
## VLLM, bfloat16, recommended /no_think
|
||||||
|
|
||||||
|
These [benchmark results were obtained by GitHub user AlongWY](https://github.com/Aider-AI/aider/pull/3908)
|
||||||
|
with the
|
||||||
|
[recommended](https://huggingface.co/Qwen/Qwen3-235B-A22B#best-practices)
|
||||||
|
non-thinking model settings in `.aider.model.settings.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: openai/<model-name>
|
||||||
|
system_prompt_prefix: "/no_think"
|
||||||
|
use_temperature: 0.7
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 24000
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 20
|
||||||
|
min_p: 0.0
|
||||||
|
temperature: 0.7
|
||||||
|
```
|
||||||
|
|
||||||
|
And then running aider:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
aider --model openai/<model-name> --openai-api-base <url>
|
||||||
|
```
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue