From 87768303068fdeac4d79a31471aabd104f0cf613 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Thu, 8 May 2025 11:32:24 -0700 Subject: [PATCH] copy --- aider/resources/model-settings.yml | 44 +++---- aider/website/_data/qwen3_leaderboard.yml | 138 +++++++++++++++++++++- aider/website/_posts/2025-05-08-qwen3.md | 87 +++++++++++++- 3 files changed, 241 insertions(+), 28 deletions(-) diff --git a/aider/resources/model-settings.yml b/aider/resources/model-settings.yml index c1a3ea833..338521179 100644 --- a/aider/resources/model-settings.yml +++ b/aider/resources/model-settings.yml @@ -1412,27 +1412,27 @@ use_repo_map: true weak_model_name: openrouter/google/gemini-2.0-flash-001 -- name: openrouter/qwen/qwen3-235b-a22b - system_prompt_prefix: "/no_think" - use_temperature: 0.7 - extra_params: - max_tokens: 24000 - top_p: 0.8 - top_k: 20 - min_p: 0.0 - temperature: 0.7 - extra_body: - provider: - order: ["Together"] +#- name: openrouter/qwen/qwen3-235b-a22b +# system_prompt_prefix: "/no_think" +# use_temperature: 0.7 +# extra_params: +# max_tokens: 24000 +# top_p: 0.8 +# top_k: 20 +# min_p: 0.0 +# temperature: 0.7 +# extra_body: +# provider: +# order: ["Together"] -- name: together_ai/Qwen/Qwen3-235B-A22B-fp8-tput - system_prompt_prefix: "/no_think" - use_temperature: 0.7 - reasoning_tag: think - extra_params: - max_tokens: 24000 - top_p: 0.8 - top_k: 20 - min_p: 0.0 - temperature: 0.7 +#- name: together_ai/Qwen/Qwen3-235B-A22B-fp8-tput +# system_prompt_prefix: "/no_think" +# use_temperature: 0.7 +# reasoning_tag: think +# extra_params: +# max_tokens: 24000 +# top_p: 0.8 +# top_k: 20 +# min_p: 0.0 +# temperature: 0.7 \ No newline at end of file diff --git a/aider/website/_data/qwen3_leaderboard.yml b/aider/website/_data/qwen3_leaderboard.yml index 07e378674..faa20fab7 100644 --- a/aider/website/_data/qwen3_leaderboard.yml +++ b/aider/website/_data/qwen3_leaderboard.yml @@ -1,6 +1,6 @@ - dirname: 2025-05-08-03-20-24--qwen3-32b-default test_cases: 225 - model: Qwen3 32B + model: Qwen3 32B on OpenRouter, all providers, default settings (thinking) edit_format: diff commit_hash: aaacee5-dirty, aeaf259 pass_rate_1: 14.2 @@ -28,7 +28,7 @@ - dirname: 2025-05-08-03-22-37--qwen3-235b-defaults test_cases: 225 - model: Qwen3 235B A22B + model: Qwen3 235B A22B on OpenRouter, all providers, default settings (thinking) edit_format: diff commit_hash: aaacee5-dirty pass_rate_1: 17.3 @@ -53,3 +53,137 @@ versions: 0.82.4.dev seconds_per_case: 428.1 total_cost: 1.8037 + + +- dirname: 2025-05-08-17-39-14--qwen3-235b-or-together-only + test_cases: 225 + model: Qwen3 235B A22B on OpenRouter only TogetherAI, recommended /no_think settings + edit_format: diff + commit_hash: 328584e + pass_rate_1: 28.0 + pass_rate_2: 54.7 + pass_num_1: 63 + pass_num_2: 123 + percent_cases_well_formed: 90.7 + error_outputs: 39 + num_malformed_responses: 32 + num_with_malformed_responses: 21 + user_asks: 106 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + prompt_tokens: 2816606 + completion_tokens: 362346 + test_timeouts: 2 + total_tests: 225 + command: aider --model openrouter/qwen/qwen3-235b-a22b + date: 2025-05-08 + versions: 0.82.4.dev + seconds_per_case: 77.2 + total_cost: 0.6399 + + +- dirname: 2025-04-30-04-49-37--Qwen3-235B-A22B-whole-nothink + test_cases: 225 + model: Qwen3-235B-A22B with VLLM, bfloat16, recommended /no_think settings + edit_format: whole + commit_hash: 0c383df-dirty + pass_rate_1: 28.0 + pass_rate_2: 65.3 + pass_num_1: 63 + pass_num_2: 147 + 
percent_cases_well_formed: 100.0 + error_outputs: 3 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 166 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 3 + test_timeouts: 0 + total_tests: 225 + command: aider --model openai/Qwen3-235B-A22B + date: 2025-04-30 + versions: 0.81.4.dev + seconds_per_case: 166.0 + total_cost: 0.0000 + +- dirname: 2025-04-30-04-49-50--Qwen3-235B-A22B-diff-nothink + test_cases: 225 + model: Qwen3-235B-A22B with VLLM, bfloat16, recommended /no_think settings + edit_format: diff + commit_hash: 0c383df-dirty + pass_rate_1: 29.8 + pass_rate_2: 61.3 + pass_num_1: 67 + pass_num_2: 138 + percent_cases_well_formed: 94.7 + error_outputs: 25 + num_malformed_responses: 25 + num_with_malformed_responses: 12 + user_asks: 97 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + total_tests: 225 + command: aider --model openai/Qwen3-235B-A22B + date: 2025-04-30 + versions: 0.81.4.dev + seconds_per_case: 158.2 + total_cost: 0.0000 + +- dirname: 2025-04-30-04-08-41--Qwen3-32B-whole-nothink + test_cases: 225 + model: Qwen3-32B with VLLM, bfloat16, recommended /no_think settings + edit_format: whole + commit_hash: 0c383df-dirty + pass_rate_1: 20.4 + pass_rate_2: 45.8 + pass_num_1: 46 + pass_num_2: 103 + percent_cases_well_formed: 100.0 + error_outputs: 3 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 94 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 3 + test_timeouts: 5 + total_tests: 225 + command: aider --model openai/Qwen3-32B + date: 2025-04-30 + versions: 0.81.4.dev + seconds_per_case: 48.1 + total_cost: 0.0000 + +- dirname: 2025-04-30-04-08-51--Qwen3-32B-diff-nothink + test_cases: 225 + model: Qwen3-32B with VLLM, bfloat16, recommended /no_think settings + edit_format: diff + commit_hash: 0c383df-dirty + pass_rate_1: 20.4 + pass_rate_2: 41.3 + pass_num_1: 46 + pass_num_2: 93 + percent_cases_well_formed: 94.2 + error_outputs: 17 + num_malformed_responses: 14 + num_with_malformed_responses: 13 + user_asks: 83 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 3 + test_timeouts: 4 + total_tests: 225 + command: aider --model openai/Qwen3-32B + date: 2025-04-30 + versions: 0.81.4.dev + seconds_per_case: 59.4 + total_cost: 0.0000 \ No newline at end of file diff --git a/aider/website/_posts/2025-05-08-qwen3.md b/aider/website/_posts/2025-05-08-qwen3.md index 51f6d4292..4923f5330 100644 --- a/aider/website/_posts/2025-05-08-qwen3.md +++ b/aider/website/_posts/2025-05-08-qwen3.md @@ -1,13 +1,29 @@ --- layout: post -title: Qwen3 Benchmark Results +title: Qwen3 benchmark results excerpt: "Benchmark results for Qwen3 models using the Aider polyglot coding benchmark." date: 2025-05-08 --- -You can add some introductory text for your blog post here. +# Qwen3 results on the aider polyglot benchmark -

[removed chart block; title: "Qwen3 polyglot coding leaderboard" (chart markup not shown)]

+As [previously discussed when Qwen2.5 was released](/2024/11/21/quantization.html),
+details matter when working with open source models for AI coding.
+Proprietary models are served by their creators or trusted providers with stable inference settings.
+Open source models are wonderful because anyone can serve them,
+but API providers can use very different inference settings, quantizations, etc.
+
+Below is a collection of aider polyglot benchmark results for the new Qwen3 models.
+Results are presented using a variety of settings and API providers,
+with the hope of showcasing the strengths of these models and their providers.
+
+{: .note }
+This article is being updated as new results become available.
+
+
+
+

[added chart block; title: "Qwen3 results on the aider polyglot benchmark" (chart markup not shown)]

@@ -252,6 +268,69 @@ You can add some introductory text for your blog post here.
+
+
+## OpenRouter (TogetherAI only), recommended /no_think settings
+
+These results were obtained with the
+[recommended](https://huggingface.co/Qwen/Qwen3-235B-A22B#best-practices)
+non-thinking model settings in `.aider.model.settings.yml`:
+
+```yaml
+- name: openrouter/qwen/qwen3-235b-a22b
+  system_prompt_prefix: "/no_think"
+  use_temperature: 0.7
+  extra_params:
+    max_tokens: 24000
+    top_p: 0.8
+    top_k: 20
+    min_p: 0.0
+    temperature: 0.7
+    extra_body:
+      provider:
+        order: ["Together"]
+```
+
+And then running aider:
+
+```bash
+aider --model openrouter/qwen/qwen3-235b-a22b
+```
+
+
+## OpenRouter, all providers, default settings (thinking)
+
+These results were obtained by simply running aider as shown below, without any model-specific settings.
+This should have enabled thinking, assuming upstream API providers honor that convention for Qwen3.
+
+```bash
+aider --model openrouter/qwen/qwen3-xxx
+```
+
+## VLLM, bfloat16, recommended /no_think settings
+
+These [benchmark results were obtained by GitHub user AlongWY](https://github.com/Aider-AI/aider/pull/3908)
+with the
+[recommended](https://huggingface.co/Qwen/Qwen3-235B-A22B#best-practices)
+non-thinking model settings in `.aider.model.settings.yml`:
+
+```yaml
+- name: openai/
+  system_prompt_prefix: "/no_think"
+  use_temperature: 0.7
+  extra_params:
+    max_tokens: 24000
+    top_p: 0.8
+    top_k: 20
+    min_p: 0.0
+    temperature: 0.7
+```
+
+And then running aider:
+
+```bash
+aider --model openai/ --openai-api-base 
+```
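+
+For reference, the sketch below shows one way such an OpenAI-compatible endpoint
+might be served locally with VLLM. The model path, served model name, dtype, port,
+and context length are illustrative assumptions, not the exact configuration used
+for the benchmark runs above.
+
+```bash
+# Illustrative sketch: serve Qwen3-32B behind vLLM's OpenAI-compatible server.
+# The served name, dtype, port, and max context length here are assumptions.
+vllm serve Qwen/Qwen3-32B \
+    --served-model-name Qwen3-32B \
+    --dtype bfloat16 \
+    --port 8000 \
+    --max-model-len 32768
+
+# Then point aider at the local endpoint (hypothetical URL):
+aider --model openai/Qwen3-32B --openai-api-base http://localhost:8000/v1
+```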