diff --git a/aider/website/_data/quant.yml b/aider/website/_data/quant.yml index 59b70c2e1..b852a1ea5 100644 --- a/aider/website/_data/quant.yml +++ b/aider/website/_data/quant.yml @@ -1,6 +1,6 @@ - dirname: 2024-11-09-11-09-15--Qwen2.5-Coder-32B-Instruct test_cases: 133 - model: HuggingFace weights via glhf.chat + model: HuggingFace BF16 via glhf.chat released: 2024-11-12 edit_format: diff commit_hash: ec9982a @@ -22,9 +22,32 @@ seconds_per_case: 22.5 total_cost: 0.0000 +- dirname: 2024-11-22-14-53-26--hyperbolic-qwen25coder32binstruct + test_cases: 133 + model: Hyperbolic Qwen2.5-Coder-32B-Instruct BF16 + edit_format: diff + commit_hash: f9ef161, 17aef7b-dirty + pass_rate_1: 57.9 + pass_rate_2: 69.2 + percent_cases_well_formed: 91.7 + error_outputs: 30 + num_malformed_responses: 29 + num_with_malformed_responses: 11 + user_asks: 9 + lazy_comments: 0 + syntax_errors: 4 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct --openai-api-base https://api.hyperbolic.xyz/v1/ + date: 2024-11-22 + versions: 0.64.2.dev + seconds_per_case: 33.2 + total_cost: 0.0000 + - dirname: 2024-11-20-15-17-37--qwen25-32b-or-diff test_cases: 133 - model: openrouter/qwen/qwen-2.5-coder-32b-instruct + model: openrouter/qwen/qwen-2.5-coder-32b-instruct (mixed quants) edit_format: diff commit_hash: e917424 pass_rate_1: 49.6 @@ -67,3 +90,4 @@ versions: 0.64.2.dev seconds_per_case: 86.7 total_cost: 0.0000 + diff --git a/aider/website/_posts/2024-11-21-quantization.md b/aider/website/_posts/2024-11-21-quantization.md index 18588e4df..087b6749c 100644 --- a/aider/website/_posts/2024-11-21-quantization.md +++ b/aider/website/_posts/2024-11-21-quantization.md @@ -24,6 +24,17 @@ and local model servers like Ollama. {% include quant-chart.js %} +The graph above compares 4 different versions of the Qwen 2.5 Coder 32B Instruct model, +served both locally and from cloud providers. 
+
+- The [HuggingFace BF16 weights](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) served via [glhf.chat](https://glhf.chat).
+- Hyperbolic Labs API for [qwen2-5-coder-32b-instruct](https://app.hyperbolic.xyz/models/qwen2-5-coder-32b-instruct), which is using BF16. This result is probably within the expected variance of the HF result.
+- The results from [OpenRouter's mix of providers](https://openrouter.ai/qwen/qwen-2.5-coder-32b-instruct/providers) which serve the model with different levels of quantization.
+- Ollama locally serving [qwen2.5-coder:32b-instruct-q4_K_M](https://ollama.com/library/qwen2.5-coder:32b-instruct-q4_K_M), which has `Q4_K_M` quantization.
+
+The best version of the model rivals GPT-4o, while the worst performer
+is more like GPT-3.5 Turbo level.