mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-25 06:44:59 +00:00
initial
This commit is contained in:
parent
7180cb049c
commit
1a9d4bfb1c
3 changed files with 257 additions and 8 deletions
123
aider/website/_data/qwq.yml
Normal file
123
aider/website/_data/qwq.yml
Normal file
|
@ -0,0 +1,123 @@
|
|||
|
||||
- dirname: 2024-11-28-21-38-50--architect-qwq-haiku-whole
|
||||
test_cases: 133
|
||||
model: QwQ + Haiku
|
||||
edit_format: architect
|
||||
commit_hash: e4a1d6f
|
||||
editor_model: claude-3-5-haiku-20241022
|
||||
editor_edit_format: editor-whole
|
||||
pass_rate_1: 54.1
|
||||
pass_rate_2: 71.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 4
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 196
|
||||
lazy_comments: 4
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model openrouter/qwen/qwq-32b-preview --editor-model claude-3-5-haiku-20241022 --edit-format editor-whole
|
||||
date: 2024-11-28
|
||||
versions: 0.65.2.dev
|
||||
seconds_per_case: 154.7
|
||||
total_cost: 1.4196
|
||||
|
||||
- dirname: 2024-11-28-19-24-35--architect-qwq-deepseek-whole
|
||||
test_cases: 133
|
||||
model: QwQ + DeepSeek V2.5
|
||||
edit_format: architect
|
||||
commit_hash: e4a1d6f
|
||||
editor_model: deepseek/deepseek-chat
|
||||
editor_edit_format: editor-whole
|
||||
pass_rate_1: 55.6
|
||||
pass_rate_2: 67.7
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 3
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 193
|
||||
lazy_comments: 2
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model openrouter/qwen/qwq-32b-preview --editor-model deepseek/deepseek-chat --edit-format editor-whole
|
||||
date: 2024-11-28
|
||||
versions: 0.65.2.dev
|
||||
seconds_per_case: 170.3
|
||||
total_cost: 0.1558
|
||||
|
||||
|
||||
- dirname: 2024-11-09-11-09-15--Qwen2.5-Coder-32B-Instruct
|
||||
test_cases: 133
|
||||
model: Qwen2.5 Coder 32B-I
|
||||
released: 2024-11-12
|
||||
edit_format: diff
|
||||
commit_hash: ec9982a
|
||||
pass_rate_1: 59.4
|
||||
pass_rate_2: 71.4
|
||||
percent_cases_well_formed: 94.7
|
||||
error_outputs: 17
|
||||
num_malformed_responses: 17
|
||||
num_with_malformed_responses: 7
|
||||
user_asks: 1
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model openai/hf:Qwen/Qwen2.5-Coder-32B-Instruct --openai-api-base https://glhf.chat/api/openai/v1 (via GLHF)
|
||||
date: 2024-11-09
|
||||
versions: 0.59.2.dev
|
||||
seconds_per_case: 22.5
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-12-04-00-10-39--architect-qwq-qwen
|
||||
test_cases: 132
|
||||
model: QwQ + Qwen2.5 Coder 32B-I
|
||||
edit_format: architect
|
||||
commit_hash: 51c02da
|
||||
editor_model: openrouter/qwen/qwen-2.5-coder-32b-instruct
|
||||
editor_edit_format: editor-whole
|
||||
pass_rate_1: 58.3
|
||||
pass_rate_2: 73.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 3
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 186
|
||||
lazy_comments: 5
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model openrouter/qwen/qwq-32b-preview --editor-model openrouter/qwen/qwen-2.5-coder-32b-instruct --editor-edit-format editor-whole
|
||||
date: 2024-12-04
|
||||
versions: 0.66.1.dev
|
||||
seconds_per_case: 144.1
|
||||
total_cost: 0.1444
|
||||
|
||||
- dirname: 2024-12-04-00-42-05--qwq-alone-whole
|
||||
test_cases: 133
|
||||
model: QwQ
|
||||
edit_format: whole
|
||||
commit_hash: 19004c0
|
||||
pass_rate_1: 33.1
|
||||
pass_rate_2: 42.1
|
||||
percent_cases_well_formed: 91.0
|
||||
error_outputs: 28
|
||||
num_malformed_responses: 12
|
||||
num_with_malformed_responses: 12
|
||||
user_asks: 119
|
||||
lazy_comments: 2
|
||||
syntax_errors: 22
|
||||
indentation_errors: 9
|
||||
exhausted_context_windows: 2
|
||||
test_timeouts: 1
|
||||
command: aider --model openrouter/qwen/qwq-32b-preview
|
||||
date: 2024-12-04
|
||||
versions: 0.66.1.dev
|
||||
seconds_per_case: 414.3
|
||||
total_cost: 0.0000
|
|
@ -71,13 +71,6 @@ document.addEventListener('DOMContentLoaded', function () {
|
|||
];
|
||||
}
|
||||
}
|
||||
},
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Aider code editing benchmark results',
|
||||
font: {
|
||||
size: 16
|
||||
}
|
||||
}
|
||||
},
|
||||
scales: {
|
||||
|
@ -85,7 +78,7 @@ document.addEventListener('DOMContentLoaded', function () {
|
|||
beginAtZero: true,
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Percent completed correctly',
|
||||
text: 'Aider code editing benchmark (%)',
|
||||
font: {
|
||||
size: 18
|
||||
}
|
||||
|
|
133
aider/website/_posts/2024-12-03-qwq.md
Normal file
133
aider/website/_posts/2024-12-03-qwq.md
Normal file
|
@ -0,0 +1,133 @@
|
|||
---
|
||||
title: QwQ is a code architect, not an editor
|
||||
excerpt: QwQ is a reasoning model like o1, and needs to be used as an architect with another model as editor.
|
||||
#highlight_image: /assets/qwqization.jpg
|
||||
draft: false
|
||||
nav_exclude: true
|
||||
---
|
||||
{% if page.date %}
|
||||
<p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
|
||||
{% endif %}
|
||||
|
||||
# QwQ is a code architect, not an editor
|
||||
{: .no_toc }
|
||||
|
||||
<canvas id="qwqChart" width="800" height="500" style="margin: 20px 0"></canvas>
|
||||
|
||||
QwQ 32B Preview is a "reasoning" model, which spends a lot of tokens thinking before
|
||||
rendering a final response.
|
||||
In this way, it is similar to OpenAI's o1 models which are best used by
|
||||
[pairing the reasoning model as an architect with a traditional LLM as an editor](https://aider.chat/2024/09/26/architect.html).
|
||||
|
||||
Used alone, QwQ was unable to comply with even the simplest editing format.
|
||||
So it was not very successful at editing source code files.
|
||||
QwQ's solo score on the benchmark was underwhelming,
|
||||
far worse than the o1 models performing solo.
|
||||
|
||||
QwQ can perform better than the
|
||||
Qwen 2.5 Coder 32B Instruct model that it is based on
|
||||
when they are paired as architect + editor.
|
||||
This provides only a modest benefit,
|
||||
but results in a fairly slow overall response time.
|
||||
Each request must wait for QwQ to return all its thinking text
|
||||
and the ultimate solution.
|
||||
And then one must wait for Qwen to turn that large
|
||||
response into actual file edits.
|
||||
|
||||
Pairing QwQ with other sensible editor models performed worse than
|
||||
just using Qwen 2.5 Coder 32B Instruct alone.
|
||||
|
||||
QwQ+Qwen seems to be the best way to use QwQ, achieving a score of 74%.
|
||||
That is well off the
|
||||
SOTA results for this benchmark: Sonnet alone scores 84%, and
|
||||
o1-preview + o1-mini as architect + editor scores 85%.
|
||||
|
||||
|
||||
## QwQ specific editing formats
|
||||
|
||||
I spent some time experimenting with a variety of custom editing formats
|
||||
for QwQ.
|
||||
In particular, I tried to parse the QwQ response and discard the long
|
||||
sections of "thinking" and retain only the "final" solution.
|
||||
While I was able to successfully tease these sections apart,
|
||||
it did not translate to any significant improvement in the benchmarking results.
|
||||
|
||||
|
||||
## Results
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script>
|
||||
{% include qwq-chart.js %}
|
||||
</script>
|
||||
|
||||
<table style="width: 100%; max-width: 800px; margin: auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); font-size: 14px;">
|
||||
<thead style="background-color: #f2f2f2;">
|
||||
<tr>
|
||||
<th style="padding: 8px; text-align: left;">Model</th>
|
||||
<th style="padding: 8px; text-align: center;">Percent completed correctly</th>
|
||||
<th style="padding: 8px; text-align: center;">Percent using correct edit format</th>
|
||||
<th style="padding: 8px; text-align: left;">Command</th>
|
||||
<th style="padding: 8px; text-align: center;">Edit format</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% assign qwq_sorted = site.data.qwq | sort: 'pass_rate_2' | reverse %}
|
||||
{% for row in qwq_sorted %}
|
||||
<tr style="border-bottom: 1px solid #ddd;">
|
||||
<td style="padding: 8px;">{{ row.model }}</td>
|
||||
<td style="padding: 8px; text-align: center;">{{ row.pass_rate_2 }}%</td>
|
||||
<td style="padding: 8px; text-align: center;">{{ row.percent_cases_well_formed }}%</td>
|
||||
<td style="padding: 8px;"><code>{{ row.command }}</code></td>
|
||||
<td style="padding: 8px; text-align: center;">{{ row.edit_format }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<style>
|
||||
tr.selected {
|
||||
color: #0056b3;
|
||||
}
|
||||
table {
|
||||
table-layout: fixed;
|
||||
}
|
||||
td, th {
|
||||
word-wrap: break-word;
|
||||
overflow-wrap: break-word;
|
||||
}
|
||||
td:nth-child(3), td:nth-child(4) {
|
||||
font-size: 12px;
|
||||
}
|
||||
</style>
|
||||
|
||||
<script>
|
||||
// Filter the results table rows as the user types into the search box.
// The listener is only attached when an element with id "qwqSearchInput"
// is present: getElementById returns null for a missing id, and calling
// addEventListener on null would throw a TypeError when the page loads.
var qwqSearchInput = document.getElementById('qwqSearchInput');
if (qwqSearchInput) {
  qwqSearchInput.addEventListener('keyup', function() {
    // Case-insensitive match against the full text of each row.
    var input = this.value.toLowerCase();
    var rows = document.querySelectorAll('tbody tr');

    rows.forEach(function(row) {
      var text = row.textContent.toLowerCase();
      if (text.includes(input)) {
        // Matching rows stay visible and pick up the "selected" style.
        row.style.display = '';
        row.classList.add('selected');
      } else {
        row.style.display = 'none';
        row.classList.remove('selected');
      }
    });
  });
}
|
||||
</script>
|
||||
|
||||
## Open source model caveats
|
||||
|
||||
As discussed in a recent blog post,
|
||||
[details matter with open source models](https://aider.chat/2024/11/21/quantization.html).
|
||||
For clarity, I benchmarked against OpenRouter's endpoints for
|
||||
QwQ 32B Preview and Qwen 2.5 Coder 32B Instruct.
|
||||
For the other models, I went direct to their provider's APIs.
|
||||
|
||||
Having recently done extensive testing of OpenRouter's Qwen 2.5 Coder 32B Instruct,
|
||||
I feel comfortable using it. I blocked the provider Mancer due to its small
|
||||
context window.
|
||||
|
||||
For QwQ 32B Preview, I blocked Fireworks because of its small context window.
|
Loading…
Add table
Add a link
Reference in a new issue