mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-24 22:34:59 +00:00
initial
This commit is contained in:
parent
7180cb049c
commit
1a9d4bfb1c
3 changed files with 257 additions and 8 deletions
123
aider/website/_data/qwq.yml
Normal file
123
aider/website/_data/qwq.yml
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
|
||||||
|
- dirname: 2024-11-28-21-38-50--architect-qwq-haiku-whole
|
||||||
|
test_cases: 133
|
||||||
|
model: QwQ + Haiku
|
||||||
|
edit_format: architect
|
||||||
|
commit_hash: e4a1d6f
|
||||||
|
editor_model: claude-3-5-haiku-20241022
|
||||||
|
editor_edit_format: editor-whole
|
||||||
|
pass_rate_1: 54.1
|
||||||
|
pass_rate_2: 71.4
|
||||||
|
percent_cases_well_formed: 100.0
|
||||||
|
error_outputs: 4
|
||||||
|
num_malformed_responses: 0
|
||||||
|
num_with_malformed_responses: 0
|
||||||
|
user_asks: 196
|
||||||
|
lazy_comments: 4
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 0
|
||||||
|
command: aider --model openrouter/qwen/qwq-32b-preview --editor-model claude-3-5-haiku-20241022 --edit-format editor-whole
|
||||||
|
date: 2024-11-28
|
||||||
|
versions: 0.65.2.dev
|
||||||
|
seconds_per_case: 154.7
|
||||||
|
total_cost: 1.4196
|
||||||
|
|
||||||
|
- dirname: 2024-11-28-19-24-35--architect-qwq-deepseek-whole
|
||||||
|
test_cases: 133
|
||||||
|
model: QwQ + DeepSeek V2.5
|
||||||
|
edit_format: architect
|
||||||
|
commit_hash: e4a1d6f
|
||||||
|
editor_model: deepseek/deepseek-chat
|
||||||
|
editor_edit_format: editor-whole
|
||||||
|
pass_rate_1: 55.6
|
||||||
|
pass_rate_2: 67.7
|
||||||
|
percent_cases_well_formed: 100.0
|
||||||
|
error_outputs: 3
|
||||||
|
num_malformed_responses: 0
|
||||||
|
num_with_malformed_responses: 0
|
||||||
|
user_asks: 193
|
||||||
|
lazy_comments: 2
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 0
|
||||||
|
command: aider --model openrouter/qwen/qwq-32b-preview --editor-model deepseek/deepseek-chat --edit-format editor-whole
|
||||||
|
date: 2024-11-28
|
||||||
|
versions: 0.65.2.dev
|
||||||
|
seconds_per_case: 170.3
|
||||||
|
total_cost: 0.1558
|
||||||
|
|
||||||
|
|
||||||
|
- dirname: 2024-11-09-11-09-15--Qwen2.5-Coder-32B-Instruct
|
||||||
|
test_cases: 133
|
||||||
|
model: Qwen2.5 Coder 32B-I
|
||||||
|
released: 2024-11-12
|
||||||
|
edit_format: diff
|
||||||
|
commit_hash: ec9982a
|
||||||
|
pass_rate_1: 59.4
|
||||||
|
pass_rate_2: 71.4
|
||||||
|
percent_cases_well_formed: 94.7
|
||||||
|
error_outputs: 17
|
||||||
|
num_malformed_responses: 17
|
||||||
|
num_with_malformed_responses: 7
|
||||||
|
user_asks: 1
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 3
|
||||||
|
command: aider --model openai/hf:Qwen/Qwen2.5-Coder-32B-Instruct --openai-api-base https://glhf.chat/api/openai/v1 (via GLHF)
|
||||||
|
date: 2024-11-09
|
||||||
|
versions: 0.59.2.dev
|
||||||
|
seconds_per_case: 22.5
|
||||||
|
total_cost: 0.0000
|
||||||
|
|
||||||
|
- dirname: 2024-12-04-00-10-39--architect-qwq-qwen
|
||||||
|
test_cases: 132
|
||||||
|
model: QwQ + Qwen2.5 Coder 32B-I
|
||||||
|
edit_format: architect
|
||||||
|
commit_hash: 51c02da
|
||||||
|
editor_model: openrouter/qwen/qwen-2.5-coder-32b-instruct
|
||||||
|
editor_edit_format: editor-whole
|
||||||
|
pass_rate_1: 58.3
|
||||||
|
pass_rate_2: 73.6
|
||||||
|
percent_cases_well_formed: 100.0
|
||||||
|
error_outputs: 3
|
||||||
|
num_malformed_responses: 0
|
||||||
|
num_with_malformed_responses: 0
|
||||||
|
user_asks: 186
|
||||||
|
lazy_comments: 5
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 0
|
||||||
|
command: aider --model openrouter/qwen/qwq-32b-preview --editor-model openrouter/qwen/qwen-2.5-coder-32b-instruct --editor-edit-format editor-whole
|
||||||
|
date: 2024-12-04
|
||||||
|
versions: 0.66.1.dev
|
||||||
|
seconds_per_case: 144.1
|
||||||
|
total_cost: 0.1444
|
||||||
|
|
||||||
|
- dirname: 2024-12-04-00-42-05--qwq-alone-whole
|
||||||
|
test_cases: 133
|
||||||
|
model: QwQ
|
||||||
|
edit_format: whole
|
||||||
|
commit_hash: 19004c0
|
||||||
|
pass_rate_1: 33.1
|
||||||
|
pass_rate_2: 42.1
|
||||||
|
percent_cases_well_formed: 91.0
|
||||||
|
error_outputs: 28
|
||||||
|
num_malformed_responses: 12
|
||||||
|
num_with_malformed_responses: 12
|
||||||
|
user_asks: 119
|
||||||
|
lazy_comments: 2
|
||||||
|
syntax_errors: 22
|
||||||
|
indentation_errors: 9
|
||||||
|
exhausted_context_windows: 2
|
||||||
|
test_timeouts: 1
|
||||||
|
command: aider --model openrouter/qwen/qwq-32b-preview
|
||||||
|
date: 2024-12-04
|
||||||
|
versions: 0.66.1.dev
|
||||||
|
seconds_per_case: 414.3
|
||||||
|
total_cost: 0.0000
|
|
@ -71,13 +71,6 @@ document.addEventListener('DOMContentLoaded', function () {
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
|
||||||
title: {
|
|
||||||
display: true,
|
|
||||||
text: 'Aider code editing benchmark results',
|
|
||||||
font: {
|
|
||||||
size: 16
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
scales: {
|
scales: {
|
||||||
|
@ -85,7 +78,7 @@ document.addEventListener('DOMContentLoaded', function () {
|
||||||
beginAtZero: true,
|
beginAtZero: true,
|
||||||
title: {
|
title: {
|
||||||
display: true,
|
display: true,
|
||||||
text: 'Percent completed correctly',
|
text: 'Aider code editing benchmark (%)',
|
||||||
font: {
|
font: {
|
||||||
size: 18
|
size: 18
|
||||||
}
|
}
|
||||||
|
|
133
aider/website/_posts/2024-12-03-qwq.md
Normal file
133
aider/website/_posts/2024-12-03-qwq.md
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
---
|
||||||
|
title: QwQ is a code architect, not an editor
|
||||||
|
excerpt: QwQ is a reasoning model like o1, and needs to be used as an architect with another model as editor.
|
||||||
|
#highlight_image: /assets/qwqization.jpg
|
||||||
|
draft: false
|
||||||
|
nav_exclude: true
|
||||||
|
---
|
||||||
|
{% if page.date %}
|
||||||
|
<p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
# QwQ is a code architect, not an editor
|
||||||
|
{: .no_toc }
|
||||||
|
|
||||||
|
<canvas id="qwqChart" width="800" height="500" style="margin: 20px 0"></canvas>
|
||||||
|
|
||||||
|
QwQ 32B Preview is a "reasoning" model, which spends a lot of tokens thinking before
|
||||||
|
rendering a final response.
|
||||||
|
In this way, it is similar to OpenAI's o1 models, which are best used by
|
||||||
|
[pairing the reasoning model as an architect with a traditional LLM as an editor](https://aider.chat/2024/09/26/architect.html).
|
||||||
|
|
||||||
|
Used alone, QwQ was unable to comply with even the simplest editing format.
|
||||||
|
So it was not very successful at editing source code files.
|
||||||
|
QwQ's solo score on the benchmark was underwhelming,
|
||||||
|
far worse than the o1 models performing solo.
|
||||||
|
|
||||||
|
QwQ can perform better than the
|
||||||
|
Qwen 2.5 Coder 32B Instruct model that it is based on
|
||||||
|
when they are paired as architect + editor.
|
||||||
|
This provides only a modest benefit,
|
||||||
|
but results in a fairly slow overall response time.
|
||||||
|
Each request must wait for QwQ to return all its thinking text
|
||||||
|
and the ultimate solution.
|
||||||
|
And then one must wait for Qwen to turn that large
|
||||||
|
response into actual file edits.
|
||||||
|
|
||||||
|
Pairing QwQ with other sensible editor models performed worse than
|
||||||
|
just using Qwen 2.5 Coder 32B Instruct alone.
|
||||||
|
|
||||||
|
QwQ+Qwen seems to be the best way to use QwQ, achieving a score of 74%.
|
||||||
|
That is well off the
|
||||||
|
SOTA results for this benchmark: Sonnet alone scores 84%, and
|
||||||
|
o1-preview + o1-mini as architect + editor scores 85%.
|
||||||
|
|
||||||
|
|
||||||
|
## QwQ specific editing formats
|
||||||
|
|
||||||
|
I spent some time experimenting with a variety of custom editing formats
|
||||||
|
for QwQ.
|
||||||
|
In particular, I tried to parse the QwQ response and discard the long
|
||||||
|
sections of "thinking" and retain only the "final" solution.
|
||||||
|
While I was able to successfully tease these sections apart,
|
||||||
|
it did not translate to any significant improvement in the benchmarking results.
|
||||||
|
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||||
|
<script>
|
||||||
|
{% include qwq-chart.js %}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<table style="width: 100%; max-width: 800px; margin: auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); font-size: 14px;">
|
||||||
|
<thead style="background-color: #f2f2f2;">
|
||||||
|
<tr>
|
||||||
|
<th style="padding: 8px; text-align: left;">Model</th>
|
||||||
|
<th style="padding: 8px; text-align: center;">Percent completed correctly</th>
|
||||||
|
<th style="padding: 8px; text-align: center;">Percent using correct edit format</th>
|
||||||
|
<th style="padding: 8px; text-align: left;">Command</th>
|
||||||
|
<th style="padding: 8px; text-align: center;">Edit format</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% assign qwq_sorted = site.data.qwq | sort: 'pass_rate_2' | reverse %}
|
||||||
|
{% for row in qwq_sorted %}
|
||||||
|
<tr style="border-bottom: 1px solid #ddd;">
|
||||||
|
<td style="padding: 8px;">{{ row.model }}</td>
|
||||||
|
<td style="padding: 8px; text-align: center;">{{ row.pass_rate_2 }}%</td>
|
||||||
|
<td style="padding: 8px; text-align: center;">{{ row.percent_cases_well_formed }}%</td>
|
||||||
|
<td style="padding: 8px;"><code>{{ row.command }}</code></td>
|
||||||
|
<td style="padding: 8px; text-align: center;">{{ row.edit_format }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
tr.selected {
|
||||||
|
color: #0056b3;
|
||||||
|
}
|
||||||
|
table {
|
||||||
|
table-layout: fixed;
|
||||||
|
}
|
||||||
|
td, th {
|
||||||
|
word-wrap: break-word;
|
||||||
|
overflow-wrap: break-word;
|
||||||
|
}
|
||||||
|
td:nth-child(3), td:nth-child(4) {
|
||||||
|
font-size: 12px;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
document.getElementById('qwqSearchInput').addEventListener('keyup', function() {
|
||||||
|
var input = this.value.toLowerCase();
|
||||||
|
var rows = document.querySelectorAll('tbody tr');
|
||||||
|
|
||||||
|
rows.forEach(function(row) {
|
||||||
|
var text = row.textContent.toLowerCase();
|
||||||
|
if(text.includes(input)) {
|
||||||
|
row.style.display = '';
|
||||||
|
row.classList.add('selected');
|
||||||
|
} else {
|
||||||
|
row.style.display = 'none';
|
||||||
|
row.classList.remove('selected');
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
|
||||||
|
## Open source model caveats
|
||||||
|
|
||||||
|
As discussed in a recent blog post,
|
||||||
|
[details matter with open source models](https://aider.chat/2024/11/21/quantization.html).
|
||||||
|
For clarity, I benchmarked against OpenRouter's endpoints for
|
||||||
|
QwQ 32B Preview and Qwen 2.5 Coder 32B Instruct.
|
||||||
|
For the other models, I went directly to their providers' APIs.
|
||||||
|
|
||||||
|
Having recently done extensive testing of OpenRouter's Qwen 2.5 Coder 32B Instruct,
|
||||||
|
I feel comfortable using it. I blocked the provider Mancer due to its small
|
||||||
|
context window.
|
||||||
|
|
||||||
|
For QwQ 32B Preview, I blocked Fireworks because of its small context window.
|
Loading…
Add table
Add a link
Reference in a new issue