mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-30 09:14:59 +00:00
o1-mini blog article
This commit is contained in:
parent
291b456a45
commit
96587f5f46
3 changed files with 181 additions and 1 deletions
|
@ -41,4 +41,8 @@ repository: paul-gauthier/aider
|
||||||
callouts:
|
callouts:
|
||||||
tip:
|
tip:
|
||||||
title: Tip
|
title: Tip
|
||||||
color: green
|
color: green
|
||||||
|
note:
|
||||||
|
title: Note
|
||||||
|
color: yellow
|
||||||
|
|
94
aider/website/_data/o1_results.yml
Normal file
94
aider/website/_data/o1_results.yml
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
- dirname: 2024-07-18-18-57-46--gpt-4o-mini-whole
|
||||||
|
test_cases: 133
|
||||||
|
model: gpt-4o-mini
|
||||||
|
edit_format: whole
|
||||||
|
commit_hash: d31eef3-dirty
|
||||||
|
pass_rate_1: 40.6
|
||||||
|
pass_rate_2: 55.6
|
||||||
|
released: 2024-07-18
|
||||||
|
percent_cases_well_formed: 100.0
|
||||||
|
error_outputs: 1
|
||||||
|
num_malformed_responses: 0
|
||||||
|
num_with_malformed_responses: 0
|
||||||
|
user_asks: 1
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 1
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 2
|
||||||
|
command: aider --model gpt-4o-mini
|
||||||
|
date: 2024-07-18
|
||||||
|
versions: 0.44.1-dev
|
||||||
|
seconds_per_case: 7.8
|
||||||
|
total_cost: 0.0916
|
||||||
|
|
||||||
|
- dirname: 2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue
|
||||||
|
test_cases: 133
|
||||||
|
model: claude-3.5-sonnet
|
||||||
|
edit_format: diff
|
||||||
|
commit_hash: 35f21b5
|
||||||
|
pass_rate_1: 57.1
|
||||||
|
pass_rate_2: 77.4
|
||||||
|
percent_cases_well_formed: 99.2
|
||||||
|
error_outputs: 23
|
||||||
|
released: 2024-06-20
|
||||||
|
num_malformed_responses: 4
|
||||||
|
num_with_malformed_responses: 1
|
||||||
|
user_asks: 2
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 1
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 1
|
||||||
|
command: aider --sonnet
|
||||||
|
date: 2024-07-04
|
||||||
|
versions: 0.42.1-dev
|
||||||
|
seconds_per_case: 17.6
|
||||||
|
total_cost: 3.6346
|
||||||
|
|
||||||
|
- dirname: 2024-08-06-18-28-39--gpt-4o-2024-08-06-diff-again
|
||||||
|
test_cases: 133
|
||||||
|
model: gpt-4o-2024-08-06
|
||||||
|
edit_format: diff
|
||||||
|
commit_hash: ed9ed89
|
||||||
|
pass_rate_1: 57.1
|
||||||
|
pass_rate_2: 71.4
|
||||||
|
percent_cases_well_formed: 98.5
|
||||||
|
error_outputs: 18
|
||||||
|
num_malformed_responses: 2
|
||||||
|
num_with_malformed_responses: 2
|
||||||
|
user_asks: 10
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 6
|
||||||
|
indentation_errors: 2
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 5
|
||||||
|
released: 2024-08-06
|
||||||
|
command: aider --model openai/gpt-4o-2024-08-06
|
||||||
|
date: 2024-08-06
|
||||||
|
versions: 0.48.1-dev
|
||||||
|
seconds_per_case: 6.5
|
||||||
|
total_cost: 0.0000
|
||||||
|
|
||||||
|
- dirname: 2024-09-12-19-57-35--o1-mini-whole
|
||||||
|
test_cases: 133
|
||||||
|
model: o1-mini
|
||||||
|
edit_format: whole
|
||||||
|
commit_hash: 36fa773-dirty, 291b456
|
||||||
|
pass_rate_1: 49.6
|
||||||
|
pass_rate_2: 70.7
|
||||||
|
percent_cases_well_formed: 100.0
|
||||||
|
error_outputs: 0
|
||||||
|
num_malformed_responses: 0
|
||||||
|
num_with_malformed_responses: 0
|
||||||
|
user_asks: 17
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 1
|
||||||
|
command: aider --model openai/o1-mini
|
||||||
|
date: 2024-09-12
|
||||||
|
versions: 0.56.1.dev
|
||||||
|
seconds_per_case: 103.0
|
||||||
|
total_cost: 5.3725
|
82
aider/website/_posts/2024-09-12-o1.md
Normal file
82
aider/website/_posts/2024-09-12-o1.md
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
---
|
||||||
|
title: Benchmark results for OpenAI o1-mini
|
||||||
|
excerpt: Preliminary benchmark results for the new OpenAI o1-mini model.
|
||||||
|
nav_exclude: true
|
||||||
|
---
|
||||||
|
{% if page.date %}
|
||||||
|
<p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
# Benchmark results for OpenAI o1-mini
|
||||||
|
|
||||||
|
OpenAI o1-mini is priced similarly to GPT-4o and Claude 3.5 Sonnet.
|
||||||
|
o1-mini scored below those models
|
||||||
|
when using the simple "whole" editing format.
|
||||||
|
It was close enough to GPT-4o to be within the margin of error.
|
||||||
|
|
||||||
|
The o1-mini model had trouble following the very simple "whole" editing format.
|
||||||
|
It's possible that it would get a better score if aider prompted with
|
||||||
|
more examples or was adapted to parse o1-mini's favorite way to mangle
|
||||||
|
the response format.
|
||||||
|
|
||||||
|
Note that o1-mini's "whole" score is compared against GPT-4o and Sonnet
|
||||||
|
"diff" results.
|
||||||
|
Using diff is more challenging for GPT-4o and Sonnet,
|
||||||
|
but it allows them to return search/replace blocks to
|
||||||
|
efficiently edit the source code.
|
||||||
|
The whole format requires o1-mini to return a fresh copy of the entire file,
|
||||||
|
increasing costs and latency.
|
||||||
|
|
||||||
|
|
||||||
|
{: .note }
|
||||||
|
> These are *preliminary* benchmark results, which will be updated as
|
||||||
|
> additional benchmark runs complete and rate limits open up.
|
||||||
|
|
||||||
|
<table style="width: 100%; max-width: 800px; margin: auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); font-size: 14px;">
|
||||||
|
<thead style="background-color: #f2f2f2;">
|
||||||
|
<tr>
|
||||||
|
<th style="padding: 8px; text-align: left;">Model</th>
|
||||||
|
<th style="padding: 8px; text-align: center;">Percent completed correctly</th>
|
||||||
|
<th style="padding: 8px; text-align: center;">Percent using correct edit format</th>
|
||||||
|
<th style="padding: 8px; text-align: left;">Command</th>
|
||||||
|
<th style="padding: 8px; text-align: center;">Edit format</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% assign edit_sorted = site.data.o1_results | sort: 'pass_rate_2' | reverse %}
|
||||||
|
{% for row in edit_sorted %}
|
||||||
|
<tr style="border-bottom: 1px solid #ddd;">
|
||||||
|
<td style="padding: 8px;">{{ row.model }}</td>
|
||||||
|
<td style="padding: 8px; text-align: center;">{{ row.pass_rate_2 }}%</td>
|
||||||
|
<td style="padding: 8px; text-align: center;">{{ row.percent_cases_well_formed }}%</td>
|
||||||
|
<td style="padding: 8px;"><code>{{ row.command }}</code></td>
|
||||||
|
<td style="padding: 8px; text-align: center;">{{ row.edit_format }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||||
|
|
||||||
|
{% include leaderboard_graph.html
|
||||||
|
chart_id="editChart"
|
||||||
|
data=edit_sorted
|
||||||
|
row_prefix="edit-row"
|
||||||
|
pass_rate_key="pass_rate_2"
|
||||||
|
%}
|
||||||
|
|
||||||
|
<style>
|
||||||
|
tr.selected {
|
||||||
|
color: #0056b3;
|
||||||
|
}
|
||||||
|
table {
|
||||||
|
table-layout: fixed;
|
||||||
|
}
|
||||||
|
td, th {
|
||||||
|
word-wrap: break-word;
|
||||||
|
overflow-wrap: break-word;
|
||||||
|
}
|
||||||
|
td:nth-child(3), td:nth-child(4) {
|
||||||
|
font-size: 12px;
|
||||||
|
}
|
||||||
|
</style>
|
Loading…
Add table
Add a link
Reference in a new issue