mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-10 22:55:00 +00:00
copy
This commit is contained in:
parent
c00ac80909
commit
72f52bdef0
1 changed files with 13 additions and 10 deletions
|
@ -9,6 +9,17 @@ nav_exclude: true
|
||||||
|
|
||||||
# Benchmark results for OpenAI o1-mini
|
# Benchmark results for OpenAI o1-mini
|
||||||
|
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||||
|
|
||||||
|
{% assign edit_sorted = site.data.o1_results | sort: 'pass_rate_2' | reverse %}
|
||||||
|
{% include leaderboard_graph.html
|
||||||
|
chart_id="editChart"
|
||||||
|
data=edit_sorted
|
||||||
|
row_prefix="edit-row"
|
||||||
|
pass_rate_key="pass_rate_2"
|
||||||
|
%}
|
||||||
|
|
||||||
|
|
||||||
OpenAI o1-mini is priced similarly to GPT-4o and Claude 3.5 Sonnet,
|
OpenAI o1-mini is priced similarly to GPT-4o and Claude 3.5 Sonnet,
|
||||||
but scored below those models.
|
but scored below those models.
|
||||||
|
|
||||||
|
@ -24,10 +35,10 @@ efficiently edit the source code, saving time and token costs.
|
||||||
The o1-mini model had trouble conforming to both the whole and diff edit formats.
|
The o1-mini model had trouble conforming to both the whole and diff edit formats.
|
||||||
Aider is extremely permissive and tries hard to accept anything close
|
Aider is extremely permissive and tries hard to accept anything close
|
||||||
to the correct formats.
|
to the correct formats.
|
||||||
|
|
||||||
It's possible that o1-mini would get better scores if aider prompted with
|
It's possible that o1-mini would get better scores if aider prompted with
|
||||||
more examples or was adapted to parse o1-mini's favorite ways to mangle
|
more examples or was adapted to parse o1-mini's favorite ways to mangle
|
||||||
the response formats.
|
the response formats.
|
||||||
|
|
||||||
Over time it may be possible to better harness o1-mini's capabilities through
|
Over time it may be possible to better harness o1-mini's capabilities through
|
||||||
different prompting and editing formats.
|
different prompting and editing formats.
|
||||||
|
|
||||||
|
@ -49,6 +60,7 @@ aider --model o1-preview
|
||||||
> These are *preliminary* benchmark results, which will be updated as
|
> These are *preliminary* benchmark results, which will be updated as
|
||||||
> additional benchmark runs complete and rate limits open up.
|
> additional benchmark runs complete and rate limits open up.
|
||||||
|
|
||||||
|
|
||||||
<table style="width: 100%; max-width: 800px; margin: auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); font-size: 14px;">
|
<table style="width: 100%; max-width: 800px; margin: auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); font-size: 14px;">
|
||||||
<thead style="background-color: #f2f2f2;">
|
<thead style="background-color: #f2f2f2;">
|
||||||
<tr>
|
<tr>
|
||||||
|
@ -60,7 +72,6 @@ aider --model o1-preview
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% assign edit_sorted = site.data.o1_results | sort: 'pass_rate_2' | reverse %}
|
|
||||||
{% for row in edit_sorted %}
|
{% for row in edit_sorted %}
|
||||||
<tr style="border-bottom: 1px solid #ddd;">
|
<tr style="border-bottom: 1px solid #ddd;">
|
||||||
<td style="padding: 8px;">{{ row.model }}</td>
|
<td style="padding: 8px;">{{ row.model }}</td>
|
||||||
|
@ -73,14 +84,6 @@ aider --model o1-preview
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
|
||||||
|
|
||||||
{% include leaderboard_graph.html
|
|
||||||
chart_id="editChart"
|
|
||||||
data=edit_sorted
|
|
||||||
row_prefix="edit-row"
|
|
||||||
pass_rate_key="pass_rate_2"
|
|
||||||
%}
|
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
tr.selected {
|
tr.selected {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue