o1-mini blog article

2025-05-30 09:14:59 +00:00 · 2024-09-12 14:07:06 -07:00 · 2024-09-12 14:07:06 -07:00 · 96587f5f46
commit 96587f5f46
parent 291b456a45
3 changed files with 181 additions and 1 deletions
--- a/aider/website/_config.yml
+++ b/aider/website/_config.yml
@ -41,4 +41,8 @@ repository: paul-gauthier/aider
 # Callout styles: each entry supplies a display title and a color.
 # NOTE(review): the `note` entry is presumably what `{: .note }` blocks in
 # posts resolve to via the site theme -- confirm.
 callouts:
  tip:
    title: Tip
-    color: green
+    color: green
  note:
    title: Note
    color: yellow
--- a/aider/website/_data/o1_results.yml
+++ b/aider/website/_data/o1_results.yml
@ -0,0 +1,94 @@
 # gpt-4o-mini, "whole" edit format (run dated 2024-07-18).
 - dirname: '2024-07-18-18-57-46--gpt-4o-mini-whole'
  # What was benchmarked and how it was invoked.
  model: 'gpt-4o-mini'
  released: 2024-07-18
  edit_format: 'whole'
  command: 'aider --model gpt-4o-mini'
  test_cases: 133
  # Headline numbers. The results table in the o1 post sorts on pass_rate_2
  # and prints it as "Percent completed correctly"; percent_cases_well_formed
  # is printed as "Percent using correct edit format".
  pass_rate_1: 40.6
  pass_rate_2: 55.6
  percent_cases_well_formed: 100.0
  # Per-run problem counters.
  error_outputs: 1
  num_malformed_responses: 0
  num_with_malformed_responses: 0
  user_asks: 1
  lazy_comments: 0
  syntax_errors: 1
  indentation_errors: 0
  exhausted_context_windows: 0
  test_timeouts: 2
  # Provenance, timing and cost of the run.
  date: 2024-07-18
  commit_hash: 'd31eef3-dirty'
  versions: '0.44.1-dev'
  seconds_per_case: 7.8
  total_cost: 0.0916
 # claude-3.5-sonnet, "diff" edit format (run dated 2024-07-04).
 - dirname: '2024-07-04-14-32-08--claude-3.5-sonnet-diff-continue'
  # What was benchmarked and how it was invoked.
  model: 'claude-3.5-sonnet'
  released: 2024-06-20
  edit_format: 'diff'
  command: 'aider --sonnet'
  test_cases: 133
  # Headline numbers. The results table in the o1 post sorts on pass_rate_2
  # and prints it as "Percent completed correctly"; percent_cases_well_formed
  # is printed as "Percent using correct edit format".
  pass_rate_1: 57.1
  pass_rate_2: 77.4
  percent_cases_well_formed: 99.2
  # Per-run problem counters.
  error_outputs: 23
  num_malformed_responses: 4
  num_with_malformed_responses: 1
  user_asks: 2
  lazy_comments: 0
  syntax_errors: 1
  indentation_errors: 0
  exhausted_context_windows: 0
  test_timeouts: 1
  # Provenance, timing and cost of the run.
  date: 2024-07-04
  commit_hash: '35f21b5'
  versions: '0.42.1-dev'
  seconds_per_case: 17.6
  total_cost: 3.6346
 # gpt-4o-2024-08-06, "diff" edit format (run dated 2024-08-06).
 - dirname: '2024-08-06-18-28-39--gpt-4o-2024-08-06-diff-again'
  # What was benchmarked and how it was invoked.
  model: 'gpt-4o-2024-08-06'
  released: 2024-08-06
  edit_format: 'diff'
  command: 'aider --model openai/gpt-4o-2024-08-06'
  test_cases: 133
  # Headline numbers. The results table in the o1 post sorts on pass_rate_2
  # and prints it as "Percent completed correctly"; percent_cases_well_formed
  # is printed as "Percent using correct edit format".
  pass_rate_1: 57.1
  pass_rate_2: 71.4
  percent_cases_well_formed: 98.5
  # Per-run problem counters.
  error_outputs: 18
  num_malformed_responses: 2
  num_with_malformed_responses: 2
  user_asks: 10
  lazy_comments: 0
  syntax_errors: 6
  indentation_errors: 2
  exhausted_context_windows: 0
  test_timeouts: 5
  # Provenance, timing and cost of the run.
  date: 2024-08-06
  commit_hash: 'ed9ed89'
  versions: '0.48.1-dev'
  seconds_per_case: 6.5
  # NOTE(review): cost is recorded as zero for this run; possibly it was not
  # tracked rather than actually free -- confirm before quoting it.
  total_cost: 0.0000
 # o1-mini, "whole" edit format (run dated 2024-09-12).
 - dirname: 2024-09-12-19-57-35--o1-mini-whole
  test_cases: 133
  model: o1-mini
  # Added so this entry carries the same `released` field as the other
  # entries in this file; OpenAI announced o1-mini on 2024-09-12.
  released: 2024-09-12
  edit_format: whole
  # Quoted because the value lists two commits separated by ", ".
  commit_hash: '36fa773-dirty, 291b456'
  # pass_rate_2 is the figure the post's results table sorts on and prints
  # as "Percent completed correctly".
  pass_rate_1: 49.6
  pass_rate_2: 70.7
  # Printed in the results table as "Percent using correct edit format".
  percent_cases_well_formed: 100.0
  error_outputs: 0
  num_malformed_responses: 0
  num_with_malformed_responses: 0
  user_asks: 17
  lazy_comments: 0
  syntax_errors: 0
  indentation_errors: 0
  exhausted_context_windows: 0
  test_timeouts: 1
  command: aider --model openai/o1-mini
  date: 2024-09-12
  # Quoted so the version can never be read as anything but a string.
  versions: '0.56.1.dev'
  seconds_per_case: 103.0
  total_cost: 5.3725
--- a/aider/website/_posts/2024-09-12-o1.md
+++ b/aider/website/_posts/2024-09-12-o1.md
@ -0,0 +1,82 @@
 ---
 # Front matter for the o1-mini benchmark post.
 title: Benchmark results for OpenAI o1-mini
 excerpt: Preliminary benchmark results for the new OpenAI o1-mini model.
 # NOTE(review): presumably keeps this post out of the theme's navigation -- confirm.
 nav_exclude: true
 ---
 {% comment %} Print the post date (formatted like "September 12, 2024") only when the page has one. {% endcomment %}
 {% if page.date %}
 <p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
 {% endif %}
 # Benchmark results for OpenAI o1-mini
 OpenAI o1-mini is priced similarly to GPT-4o and Claude 3.5 Sonnet. 
 o1-mini scored below those models
 when using the simple "whole" editing format.
 It was close enough to GPT-4o to be within the margin of error.
 The o1-mini model had trouble following the very simple whole editing format.
 It's possible that it would get a better score if aider prompted with
 more examples or was adapted to parse o1-mini's favorite way to mangle
 the response format.
 Note that o1-mini's "whole" score is compared against GPT-4o and Sonnet 
 "diff" results.
 Using diff is more challenging for GPT-4o and Sonnet,
 but it allows them to return search/replace blocks to 
 efficiently edit the source code.
 The whole format requires o1-mini to return a fresh copy of the entire file,
 increasing costs and latency.
 {: .note }
 > These are *preliminary* benchmark results, which will be updated as
 > additional benchmark runs complete and rate limits open up.
 <table style="width: 100%; max-width: 800px; margin: auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); font-size: 14px;">
  <thead style="background-color: #f2f2f2;">
    <tr>
      <th style="padding: 8px; text-align: left;">Model</th>
      <th style="padding: 8px; text-align: center;">Percent completed correctly</th>
      <th style="padding: 8px; text-align: center;">Percent using correct edit format</th>
      <th style="padding: 8px; text-align: left;">Command</th>
      <th style="padding: 8px; text-align: center;">Edit format</th>
    </tr>
  </thead>
  <tbody>
    {% comment %} Rank the o1 results best-first: sort ascending on pass_rate_2, then flip. edit_sorted is also handed to the graph include further down the page. {% endcomment %}
    {% assign ascending_results = site.data.o1_results | sort: 'pass_rate_2' %}
    {% assign edit_sorted = ascending_results | reverse %}
    {% comment %} One table row per benchmark result. {% endcomment %}
    {% for result in edit_sorted %}
      <tr style="border-bottom: 1px solid #ddd;">
        <td style="padding: 8px;">{{ result.model }}</td>
        <td style="padding: 8px; text-align: center;">{{ result.pass_rate_2 }}%</td>
        <td style="padding: 8px; text-align: center;">{{ result.percent_cases_well_formed }}%</td>
        <td style="padding: 8px;"><code>{{ result.command }}</code></td>
        <td style="padding: 8px; text-align: center;">{{ result.edit_format }}</td>
      </tr>
    {% endfor %}
  </tbody>
 </table>
 {% comment %}
 Load Chart.js from the jsDelivr CDN, then render the leaderboard graph include
 with the ranked o1 results.
 NOTE(review): the include presumably needs Chart.js loaded first -- confirm
 before reordering. The URL carries no version, so it is not pinned to a
 specific Chart.js release.
 {% endcomment %}
 <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
 {% include leaderboard_graph.html chart_id="editChart" data=edit_sorted row_prefix="edit-row" pass_rate_key="pass_rate_2" %}
 <style>
  /* Text color for table rows that carry the "selected" class. */
  tr.selected { color: #0056b3; }
  /* Fixed table layout. */
  table { table-layout: fixed; }
  /* Allow long words in cells to break and wrap (legacy and current property names). */
  td,
  th {
    word-wrap: break-word;
    overflow-wrap: break-word;
  }
  /* Smaller text in the 3rd and 4th columns; in the results table these are
     "Percent using correct edit format" and "Command". */
  td:nth-child(3),
  td:nth-child(4) { font-size: 12px; }
 </style>