Merge branch 'main' into patch-1

2025-05-31 01:35:00 +00:00 · 2024-12-26 07:05:33 -05:00 · 2024-12-26 07:05:33 -05:00 · f1e623ec5a
commit f1e623ec5a
parent 4561f0c79e ec2da0a399
6 changed files with 304 additions and 4 deletions
--- a/aider/website/_data/o1_polyglot_leaderboard.yml
+++ b/aider/website/_data/o1_polyglot_leaderboard.yml
@ -0,0 +1,259 @@
+- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
+  test_cases: 225
+  model: gpt-4o-mini-2024-07-18
+  edit_format: whole
+  commit_hash: a755079-dirty
+  pass_rate_1: 0.9
+  pass_rate_2: 3.6
+  pass_num_1: 2
+  pass_num_2: 8
+  percent_cases_well_formed: 100.0
+  error_outputs: 0
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 36
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 3
+  total_tests: 225
+  command: aider --model gpt-4o-mini-2024-07-18
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 17.3
+  total_cost: 0.3236
+
+- dirname: 2024-12-21-18-44-28--polyglot-sonnet
+  test_cases: 225
+  model: claude-3-5-sonnet-20241022
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 18.7
+  pass_rate_2: 45.3
+  pass_num_1: 42
+  pass_num_2: 102
+  percent_cases_well_formed: 100.0
+  error_outputs: 1
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 14
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 12
+  total_tests: 225
+  command: aider --model claude-3-5-sonnet-20241022
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 30.8
+  total_cost: 13.4847
+
+- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff
+  test_cases: 225
+  model: gpt-4o-2024-11-20
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 4.9
+  pass_rate_2: 15.1
+  pass_num_1: 11
+  pass_num_2: 34
+  percent_cases_well_formed: 96.0
+  error_outputs: 12
+  num_malformed_responses: 11
+  num_with_malformed_responses: 9
+  user_asks: 34
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 19
+  total_tests: 225
+  command: aider --model gpt-4o-2024-11-20
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 22.2
+  total_cost: 7.1835
+
+- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
+  test_cases: 224
+  model: o1-2024-12-17 (high)
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 23.7
+  pass_rate_2: 61.7
+  pass_num_1: 53
+  pass_num_2: 139
+  percent_cases_well_formed: 91.5
+  error_outputs: 25
+  num_malformed_responses: 24
+  num_with_malformed_responses: 19
+  user_asks: 16
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  total_tests: 225
+  command: aider --model openrouter/openai/o1
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 133.2
+  total_cost: 0.0000
+
+- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
+  test_cases: 225
+  model: DeepSeek Chat V2.5
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 5.3
+  pass_rate_2: 17.8
+  pass_num_1: 12
+  pass_num_2: 40
+  percent_cases_well_formed: 92.9
+  error_outputs: 42
+  num_malformed_responses: 37
+  num_with_malformed_responses: 16
+  user_asks: 23
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 5
+  test_timeouts: 5
+  total_tests: 225
+  command: aider --model deepseek/deepseek-chat
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 184.0
+  total_cost: 0.5101
+
+- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
+  test_cases: 225
+  model: claude-3-5-haiku-20241022
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 7.1
+  pass_rate_2: 28.0
+  pass_num_1: 16
+  pass_num_2: 63
+  percent_cases_well_formed: 91.1
+  error_outputs: 31
+  num_malformed_responses: 30
+  num_with_malformed_responses: 20
+  user_asks: 13
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 9
+  total_tests: 225
+  command: aider --model claude-3-5-haiku-20241022
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 31.8
+  total_cost: 6.0583
+
+- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff
+  test_cases: 225
+  model: Qwen2.5-Coder-32B-Instruct
+  edit_format: diff
+  commit_hash: 6d7e8be-dirty
+  pass_rate_1: 4.4
+  pass_rate_2: 8.0
+  pass_num_1: 10
+  pass_num_2: 18
+  percent_cases_well_formed: 71.6
+  error_outputs: 158
+  num_malformed_responses: 148
+  num_with_malformed_responses: 64
+  user_asks: 132
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 2
+  total_tests: 225
+  command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic"
+  date: 2024-12-22
+  versions: 0.69.2.dev
+  seconds_per_case: 84.4
+  total_cost: 0.0000
+
+- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole
+  test_cases: 225
+  model: o1-mini-2024-09-12
+  edit_format: whole
+  commit_hash: 37df899
+  pass_rate_1: 5.8
+  pass_rate_2: 32.9
+  pass_num_1: 13
+  pass_num_2: 74
+  percent_cases_well_formed: 96.9
+  error_outputs: 8
+  num_malformed_responses: 8
+  num_with_malformed_responses: 7
+  user_asks: 27
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 3
+  total_tests: 225
+  command: aider --model o1-mini
+  date: 2024-12-22
+  versions: 0.69.2.dev
+  seconds_per_case: 34.7
+  total_cost: 18.5770
+
+- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2
+  test_cases: 225
+  model: gemini-exp-1206
+  edit_format: whole
+  commit_hash: b1bc2f8
+  pass_rate_1: 19.6
+  pass_rate_2: 38.2
+  pass_num_1: 44
+  pass_num_2: 86
+  percent_cases_well_formed: 98.2
+  error_outputs: 8
+  num_malformed_responses: 8
+  num_with_malformed_responses: 4
+  user_asks: 32
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 9
+  total_tests: 225
+  command: aider --model gemini/gemini-exp-1206
+  date: 2024-12-22
+  versions: 0.69.2.dev
+  seconds_per_case: 45.5
+  total_cost: 0.0000
+
+- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole
+  test_cases: 225
+  model: gemini-2.0-flash-exp
+  edit_format: whole
+  commit_hash: b1bc2f8
+  pass_rate_1: 11.6
+  pass_rate_2: 22.2
+  pass_num_1: 26
+  pass_num_2: 50
+  percent_cases_well_formed: 100.0
+  error_outputs: 1
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 9
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 8
+  total_tests: 225
+  command: aider --model gemini/gemini-2.0-flash-exp
+  date: 2024-12-22
+  versions: 0.69.2.dev
+  seconds_per_case: 12.2
+  total_cost: 0.0000
--- a/aider/website/_data/polyglot_leaderboard.yml
+++ b/aider/website/_data/polyglot_leaderboard.yml
@ -104,7 +104,7 @@

 - dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
  test_cases: 225
-  model: deepseek-chat
+  model: DeepSeek Chat V2.5
  edit_format: diff
  commit_hash: a755079-dirty
  pass_rate_1: 5.3
@ -283,3 +283,30 @@
  versions: 0.69.2.dev
  seconds_per_case: 146.7
  total_cost: 0.0000
+
+- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
+  test_cases: 225
+  model: DeepSeek Chat V3 Preview
+  edit_format: diff
+  commit_hash: 0a23c4a-dirty
+  pass_rate_1: 22.7
+  pass_rate_2: 48.4
+  pass_num_1: 51
+  pass_num_2: 109
+  percent_cases_well_formed: 98.7
+  error_outputs: 7
+  num_malformed_responses: 7
+  num_with_malformed_responses: 3
+  user_asks: 19
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 8
+  total_tests: 225
+  command: aider --model deepseek/deepseek-chat
+  date: 2024-12-25
+  versions: 0.69.2.dev
+  seconds_per_case: 34.8
+  total_cost: 0.3369
+
--- a/aider/website/_posts/2024-12-21-polyglot.md
+++ b/aider/website/_posts/2024-12-21-polyglot.md
@ -21,13 +21,20 @@ new
 other top LLMs.
 The new polyglot benchmark uses many popular coding languages
 and was designed to be 
-*much more challenging* than aider's old
+*much more challenging* than aider's original
 [code editing benchmark](/docs/leaderboards/edit.html).
 This more clearly distinguishes 
 the performance of
 today's strongest coding models and
 leaves headroom for future LLMs.

+{: .note :}
+See the main 
+[aider leaderboard](https://aider.chat/docs/leaderboards/)
+for benchmark results from more models.
+This article only contains a snapshot
+of results at the time of publication.
+
 ## The polyglot benchmark

 Like aider's original code editing benchmark,
@ -171,7 +178,7 @@ on GitHub.
    </tr>
  </thead>
  <tbody>
-    {% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
+    {% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
    {% for row in edit_sorted %}
      <tr style="border-bottom: 1px solid #ddd;">
        <td style="padding: 8px;">{{ row.model }}</td>
--- a/aider/website/docs/leaderboards/index.md
+++ b/aider/website/docs/leaderboards/index.md
@ -68,12 +68,15 @@ The model also has to successfully apply all its changes to the source file with
  </tbody>
 </table>

+### Aider polyglot benchmark results
+
 <canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas>
 <script src="https://unpkg.com/patternomaly/dist/patternomaly.js"></script>
 <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
 <script>
 {% assign data_source = edit_sorted %}
 {% assign pass_rate_field = "pass_rate_2" %}
+{% assign highlight_model = "xxxxxxxxxxx" %}
 {% include leaderboard.js %}
 </script>
 <style>
--- a/benchmark/docker.sh
+++ b/benchmark/docker.sh
@ -7,6 +7,10 @@ docker run \
       -v `pwd`/tmp.benchmarks/.:/benchmarks \
       -e OPENAI_API_KEY=$OPENAI_API_KEY \
       -e HISTFILE=/aider/.bash_history \
+       -e PROMPT_COMMAND='history -a' \
+       -e HISTCONTROL=ignoredups \
+       -e HISTSIZE=10000 \
+       -e HISTFILESIZE=20000 \
       -e AIDER_DOCKER=1 \
       -e AIDER_BENCHMARK_DIR=/benchmarks \
       aider-benchmark \
--- a/benchmark/rsync.sh
+++ b/benchmark/rsync.sh
@ -25,7 +25,7 @@ rsync -avz --delete \
    "$REPO_ROOT/" \
    "$DEST:~/aider/"

-rsync -a .env .bash_history .gitignore "$DEST:~/aider/."
+rsync -a .env .gitignore "$DEST:~/aider/."

 rsync -a ~/dotfiles/screenrc "$DEST:.screenrc"