From b51768b08e4afadd819b9b38de584e3f2f5f5858 Mon Sep 17 00:00:00 2001
From: paul-gauthier <69695708+paul-gauthier@users.noreply.github.com>
Date: Mon, 23 Dec 2024 18:01:03 -0800
Subject: [PATCH 1/5] Update 2024-12-21-polyglot.md

---
 aider/website/_posts/2024-12-21-polyglot.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aider/website/_posts/2024-12-21-polyglot.md b/aider/website/_posts/2024-12-21-polyglot.md
index 479038d41..7b81f21fa 100644
--- a/aider/website/_posts/2024-12-21-polyglot.md
+++ b/aider/website/_posts/2024-12-21-polyglot.md
@@ -21,7 +21,7 @@ new
 other top LLMs.
 The new polyglot benchmark uses many popular coding languages
 and was designed to be 
-*much more challenging* than aider's old
+*much more challenging* than aider's original
 [code editing benchmark](/docs/leaderboards/edit.html).
 This more clearly distinguishes 
 the performance of

From 0a23c4abd6fbdd0afb7a46448eb02fda5506021e Mon Sep 17 00:00:00 2001
From: "Paul Gauthier (aider)" <paul@aider.chat>
Date: Tue, 24 Dec 2024 08:03:01 -0500
Subject: [PATCH 2/5] feat: Configure bash history to save commands immediately

---
 benchmark/docker.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmark/docker.sh b/benchmark/docker.sh
index 205552619..3a8e4003c 100755
--- a/benchmark/docker.sh
+++ b/benchmark/docker.sh
@@ -7,6 +7,10 @@ docker run \
        -v `pwd`/tmp.benchmarks/.:/benchmarks \
        -e OPENAI_API_KEY=$OPENAI_API_KEY \
        -e HISTFILE=/aider/.bash_history \
+       -e PROMPT_COMMAND='history -a' \
+       -e HISTCONTROL=ignoredups \
+       -e HISTSIZE=10000 \
+       -e HISTFILESIZE=20000 \
        -e AIDER_DOCKER=1 \
        -e AIDER_BENCHMARK_DIR=/benchmarks \
        aider-benchmark \

From 7537d79311421331fecd034c5a472d3d88af3276 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <paul@aider.chat>
Date: Wed, 25 Dec 2024 08:05:46 -0500
Subject: [PATCH 3/5] fix: Remove .bash_history from rsync

---
 benchmark/rsync.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/rsync.sh b/benchmark/rsync.sh
index 8d1be20eb..0de23c9d0 100755
--- a/benchmark/rsync.sh
+++ b/benchmark/rsync.sh
@@ -25,7 +25,7 @@ rsync -avz --delete \
     "$REPO_ROOT/" \
     "$DEST:~/aider/"
 
-rsync -a .env .bash_history .gitignore "$DEST:~/aider/."
+rsync -a .env .gitignore "$DEST:~/aider/."
 
 rsync -a ~/dotfiles/screenrc "$DEST:.screenrc"
 

From dd9b2a872c2c1a5bb5faf85c1e4398e92ae94c07 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <paul@aider.chat>
Date: Wed, 25 Dec 2024 08:11:04 -0500
Subject: [PATCH 4/5] copy

---
 .../website/_data/o1_polyglot_leaderboard.yml | 259 ++++++++++++++++++
 aider/website/_posts/2024-12-21-polyglot.md   |   9 +-
 2 files changed, 267 insertions(+), 1 deletion(-)
 create mode 100644 aider/website/_data/o1_polyglot_leaderboard.yml

diff --git a/aider/website/_data/o1_polyglot_leaderboard.yml b/aider/website/_data/o1_polyglot_leaderboard.yml
new file mode 100644
index 000000000..9badd7a85
--- /dev/null
+++ b/aider/website/_data/o1_polyglot_leaderboard.yml
@@ -0,0 +1,259 @@
+- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
+  test_cases: 225
+  model: gpt-4o-mini-2024-07-18
+  edit_format: whole
+  commit_hash: a755079-dirty
+  pass_rate_1: 0.9
+  pass_rate_2: 3.6
+  pass_num_1: 2
+  pass_num_2: 8
+  percent_cases_well_formed: 100.0
+  error_outputs: 0
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 36
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 3
+  total_tests: 225
+  command: aider --model gpt-4o-mini-2024-07-18
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 17.3
+  total_cost: 0.3236
+
+- dirname: 2024-12-21-18-44-28--polyglot-sonnet
+  test_cases: 225
+  model: claude-3-5-sonnet-20241022
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 18.7
+  pass_rate_2: 45.3
+  pass_num_1: 42
+  pass_num_2: 102
+  percent_cases_well_formed: 100.0
+  error_outputs: 1
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 14
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 12
+  total_tests: 225
+  command: aider --model claude-3-5-sonnet-20241022
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 30.8
+  total_cost: 13.4847
+
+- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff
+  test_cases: 225
+  model: gpt-4o-2024-11-20
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 4.9
+  pass_rate_2: 15.1
+  pass_num_1: 11
+  pass_num_2: 34
+  percent_cases_well_formed: 96.0
+  error_outputs: 12
+  num_malformed_responses: 11
+  num_with_malformed_responses: 9
+  user_asks: 34
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 19
+  total_tests: 225
+  command: aider --model gpt-4o-2024-11-20
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 22.2
+  total_cost: 7.1835
+
+- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
+  test_cases: 224
+  model: o1-2024-12-17 (high)
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 23.7
+  pass_rate_2: 61.7
+  pass_num_1: 53
+  pass_num_2: 139
+  percent_cases_well_formed: 91.5
+  error_outputs: 25
+  num_malformed_responses: 24
+  num_with_malformed_responses: 19
+  user_asks: 16
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  total_tests: 225
+  command: aider --model openrouter/openai/o1
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 133.2
+  total_cost: 0.0000
+
+- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
+  test_cases: 225
+  model: deepseek-chat
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 5.3
+  pass_rate_2: 17.8
+  pass_num_1: 12
+  pass_num_2: 40
+  percent_cases_well_formed: 92.9
+  error_outputs: 42
+  num_malformed_responses: 37
+  num_with_malformed_responses: 16
+  user_asks: 23
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 5
+  test_timeouts: 5
+  total_tests: 225
+  command: aider --model deepseek/deepseek-chat
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 184.0
+  total_cost: 0.5101
+
+- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
+  test_cases: 225
+  model: claude-3-5-haiku-20241022
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 7.1
+  pass_rate_2: 28.0
+  pass_num_1: 16
+  pass_num_2: 63
+  percent_cases_well_formed: 91.1
+  error_outputs: 31
+  num_malformed_responses: 30
+  num_with_malformed_responses: 20
+  user_asks: 13
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 9
+  total_tests: 225
+  command: aider --model claude-3-5-haiku-20241022
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 31.8
+  total_cost: 6.0583
+
+- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff
+  test_cases: 225
+  model: Qwen2.5-Coder-32B-Instruct
+  edit_format: diff
+  commit_hash: 6d7e8be-dirty
+  pass_rate_1: 4.4
+  pass_rate_2: 8.0
+  pass_num_1: 10
+  pass_num_2: 18
+  percent_cases_well_formed: 71.6
+  error_outputs: 158
+  num_malformed_responses: 148
+  num_with_malformed_responses: 64
+  user_asks: 132
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 2
+  total_tests: 225
+  command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic"
+  date: 2024-12-22
+  versions: 0.69.2.dev
+  seconds_per_case: 84.4
+  total_cost: 0.0000
+
+- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole
+  test_cases: 225
+  model: o1-mini-2024-09-12
+  edit_format: whole
+  commit_hash: 37df899
+  pass_rate_1: 5.8
+  pass_rate_2: 32.9
+  pass_num_1: 13
+  pass_num_2: 74
+  percent_cases_well_formed: 96.9
+  error_outputs: 8
+  num_malformed_responses: 8
+  num_with_malformed_responses: 7
+  user_asks: 27
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 3
+  total_tests: 225
+  command: aider --model o1-mini
+  date: 2024-12-22
+  versions: 0.69.2.dev
+  seconds_per_case: 34.7
+  total_cost: 18.5770
+
+- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2
+  test_cases: 225
+  model: gemini-exp-1206
+  edit_format: whole
+  commit_hash: b1bc2f8
+  pass_rate_1: 19.6
+  pass_rate_2: 38.2
+  pass_num_1: 44
+  pass_num_2: 86
+  percent_cases_well_formed: 98.2
+  error_outputs: 8
+  num_malformed_responses: 8
+  num_with_malformed_responses: 4
+  user_asks: 32
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 9
+  total_tests: 225
+  command: aider --model gemini/gemini-exp-1206
+  date: 2024-12-22
+  versions: 0.69.2.dev
+  seconds_per_case: 45.5
+  total_cost: 0.0000
+  
+- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole
+  test_cases: 225
+  model: gemini-2.0-flash-exp
+  edit_format: whole
+  commit_hash: b1bc2f8
+  pass_rate_1: 11.6
+  pass_rate_2: 22.2
+  pass_num_1: 26
+  pass_num_2: 50
+  percent_cases_well_formed: 100.0
+  error_outputs: 1
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 9
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 8
+  total_tests: 225
+  command: aider --model gemini/gemini-2.0-flash-exp
+  date: 2024-12-22
+  versions: 0.69.2.dev
+  seconds_per_case: 12.2
+  total_cost: 0.0000
\ No newline at end of file
diff --git a/aider/website/_posts/2024-12-21-polyglot.md b/aider/website/_posts/2024-12-21-polyglot.md
index 7b81f21fa..4b2f9bdc7 100644
--- a/aider/website/_posts/2024-12-21-polyglot.md
+++ b/aider/website/_posts/2024-12-21-polyglot.md
@@ -28,6 +28,13 @@ the performance of
 today's strongest coding models and
 leaves headroom for future LLMs.
 
+{: .note :}
+See the main 
+[aider leaderboard](https://aider.chat/docs/leaderboards/)
+for benchmark results from more models.
+This article only contains a snapshot
+of results at the time of publication.
+
 ## The polyglot benchmark
 
 Like aider's original code editing benchmark,
@@ -171,7 +178,7 @@ on GitHub.
     </tr>
   </thead>
   <tbody>
-    {% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
+    {% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
     {% for row in edit_sorted %}
       <tr style="border-bottom: 1px solid #ddd;">
         <td style="padding: 8px;">{{ row.model }}</td>

From ec2da0a399472250424c390aab2a88685521b835 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <paul@aider.chat>
Date: Wed, 25 Dec 2024 09:01:43 -0500
Subject: [PATCH 5/5] add deepseek v3

---
 .../website/_data/o1_polyglot_leaderboard.yml |  2 +-
 aider/website/_data/polyglot_leaderboard.yml  | 30 +++++++++++++++++--
 aider/website/docs/leaderboards/index.md      |  3 ++
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/aider/website/_data/o1_polyglot_leaderboard.yml b/aider/website/_data/o1_polyglot_leaderboard.yml
index 9badd7a85..20e8102ad 100644
--- a/aider/website/_data/o1_polyglot_leaderboard.yml
+++ b/aider/website/_data/o1_polyglot_leaderboard.yml
@@ -104,7 +104,7 @@
 
 - dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
   test_cases: 225
-  model: deepseek-chat
+  model: DeepSeek Chat V2.5
   edit_format: diff
   commit_hash: a755079-dirty
   pass_rate_1: 5.3
diff --git a/aider/website/_data/polyglot_leaderboard.yml b/aider/website/_data/polyglot_leaderboard.yml
index 9badd7a85..b841a1f0c 100644
--- a/aider/website/_data/polyglot_leaderboard.yml
+++ b/aider/website/_data/polyglot_leaderboard.yml
@@ -104,7 +104,7 @@
 
 - dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
   test_cases: 225
-  model: deepseek-chat
+  model: DeepSeek Chat V2.5
   edit_format: diff
   commit_hash: a755079-dirty
   pass_rate_1: 5.3
@@ -256,4 +256,30 @@
   date: 2024-12-22
   versions: 0.69.2.dev
   seconds_per_case: 12.2
-  total_cost: 0.0000
\ No newline at end of file
+  total_cost: 0.0000
+
+- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
+  test_cases: 225
+  model: DeepSeek Chat V3 Preview
+  edit_format: diff
+  commit_hash: 0a23c4a-dirty
+  pass_rate_1: 22.7
+  pass_rate_2: 48.4
+  pass_num_1: 51
+  pass_num_2: 109
+  percent_cases_well_formed: 98.7
+  error_outputs: 7
+  num_malformed_responses: 7
+  num_with_malformed_responses: 3
+  user_asks: 19
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 8
+  total_tests: 225
+  command: aider --model deepseek/deepseek-chat
+  date: 2024-12-25
+  versions: 0.69.2.dev
+  seconds_per_case: 34.8
+  total_cost: 0.3369
\ No newline at end of file
diff --git a/aider/website/docs/leaderboards/index.md b/aider/website/docs/leaderboards/index.md
index ce0b826ff..7b308ddd0 100644
--- a/aider/website/docs/leaderboards/index.md
+++ b/aider/website/docs/leaderboards/index.md
@@ -68,12 +68,15 @@ The model also has to successfully apply all its changes to the source file with
   </tbody>
 </table>
 
+### Aider polyglot benchmark results
+
 <canvas id="editChart" width="800" height="450" style="margin-top: 20px"></canvas>
 <script src="https://unpkg.com/patternomaly/dist/patternomaly.js"></script>
 <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
 <script>
 {% assign data_source = edit_sorted %}
 {% assign pass_rate_field = "pass_rate_2" %}
+{% assign highlight_model = "xxxxxxxxxxx" %}
 {% include leaderboard.js %}
 </script>
 <style>