From b51768b08e4afadd819b9b38de584e3f2f5f5858 Mon Sep 17 00:00:00 2001 From: paul-gauthier <69695708+paul-gauthier@users.noreply.github.com> Date: Mon, 23 Dec 2024 18:01:03 -0800 Subject: [PATCH 1/5] Update 2024-12-21-polyglot.md --- aider/website/_posts/2024-12-21-polyglot.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aider/website/_posts/2024-12-21-polyglot.md b/aider/website/_posts/2024-12-21-polyglot.md index 479038d41..7b81f21fa 100644 --- a/aider/website/_posts/2024-12-21-polyglot.md +++ b/aider/website/_posts/2024-12-21-polyglot.md @@ -21,7 +21,7 @@ new other top LLMs. The new polyglot benchmark uses many popular coding languages and was designed to be -*much more challenging* than aider's old +*much more challenging* than aider's original [code editing benchmark](/docs/leaderboards/edit.html). This more clearly distinguishes the performance of From 0a23c4abd6fbdd0afb7a46448eb02fda5506021e Mon Sep 17 00:00:00 2001 From: "Paul Gauthier (aider)" Date: Tue, 24 Dec 2024 08:03:01 -0500 Subject: [PATCH 2/5] feat: Configure bash history to save commands immediately --- benchmark/docker.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmark/docker.sh b/benchmark/docker.sh index 205552619..3a8e4003c 100755 --- a/benchmark/docker.sh +++ b/benchmark/docker.sh @@ -7,6 +7,10 @@ docker run \ -v `pwd`/tmp.benchmarks/.:/benchmarks \ -e OPENAI_API_KEY=$OPENAI_API_KEY \ -e HISTFILE=/aider/.bash_history \ + -e PROMPT_COMMAND='history -a' \ + -e HISTCONTROL=ignoredups \ + -e HISTSIZE=10000 \ + -e HISTFILESIZE=20000 \ -e AIDER_DOCKER=1 \ -e AIDER_BENCHMARK_DIR=/benchmarks \ aider-benchmark \ From 7537d79311421331fecd034c5a472d3d88af3276 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 25 Dec 2024 08:05:46 -0500 Subject: [PATCH 3/5] fix: Remove .bash_history from rsync --- benchmark/rsync.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/rsync.sh b/benchmark/rsync.sh index 8d1be20eb..0de23c9d0 100755 --- a/benchmark/rsync.sh +++ b/benchmark/rsync.sh @@ -25,7 +25,7 @@ rsync -avz --delete \ "$REPO_ROOT/" \ "$DEST:~/aider/" -rsync -a .env .bash_history .gitignore "$DEST:~/aider/." +rsync -a .env .gitignore "$DEST:~/aider/." rsync -a ~/dotfiles/screenrc "$DEST:.screenrc" From dd9b2a872c2c1a5bb5faf85c1e4398e92ae94c07 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 25 Dec 2024 08:11:04 -0500 Subject: [PATCH 4/5] copy --- .../website/_data/o1_polyglot_leaderboard.yml | 259 ++++++++++++++++++ aider/website/_posts/2024-12-21-polyglot.md | 9 +- 2 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 aider/website/_data/o1_polyglot_leaderboard.yml diff --git a/aider/website/_data/o1_polyglot_leaderboard.yml b/aider/website/_data/o1_polyglot_leaderboard.yml new file mode 100644 index 000000000..9badd7a85 --- /dev/null +++ b/aider/website/_data/o1_polyglot_leaderboard.yml @@ -0,0 +1,259 @@ +- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini + test_cases: 225 + model: gpt-4o-mini-2024-07-18 + edit_format: whole + commit_hash: a755079-dirty + pass_rate_1: 0.9 + pass_rate_2: 3.6 + pass_num_1: 2 + pass_num_2: 8 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 36 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + total_tests: 225 + command: aider --model gpt-4o-mini-2024-07-18 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 17.3 + total_cost: 0.3236 + +- dirname: 2024-12-21-18-44-28--polyglot-sonnet + test_cases: 225 + model: claude-3-5-sonnet-20241022 + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 18.7 + pass_rate_2: 45.3 + pass_num_1: 42 + pass_num_2: 102 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 14 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 12 + total_tests: 225 + command: aider --model claude-3-5-sonnet-20241022 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 30.8 + total_cost: 13.4847 + +- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff + test_cases: 225 + model: gpt-4o-2024-11-20 + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 4.9 + pass_rate_2: 15.1 + pass_num_1: 11 + pass_num_2: 34 + percent_cases_well_formed: 96.0 + error_outputs: 12 + num_malformed_responses: 11 + num_with_malformed_responses: 9 + user_asks: 34 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 19 + total_tests: 225 + command: aider --model gpt-4o-2024-11-20 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 22.2 + total_cost: 7.1835 + +- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff + test_cases: 224 + model: o1-2024-12-17 (high) + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 23.7 + pass_rate_2: 61.7 + pass_num_1: 53 + pass_num_2: 139 + percent_cases_well_formed: 91.5 + error_outputs: 25 + num_malformed_responses: 24 + num_with_malformed_responses: 19 + user_asks: 16 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + total_tests: 225 + command: aider --model openrouter/openai/o1 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 133.2 + total_cost: 0.0000 + +- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff + test_cases: 225 + model: deepseek-chat + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 5.3 + pass_rate_2: 17.8 + pass_num_1: 12 + pass_num_2: 40 + percent_cases_well_formed: 92.9 + error_outputs: 42 + num_malformed_responses: 37 + num_with_malformed_responses: 16 + user_asks: 23 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 5 + test_timeouts: 5 + total_tests: 225 + command: aider --model deepseek/deepseek-chat + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 184.0 + total_cost: 0.5101 + +- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff + test_cases: 225 + model: claude-3-5-haiku-20241022 + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 7.1 + pass_rate_2: 28.0 + pass_num_1: 16 + pass_num_2: 63 + percent_cases_well_formed: 91.1 + error_outputs: 31 + num_malformed_responses: 30 + num_with_malformed_responses: 20 + user_asks: 13 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 9 + total_tests: 225 + command: aider --model claude-3-5-haiku-20241022 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 31.8 + total_cost: 6.0583 + +- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff + test_cases: 225 + model: Qwen2.5-Coder-32B-Instruct + edit_format: diff + commit_hash: 6d7e8be-dirty + pass_rate_1: 4.4 + pass_rate_2: 8.0 + pass_num_1: 10 + pass_num_2: 18 + percent_cases_well_formed: 71.6 + error_outputs: 158 + num_malformed_responses: 148 + num_with_malformed_responses: 64 + user_asks: 132 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 2 + total_tests: 225 + command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic" + date: 2024-12-22 + versions: 0.69.2.dev + seconds_per_case: 84.4 + total_cost: 0.0000 + +- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole + test_cases: 225 + model: o1-mini-2024-09-12 + edit_format: whole + commit_hash: 37df899 + pass_rate_1: 5.8 + pass_rate_2: 32.9 + pass_num_1: 13 + pass_num_2: 74 + percent_cases_well_formed: 96.9 + error_outputs: 8 + num_malformed_responses: 8 + num_with_malformed_responses: 7 + user_asks: 27 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + total_tests: 225 + command: aider --model o1-mini + date: 2024-12-22 + versions: 0.69.2.dev + seconds_per_case: 34.7 + total_cost: 18.5770 + +- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2 + test_cases: 225 + model: gemini-exp-1206 + edit_format: whole + commit_hash: b1bc2f8 + pass_rate_1: 19.6 + pass_rate_2: 38.2 + pass_num_1: 44 + pass_num_2: 86 + percent_cases_well_formed: 98.2 + error_outputs: 8 + num_malformed_responses: 8 + num_with_malformed_responses: 4 + user_asks: 32 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 9 + total_tests: 225 + command: aider --model gemini/gemini-exp-1206 + date: 2024-12-22 + versions: 0.69.2.dev + seconds_per_case: 45.5 + total_cost: 0.0000 + +- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole + test_cases: 225 + model: gemini-2.0-flash-exp + edit_format: whole + commit_hash: b1bc2f8 + pass_rate_1: 11.6 + pass_rate_2: 22.2 + pass_num_1: 26 + pass_num_2: 50 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 9 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 8 + total_tests: 225 + command: aider --model gemini/gemini-2.0-flash-exp + date: 2024-12-22 + versions: 0.69.2.dev + seconds_per_case: 12.2 + total_cost: 0.0000 \ No newline at end of file diff --git a/aider/website/_posts/2024-12-21-polyglot.md b/aider/website/_posts/2024-12-21-polyglot.md index 7b81f21fa..4b2f9bdc7 100644 --- a/aider/website/_posts/2024-12-21-polyglot.md +++ b/aider/website/_posts/2024-12-21-polyglot.md @@ -28,6 +28,13 @@ the performance of today's strongest coding models and leaves headroom for future LLMs. +{: .note :} +See the main +[aider leaderboard](https://aider.chat/docs/leaderboards/) +for benchmark results from more models. +This article only contains a snapshot +of results at the time of publication. + ## The polyglot benchmark Like aider's original code editing benchmark, @@ -171,7 +178,7 @@ on GitHub. - {% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %} + {% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %} {% for row in edit_sorted %} {{ row.model }} From ec2da0a399472250424c390aab2a88685521b835 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 25 Dec 2024 09:01:43 -0500 Subject: [PATCH 5/5] add deepseek v3 --- .../website/_data/o1_polyglot_leaderboard.yml | 2 +- aider/website/_data/polyglot_leaderboard.yml | 30 +++++++++++++++++-- aider/website/docs/leaderboards/index.md | 3 ++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/aider/website/_data/o1_polyglot_leaderboard.yml b/aider/website/_data/o1_polyglot_leaderboard.yml index 9badd7a85..20e8102ad 100644 --- a/aider/website/_data/o1_polyglot_leaderboard.yml +++ b/aider/website/_data/o1_polyglot_leaderboard.yml @@ -104,7 +104,7 @@ - dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff test_cases: 225 - model: deepseek-chat + model: DeepSeek Chat V2.5 edit_format: diff commit_hash: a755079-dirty pass_rate_1: 5.3 diff --git a/aider/website/_data/polyglot_leaderboard.yml b/aider/website/_data/polyglot_leaderboard.yml index 9badd7a85..b841a1f0c 100644 --- a/aider/website/_data/polyglot_leaderboard.yml +++ b/aider/website/_data/polyglot_leaderboard.yml @@ -104,7 +104,7 @@ - dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff test_cases: 225 - model: deepseek-chat + model: DeepSeek Chat V2.5 edit_format: diff commit_hash: a755079-dirty pass_rate_1: 5.3 @@ -256,4 +256,30 @@ date: 2024-12-22 versions: 0.69.2.dev seconds_per_case: 12.2 - total_cost: 0.0000 \ No newline at end of file + total_cost: 0.0000 + +- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2 + test_cases: 225 + model: DeepSeek Chat V3 Preview + edit_format: diff + commit_hash: 0a23c4a-dirty + pass_rate_1: 22.7 + pass_rate_2: 48.4 + pass_num_1: 51 + pass_num_2: 109 + percent_cases_well_formed: 98.7 + error_outputs: 7 + num_malformed_responses: 7 + num_with_malformed_responses: 3 + user_asks: 19 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 8 + total_tests: 225 + command: aider --model deepseek/deepseek-chat + date: 2024-12-25 + versions: 0.69.2.dev + seconds_per_case: 34.8 + total_cost: 0.3369 \ No newline at end of file diff --git a/aider/website/docs/leaderboards/index.md b/aider/website/docs/leaderboards/index.md index ce0b826ff..7b308ddd0 100644 --- a/aider/website/docs/leaderboards/index.md +++ b/aider/website/docs/leaderboards/index.md @@ -68,12 +68,15 @@ The model also has to successfully apply all its changes to the source file with +### Aider polyglot benchmark results +