diff --git a/aider/website/_data/o1_polyglot_leaderboard.yml b/aider/website/_data/o1_polyglot_leaderboard.yml new file mode 100644 index 000000000..9badd7a85 --- /dev/null +++ b/aider/website/_data/o1_polyglot_leaderboard.yml @@ -0,0 +1,259 @@ +- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini + test_cases: 225 + model: gpt-4o-mini-2024-07-18 + edit_format: whole + commit_hash: a755079-dirty + pass_rate_1: 0.9 + pass_rate_2: 3.6 + pass_num_1: 2 + pass_num_2: 8 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 36 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + total_tests: 225 + command: aider --model gpt-4o-mini-2024-07-18 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 17.3 + total_cost: 0.3236 + +- dirname: 2024-12-21-18-44-28--polyglot-sonnet + test_cases: 225 + model: claude-3-5-sonnet-20241022 + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 18.7 + pass_rate_2: 45.3 + pass_num_1: 42 + pass_num_2: 102 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 14 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 12 + total_tests: 225 + command: aider --model claude-3-5-sonnet-20241022 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 30.8 + total_cost: 13.4847 + +- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff + test_cases: 225 + model: gpt-4o-2024-11-20 + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 4.9 + pass_rate_2: 15.1 + pass_num_1: 11 + pass_num_2: 34 + percent_cases_well_formed: 96.0 + error_outputs: 12 + num_malformed_responses: 11 + num_with_malformed_responses: 9 + user_asks: 34 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 19 + total_tests: 225 + command: aider --model gpt-4o-2024-11-20 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 22.2 + total_cost: 7.1835 + +- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff + test_cases: 224 + model: o1-2024-12-17 (high) + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 23.7 + pass_rate_2: 61.7 + pass_num_1: 53 + pass_num_2: 139 + percent_cases_well_formed: 91.5 + error_outputs: 25 + num_malformed_responses: 24 + num_with_malformed_responses: 19 + user_asks: 16 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + total_tests: 225 + command: aider --model openrouter/openai/o1 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 133.2 + total_cost: 0.0000 + +- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff + test_cases: 225 + model: deepseek-chat + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 5.3 + pass_rate_2: 17.8 + pass_num_1: 12 + pass_num_2: 40 + percent_cases_well_formed: 92.9 + error_outputs: 42 + num_malformed_responses: 37 + num_with_malformed_responses: 16 + user_asks: 23 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 5 + test_timeouts: 5 + total_tests: 225 + command: aider --model deepseek/deepseek-chat + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 184.0 + total_cost: 0.5101 + +- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff + test_cases: 225 + model: claude-3-5-haiku-20241022 + edit_format: diff + commit_hash: a755079-dirty + pass_rate_1: 7.1 + pass_rate_2: 28.0 + pass_num_1: 16 + pass_num_2: 63 + percent_cases_well_formed: 91.1 + error_outputs: 31 + num_malformed_responses: 30 + num_with_malformed_responses: 20 + user_asks: 13 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 9 + total_tests: 225 + command: aider --model claude-3-5-haiku-20241022 + date: 2024-12-21 + versions: 0.69.2.dev + seconds_per_case: 31.8 + total_cost: 6.0583 + +- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff + test_cases: 225 + model: Qwen2.5-Coder-32B-Instruct + edit_format: diff + commit_hash: 6d7e8be-dirty + pass_rate_1: 4.4 + pass_rate_2: 8.0 + pass_num_1: 10 + pass_num_2: 18 + percent_cases_well_formed: 71.6 + error_outputs: 158 + num_malformed_responses: 148 + num_with_malformed_responses: 64 + user_asks: 132 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 2 + total_tests: 225 + command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic" + date: 2024-12-22 + versions: 0.69.2.dev + seconds_per_case: 84.4 + total_cost: 0.0000 + +- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole + test_cases: 225 + model: o1-mini-2024-09-12 + edit_format: whole + commit_hash: 37df899 + pass_rate_1: 5.8 + pass_rate_2: 32.9 + pass_num_1: 13 + pass_num_2: 74 + percent_cases_well_formed: 96.9 + error_outputs: 8 + num_malformed_responses: 8 + num_with_malformed_responses: 7 + user_asks: 27 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + total_tests: 225 + command: aider --model o1-mini + date: 2024-12-22 + versions: 0.69.2.dev + seconds_per_case: 34.7 + total_cost: 18.5770 + +- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2 + test_cases: 225 + model: gemini-exp-1206 + edit_format: whole + commit_hash: b1bc2f8 + pass_rate_1: 19.6 + pass_rate_2: 38.2 + pass_num_1: 44 + pass_num_2: 86 + percent_cases_well_formed: 98.2 + error_outputs: 8 + num_malformed_responses: 8 + num_with_malformed_responses: 4 + user_asks: 32 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 9 + total_tests: 225 + command: aider --model gemini/gemini-exp-1206 + date: 2024-12-22 + versions: 0.69.2.dev + seconds_per_case: 45.5 + total_cost: 0.0000 + +- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole + test_cases: 225 + model: gemini-2.0-flash-exp + edit_format: whole + commit_hash: b1bc2f8 + pass_rate_1: 11.6 + pass_rate_2: 22.2 + pass_num_1: 26 + pass_num_2: 50 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 9 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 1 + test_timeouts: 8 + total_tests: 225 + command: aider --model gemini/gemini-2.0-flash-exp + date: 2024-12-22 + versions: 0.69.2.dev + seconds_per_case: 12.2 + total_cost: 0.0000 \ No newline at end of file diff --git a/aider/website/_posts/2024-12-21-polyglot.md b/aider/website/_posts/2024-12-21-polyglot.md index 7b81f21fa..4b2f9bdc7 100644 --- a/aider/website/_posts/2024-12-21-polyglot.md +++ b/aider/website/_posts/2024-12-21-polyglot.md @@ -28,6 +28,13 @@ the performance of today's strongest coding models and leaves headroom for future LLMs. +{: .note :} +See the main +[aider leaderboard](https://aider.chat/docs/leaderboards/) +for benchmark results from more models. +This article only contains a snapshot +of results at the time of publication. + ## The polyglot benchmark Like aider's original code editing benchmark, @@ -171,7 +178,7 @@ on GitHub. - {% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %} + {% assign edit_sorted = site.data.o1_polyglot_leaderboard | sort: 'pass_rate_2' | reverse %} {% for row in edit_sorted %} {{ row.model }}