diff --git a/aider/website/_data/edit_leaderboard.yml b/aider/website/_data/edit_leaderboard.yml
index a2dd8ac9a..c567300d1 100644
--- a/aider/website/_data/edit_leaderboard.yml
+++ b/aider/website/_data/edit_leaderboard.yml
@@ -2184,7 +2184,7 @@
 
 - dirname: 2024-12-18-01-50-08--o1
   test_cases: 133
-  model: openrouter/openai/o1
+  model: o1
   edit_format: diff
   commit_hash: 074c636-dirty
   pass_rate_1: 65.4
diff --git a/aider/website/_data/polyglot_leaderboard.yml b/aider/website/_data/polyglot_leaderboard.yml
new file mode 100644
index 000000000..9bc818778
--- /dev/null
+++ b/aider/website/_data/polyglot_leaderboard.yml
@@ -0,0 +1,155 @@
+- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
+  test_cases: 225
+  model: gpt-4o-mini-2024-07-18
+  edit_format: whole
+  commit_hash: a755079-dirty
+  pass_rate_1: 0.9
+  pass_rate_2: 3.6
+  pass_num_1: 2
+  pass_num_2: 8
+  percent_cases_well_formed: 100.0
+  error_outputs: 0
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 36
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 3
+  total_tests: 225
+  command: aider --model gpt-4o-mini-2024-07-18
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 17.3
+  total_cost: 0.3236
+
+- dirname: 2024-12-21-18-44-28--polyglot-sonnet
+  test_cases: 225
+  model: claude-3-5-sonnet-20241022
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 18.7
+  pass_rate_2: 45.3
+  pass_num_1: 42
+  pass_num_2: 102
+  percent_cases_well_formed: 100.0
+  error_outputs: 1
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 14
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 12
+  total_tests: 225
+  command: aider --model claude-3-5-sonnet-20241022
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 30.8
+  total_cost: 13.4847
+
+- dirname: 2024-12-21-18-52-34--polyglot-gpt-4o-diff
+  test_cases: 225
+  model: gpt-4o-2024-11-20
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 4.9
+  pass_rate_2: 15.1
+  pass_num_1: 11
+  pass_num_2: 34
+  percent_cases_well_formed: 96.0
+  error_outputs: 12
+  num_malformed_responses: 11
+  num_with_malformed_responses: 9
+  user_asks: 34
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 19
+  total_tests: 225
+  command: aider --model gpt-4o-2024-11-20
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 22.2
+  total_cost: 7.1835
+
+- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
+  test_cases: 224
+  model: o1-2024-12-17
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 23.7
+  pass_rate_2: 61.7
+  pass_num_1: 53
+  pass_num_2: 139
+  percent_cases_well_formed: 91.5
+  error_outputs: 25
+  num_malformed_responses: 24
+  num_with_malformed_responses: 19
+  user_asks: 16
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 2
+  total_tests: 225
+  command: aider --model openrouter/openai/o1
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 133.2
+  total_cost: 0.0000
+
+- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
+  test_cases: 225
+  model: deepseek-chat
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 5.3
+  pass_rate_2: 17.8
+  pass_num_1: 12
+  pass_num_2: 40
+  percent_cases_well_formed: 92.9
+  error_outputs: 42
+  num_malformed_responses: 37
+  num_with_malformed_responses: 16
+  user_asks: 23
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 5
+  test_timeouts: 5
+  total_tests: 225
+  command: aider --model deepseek/deepseek-chat
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 184.0
+  total_cost: 0.5101
+
+- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
+  test_cases: 225
+  model: claude-3-5-haiku-20241022
+  edit_format: diff
+  commit_hash: a755079-dirty
+  pass_rate_1: 7.1
+  pass_rate_2: 28.0
+  pass_num_1: 16
+  pass_num_2: 63
+  percent_cases_well_formed: 91.1
+  error_outputs: 31
+  num_malformed_responses: 30
+  num_with_malformed_responses: 20
+  user_asks: 13
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 1
+  test_timeouts: 9
+  total_tests: 225
+  command: aider --model claude-3-5-haiku-20241022
+  date: 2024-12-21
+  versions: 0.69.2.dev
+  seconds_per_case: 31.8
+  total_cost: 6.0583
\ No newline at end of file
diff --git a/aider/website/docs/leaderboards/index.md b/aider/website/docs/leaderboards/index.md
index f9824889d..75747bdc6 100644
--- a/aider/website/docs/leaderboards/index.md
+++ b/aider/website/docs/leaderboards/index.md
@@ -18,11 +18,27 @@ The leaderboards report the results from a number of popular LLMs.
 While [aider can connect to almost any LLM](/docs/llms.html),
 it works best with models that score well on the benchmarks.
 
-## Code editing leaderboard
-
-[Aider's code editing benchmark](/docs/benchmarks.html#the-benchmark) asks the LLM to edit python source files to complete 133 small coding exercises
+{: .note :}
+The old
+[aider code editing leaderboard](edit.html)
+has been replaced by this
+new, much more challenging
+[polyglot leaderboard]().
+
+## Polyglot leaderboard
+
+[Aider's polyglot benchmark](/docs/benchmarks.html#the-benchmark)
+asks the LLM to edit source files to complete 225 coding exercises
 from Exercism.
-This measures the LLM's coding ability, and whether it can
+It contains exercises in many popular programming languages:
+C++, Go, Java, JavaScript, Python and Rust.
+The 225 exercises were purposely selected to be the *hardest*
+that Exercism offered in those languages, to provide
+a strong coding challenge to LLMs.
+
+This benchmark measures the LLM's coding ability in popular languages,
+and whether it can
 write new code that integrates into existing code.
 The model also has to successfully apply all its changes to the source file without human intervention.
 
@@ -39,7 +55,7 @@ The model also has to successfully apply all its changes to the source file with
-    {% assign edit_sorted = site.data.edit_leaderboard | sort: 'pass_rate_2' | reverse %}
+    {% assign edit_sorted = site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse %}
     {% for row in edit_sorted %}
diff --git a/aider/website/docs/leaderboards/refactor.md b/aider/website/docs/leaderboards/refactor.md
index 9c2c6a06d..f78941428 100644
--- a/aider/website/docs/leaderboards/refactor.md
+++ b/aider/website/docs/leaderboards/refactor.md
@@ -6,7 +6,7 @@ description: Quantitative benchmark of LLM code refactoring skill.
 ---
 
-## Aider refactoring leaderboard
+## Refactoring leaderboard
 
 [Aider's refactoring benchmark](https://github.com/Aider-AI/refactor-benchmark) asks the LLM to refactor 89 large methods from large python classes.
 
 This is a more challenging benchmark, which tests the model's ability to output long chunks of code without skipping sections or making mistakes.
 It was developed to provoke and measure [GPT-4 Turbo's "lazy coding" habit](/2023/12/21/unified-diffs.html).
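For reference, the docs page consumes the new data file through the Liquid sort changed above (`site.data.polyglot_leaderboard | sort: 'pass_rate_2' | reverse`). A minimal Python sketch of the same ordering, assuming PyYAML is installed and the script is run from the repo root, might look like:

```python
# Sketch only: mirrors the Liquid sort used on the leaderboard page.
# Assumes PyYAML (`pip install pyyaml`) and a checkout containing
# aider/website/_data/polyglot_leaderboard.yml (added in this diff).
import yaml

with open("aider/website/_data/polyglot_leaderboard.yml") as f:
    rows = yaml.safe_load(f)  # top-level YAML list -> list of dicts

# Highest pass_rate_2 first, matching the rendered table.
for row in sorted(rows, key=lambda r: r["pass_rate_2"], reverse=True):
    print(f"{row['model']:30}  {row['pass_rate_2']:5.1f}%  "
          f"{row['percent_cases_well_formed']:5.1f}% well formed  "
          f"${row['total_cost']:.2f}")
```

This is only an illustration of how the new data file is shaped and consumed; the actual rendering is the Liquid template in `aider/website/docs/leaderboards/index.md`.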