diff --git a/aider/models.py b/aider/models.py index f51b38bc4..6470727d0 100644 --- a/aider/models.py +++ b/aider/models.py @@ -178,6 +178,18 @@ MODEL_SETTINGS = [ "whole", weak_model_name="claude-3-haiku-20240307", ), + ModelSettings( + "anthropic/claude-3.5-sonnet", + "diff", + weak_model_name="claude-3-haiku-20240307", + use_repo_map=True, + ), + ModelSettings( + "openrouter/anthropic/claude-3.5-sonnet", + "diff", + weak_model_name="openrouter/anthropic/claude-3-haiku-20240307", + use_repo_map=True, + ), # Cohere ModelSettings( "command-r-plus", diff --git a/website/_data/edit_leaderboard.yml b/website/_data/edit_leaderboard.yml index 82119110a..240e017fd 100644 --- a/website/_data/edit_leaderboard.yml +++ b/website/_data/edit_leaderboard.yml @@ -611,4 +611,51 @@ date: 2024-06-08 versions: 0.37.1-dev seconds_per_case: 280.6 - total_cost: 0.0000 \ No newline at end of file + total_cost: 0.0000 + +- dirname: 2024-06-20-15-09-26--claude-3.5-sonnet-whole + test_cases: 133 + model: claude-3.5-sonnet (whole) + edit_format: whole + commit_hash: 068609e + pass_rate_1: 61.7 + pass_rate_2: 78.2 + percent_cases_well_formed: 100.0 + error_outputs: 4 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 2 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model openrouter/anthropic/claude-3.5-sonnet + date: 2024-06-20 + versions: 0.38.1-dev + seconds_per_case: 15.4 + total_cost: 0.0000 + +- dirname: 2024-06-20-15-16-41--claude-3.5-sonnet-diff + test_cases: 133 + model: openrouter/anthropic/claude-3.5-sonnet + edit_format: diff + commit_hash: 068609e-dirty + pass_rate_1: 57.9 + pass_rate_2: 74.4 + percent_cases_well_formed: 97.0 + error_outputs: 48 + num_malformed_responses: 11 + num_with_malformed_responses: 4 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model 
openrouter/anthropic/claude-3.5-sonnet + date: 2024-06-20 + versions: 0.38.1-dev + seconds_per_case: 21.6 + total_cost: 0.0000 + \ No newline at end of file diff --git a/website/docs/leaderboards/index.md b/website/docs/leaderboards/index.md index 4e3a2a59a..bc6d72c12 100644 --- a/website/docs/leaderboards/index.md +++ b/website/docs/leaderboards/index.md @@ -16,22 +16,16 @@ While [aider can connect to almost any LLM](/docs/llms.html), it works best with models that score well on the benchmarks. -## DeepSeek Coder V2 beats GPT-4o, Opus +## Claude 3.5 Sonnet takes the top spot -The new -[DeepSeek Coder V2](https://aider.chat/docs/llms/deepseek.html) -model is now atop aider's code editing leaderboard! - -It's worth noting that DeepSeek Coder V2 is only capable of using aider's "whole" edit format. -This means it returns a modified full copy of each file when it makes changes. -Most other strong models are able to use aider's "diff" editing format, -which allows them to return diffs of edits -- saving time and token costs. - -Models which use the "whole" edit format can only edit files -which fit within their output token limits. -These output limits are often as low as 4k tokens, even for models -with very large context windows. +Claude 3.5 Sonnet is now the top-ranked model on aider's code editing leaderboard. +DeepSeek Coder V2 had taken the #1 spot only 4 days earlier. +Sonnet ranked #1 when using the "whole" editing format, +but it also scored very well with +aider's "diff" editing format. +This format allows it to return code changes as diffs -- saving time and token costs, +and making it practical to work with larger source files. ## Code editing leaderboard