diff --git a/_data/edit_leaderboard.csv b/_data/edit_leaderboard.csv deleted file mode 100644 index b5bce1c11..000000000 --- a/_data/edit_leaderboard.csv +++ /dev/null @@ -1,16 +0,0 @@ -model,second,first,format,command,version,commits,date -claude-3-opus-20240229,68.4,53.4,diff,aider --opus,0.30.1,f4b1797,5/2/24 -claude-3-sonnet-20240229,54.9,43.6,whole,aider --sonnet,0.25.0,a5f8076,3/6/24 -Command-R+,29.3,22.6,whole,aider --model command-r-plus,0.28.0,a06c927,4/20/24 -Deepseek Coder,54.5,47,whole,aider --model openai/deepseek-coder,0.30.1,c07f793,4/29/24 -gemini-1.5-pro-latest,57.1,45.9,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.32.0,5d32dd7,5/3/24 -gpt-3.5-turbo-0125,49.6,39.8,whole,aider -3,0.22.0,da14474,2/2/24 -gpt-3.5-turbo-0301,57.9,50.4,whole,aider --model gpt-3.5-turbo-0301,0.16.4-dev,44388db-dirty,11/6/23 -gpt-3.5-turbo-0613,50.4,38.3,whole,aider --model gpt-3.5-turbo-0613,0.16.4-dev,93aa497-dirty,11/7/23 -gpt-3.5-turbo-1106,56.1,45.5,whole,aider --model gpt-3.5-turbo-1106,0.30.1,7b14d77,4/30/24 -gpt-4-0125-preview,66.2,55.6,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,edcf9b1,1/25/24 -gpt-4-0314,66.2,50.4,diff,aider --model gpt-4-0314,0.31.2-dev,0d43468,5/4/24 -gpt-4-0613,67.7,46.6,diff,aider -4,0.18.1,3aa17c4,12/16/23 -gpt-4-1106-preview,63.2,57.1,udiff,aider,0.31.2-dev,1981105-dirty,5/4/24 -gpt-4-turbo-2024-04-09,64.4,49.2,diff,aider --gpt-4-turbo,0.30.1,e610e5b,5/1/24 -Llama3 70B,49.2,38.6,diff,aider --model groq/llama3-70b-8192,0.32.0,b5bb453,5/3/24 \ No newline at end of file diff --git a/_data/edit_leaderboard.yml b/_data/edit_leaderboard.yml new file mode 100644 index 000000000..0890702c8 --- /dev/null +++ b/_data/edit_leaderboard.yml @@ -0,0 +1,316 @@ +- dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence + test_cases: 133 + model: openrouter/anthropic/claude-3-opus, claude-3-opus-20240229 + edit_format: diff + commit_hash: f4b1797-dirty, f4b1797 + pass_rate_1: 53.4 + pass_rate_2: 68.4 + percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --opus + date: 2024-05-01 + versions: 0.30.2-dev + seconds_per_case: 32.4 + total_cost: 13.8395 +- dirname: 2024-03-06-16-42-00--claude3-sonnet-whole + test_cases: 133 + model: claude-3-sonnet-20240229 + edit_format: whole + commit_hash: a5f8076-dirty + pass_rate_1: 43.6 + pass_rate_2: 54.9 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + user_asks: 1 + lazy_comments: 1 + syntax_errors: 2 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 7 + command: aider --sonnet + date: 2024-03-06 + versions: 0.25.1-dev + seconds_per_case: 23.1 + total_cost: 0.0000 +- dirname: 2024-04-29-19-17-28--deepseek-coder-whole + test_cases: 132 + model: openai/deepseek-coder + edit_format: whole + commit_hash: c07f793-dirty + pass_rate_1: 47.0 + pass_rate_2: 54.5 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + user_asks: 0 + lazy_comments: 2 + syntax_errors: 13 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model openai/deepseek-coder + date: 2024-04-29 + versions: 0.30.2-dev + seconds_per_case: 26.7 + total_cost: 0.0000 +- dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced + test_cases: 133 + model: gemini/gemini-1.5-pro-latest + edit_format: diff-fenced + commit_hash: 3a48dfb, 5d32dd7 + pass_rate_1: 45.9 + pass_rate_2: 57.1 + percent_cases_well_formed: 87.2 + error_outputs: 60 + num_malformed_responses: 17 + user_asks: 3 + lazy_comments: 0 + syntax_errors: 8 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model gemini/gemini-1.5-pro-latest + date: 2024-05-03 + versions: 0.31.2-dev + seconds_per_case: 21.3 + total_cost: 0.0000 +- dirname: 2024-02-02-02-07-28--exercism-gpt-3.5-turbo-0125-whole + test_cases: 133 + model: gpt-3.5-turbo-0125 + edit_format: whole + commit_hash: da14474 + pass_rate_1: 39.8 + pass_rate_2: 49.6 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 3 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 4 + command: aider --model gpt-3.5-turbo-0125 + date: 2024-02-02 + versions: 0.22.1-dev + seconds_per_case: 3.2 + total_cost: 0.4701 +- dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301 + test_cases: 133 + model: gpt-3.5-turbo-0301 + edit_format: whole + commit_hash: 44388db-dirty + pass_rate_1: 50.4 + pass_rate_2: 57.9 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + user_asks: 1 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 8 + command: aider --model gpt-3.5-turbo-0301 + date: 2023-11-06 + versions: 0.16.4-dev + seconds_per_case: 6.5 + total_cost: 0.4822 +- dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613 + test_cases: 133 + model: gpt-3.5-turbo-0613 + edit_format: whole + commit_hash: 93aa497-dirty + pass_rate_1: 38.3 + pass_rate_2: 50.4 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + user_asks: 1 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 5 + command: aider --model gpt-3.5-turbo-0613 + date: 2023-11-07 + versions: 0.16.4-dev + seconds_per_case: 18.0 + total_cost: 0.5366 +- dirname: 2024-04-30-21-40-51--litellm-gpt-3.5-turbo-1106-again + test_cases: 132 + model: gpt-3.5-turbo-1106 + edit_format: whole + commit_hash: 7b14d77 + pass_rate_1: 45.5 + pass_rate_2: 56.1 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + user_asks: 1 + lazy_comments: 0 + syntax_errors: 19 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-3.5-turbo-1106 + date: 2024-04-30 + versions: 0.30.2-dev + seconds_per_case: 5.3 + total_cost: 0.3261 +- dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff + test_cases: 133 + model: gpt-4-0125-preview + edit_format: udiff + commit_hash: edcf9b1 + pass_rate_1: 55.6 + pass_rate_2: 66.2 + percent_cases_well_formed: 97.7 + error_outputs: 6 + num_malformed_responses: 3 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 3 + indentation_errors: 7 + exhausted_context_windows: 0 + test_timeouts: 4 + command: aider --model gpt-4-0125-preview + date: 2024-01-25 + versions: 0.22.1-dev + seconds_per_case: 44.8 + total_cost: 14.6428 +- dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules + test_cases: 133 + model: gpt-4-0314 + edit_format: diff + commit_hash: 0d43468 + pass_rate_1: 50.4 + pass_rate_2: 66.2 + percent_cases_well_formed: 93.2 + error_outputs: 28 + num_malformed_responses: 9 + user_asks: 1 + lazy_comments: 3 + syntax_errors: 9 + indentation_errors: 7 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model gpt-4-0314 + date: 2024-05-04 + versions: 0.31.2-dev + seconds_per_case: 19.8 + total_cost: 16.2689 +- dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main + test_cases: 133 + model: gpt-4-0613 + edit_format: diff + commit_hash: 3aa17c4 + pass_rate_1: 46.6 + pass_rate_2: 67.7 + percent_cases_well_formed: 100.0 + error_outputs: 14 + num_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model gpt-4-0613 + date: 2023-12-16 + versions: 0.18.2-dev + seconds_per_case: 33.6 + total_cost: 17.4657 +- dirname: 2024-05-04-14-33-15--redo-gpt-4-1106-preview-udiff5 + test_cases: 133 + model: gpt-4-1106-preview + edit_format: udiff + commit_hash: 1981105-dirty + pass_rate_1: 57.1 + pass_rate_2: 63.2 + percent_cases_well_formed: 94.0 + error_outputs: 24 + num_malformed_responses: 8 + user_asks: 0 + lazy_comments: 7 + syntax_errors: 3 + indentation_errors: 5 + exhausted_context_windows: 0 + test_timeouts: 2 + command: aider --model gpt-4-1106-preview + date: 2024-05-04 + versions: 0.31.2-dev + seconds_per_case: 15.6 + total_cost: 5.9468 +- dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples + test_cases: 133 + model: gpt-4-turbo + edit_format: udiff + commit_hash: e610e5b-dirty + pass_rate_1: 48.1 + pass_rate_2: 63.9 + percent_cases_well_formed: 97.0 + error_outputs: 12 + num_malformed_responses: 4 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 4 + indentation_errors: 2 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model gpt-4-turbo + date: 2024-05-01 + versions: 0.30.2-dev + seconds_per_case: 22.8 + total_cost: 6.3337 +- dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg + test_cases: 132 + model: openrouter/meta-llama/llama-3-70b-instruct + edit_format: diff + commit_hash: b5bb453 + pass_rate_1: 38.6 + pass_rate_2: 49.2 + percent_cases_well_formed: 73.5 + error_outputs: 105 + num_malformed_responses: 35 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 2 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model openrouter/meta-llama/llama-3-70b-instruct + date: 2024-05-03 + versions: 0.31.2-dev + seconds_per_case: 14.5 + total_cost: 0.4311 +- dirname: 2024-05-06-18-31-08--command-r-plus-whole-final + test_cases: 133 + model: command-r-plus + edit_format: whole + commit_hash: fc3a43e-dirty + pass_rate_1: 21.8 + pass_rate_2: 31.6 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + user_asks: 0 + lazy_comments: 1 + syntax_errors: 5 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 7 + command: aider --model command-r-plus + date: 2024-05-06 + versions: 0.31.2-dev + seconds_per_case: 22.9 + total_cost: 2.7494 + \ No newline at end of file diff --git a/docs/leaderboards/index.md b/docs/leaderboards/index.md index 4d2aed1f2..8340def79 100644 --- a/docs/leaderboards/index.md +++ b/docs/leaderboards/index.md @@ -31,13 +31,13 @@ it will work best with models that score well on the benchmarks. - {% assign edit_sorted = site.data.edit_leaderboard | sort: 'second' | reverse %} + {% assign edit_sorted = site.data.edit_leaderboard | sort: 'pass_rate_2' | reverse %} {% for row in edit_sorted %} {{ row.model }} - {{ row.second }}% + {{ row.pass_rate_2 }}% {{ row.command }} - {{ row.format }} + {{ row.edit_format }} {% endfor %} @@ -61,7 +61,7 @@ it will work best with models that score well on the benchmarks. {% for row in edit_sorted %} leaderboardData.labels.push('{{ row.model }}'); - leaderboardData.datasets[0].data.push({{ row.second }}); + leaderboardData.datasets[0].data.push({{ row.pass_rate_2 }}); {% endfor %} var leaderboardChart = new Chart(ctx, {