diff --git a/_data/refactor_leaderboard.csv b/_data/refactor_leaderboard.csv deleted file mode 100644 index 6ae17acd8..000000000 --- a/_data/refactor_leaderboard.csv +++ /dev/null @@ -1,6 +0,0 @@ -model,second,first,format,command,version,commits,date -gpt-4-turbo-2024-04-09,0,34.1,udiff,aider --gpt-4-turbo,0.27.1-dev,b75fdb9,4/9/24 -gpt-4-0125-preview,0,43.8,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,0fbd702,1/25/24 -gpt-4-1106-preview,0,57.3,udiff,aider --model gpt-4-1106-preview,0.22.1-dev,a75e7c8,1/25/24 -claude-3-opus-20240229,0,67.4,diff,aider --opus,0.31.2-dev,b02320b-dirty,5/4/24 -gemini/gemini-1.5-pro-latest,0.0,49.4,diff-fenced,aider --model gemini/gemini-1.5-pro-latest,0.31.2-dev,425cb29 1b35ca2-dirty a0649ba-dirty 3e4fca2-dirty,2024-05-04 diff --git a/_data/refactor_leaderboard.yml b/_data/refactor_leaderboard.yml new file mode 100644 index 000000000..01ae9dbf1 --- /dev/null +++ b/_data/refactor_leaderboard.yml @@ -0,0 +1,41 @@ +- dirname: 2024-05-04-23-27-02--refac-gemini + test_cases: 89 + model: gemini/gemini-1.5-pro-latest + edit_format: diff-fenced + commit_hash: a0649ba-dirty, 425cb29, 1b35ca2-dirty, 3e4fca2-dirty + pass_rate_1: 49.4 + percent_cases_well_formed: 7.9 + error_outputs: 247 + num_malformed_responses: 82 + user_asks: 0 + lazy_comments: 4 + syntax_errors: 0 + indentation_errors: 8 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gemini/gemini-1.5-pro-latest + date: 2024-05-04 + versions: 0.31.2-dev + seconds_per_case: 55.7 + total_cost: 0.0000 +- dirname: 2024-05-04-17-45-53--refac-opus + test_cases: 83 + model: openrouter/anthropic/claude-3-opus + edit_format: diff + commit_hash: b02320b-dirty + pass_rate_1: 72.3 + percent_cases_well_formed: 79.5 + error_outputs: 51 + num_malformed_responses: 17 + user_asks: 0 + lazy_comments: 2 + syntax_errors: 1 + indentation_errors: 3 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model openrouter/anthropic/claude-3-opus + date: 2024-05-04 + versions: 0.31.2-dev + seconds_per_case: 67.8 + total_cost: 27.9176 + \ No newline at end of file diff --git a/docs/leaderboards/index.md b/docs/leaderboards/index.md index ad83bfaaa..4d2aed1f2 100644 --- a/docs/leaderboards/index.md +++ b/docs/leaderboards/index.md @@ -99,13 +99,13 @@ Therefore, results are available for fewer models. - {% assign refac_sorted = site.data.refactor_leaderboard | sort: 'first' | reverse %} + {% assign refac_sorted = site.data.refactor_leaderboard | sort: 'pass_rate_1' | reverse %} {% for row in refac_sorted %} {{ row.model }} - {{ row.first }}% + {{ row.pass_rate_1 }}% {{ row.command }} - {{ row.format }} + {{ row.edit_format }} {% endfor %} @@ -129,7 +129,7 @@ Therefore, results are available for fewer models. {% for row in refac_sorted %} leaderboardData.labels.push('{{ row.model }}'); - leaderboardData.datasets[0].data.push({{ row.first }}); + leaderboardData.datasets[0].data.push({{ row.pass_rate_1 }}); {% endfor %} var leaderboardChart = new Chart(ctx, {