From e524dd92033d9049af06c9acf0797c17735dc8df Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Sat, 4 May 2024 11:05:32 -0700
Subject: [PATCH] added refac leaderboard

---
 .../{leaderboard.csv => edit_leaderboard.csv} |  0
 _data/refactor_leaderboard.csv                | 16 ++++
 docs/leaderboard.md                           | 85 +++++++++++++++++--
 3 files changed, 92 insertions(+), 9 deletions(-)
 rename _data/{leaderboard.csv => edit_leaderboard.csv} (100%)
 create mode 100644 _data/refactor_leaderboard.csv

diff --git a/_data/leaderboard.csv b/_data/edit_leaderboard.csv
similarity index 100%
rename from _data/leaderboard.csv
rename to _data/edit_leaderboard.csv
diff --git a/_data/refactor_leaderboard.csv b/_data/refactor_leaderboard.csv
new file mode 100644
index 000000000..a9ff46996
--- /dev/null
+++ b/_data/refactor_leaderboard.csv
@@ -0,0 +1,16 @@
+model,second,first,format,command,version,commits,date
+gpt-4-turbo-2024-04-09,0,34.1,udiff,aider --gpt-4-turbo,0.27.1-dev,b75fdb9,4/9/24
+gpt-4-0125-preview,0,43.8,udiff,aider --model gpt-4-0125-preview,0.22.1-dev,0fbd702,1/25/24
+gpt-4-1106-preview,0,57.3,udiff,aider --model gpt-4-1106-preview,0.22.1-dev,a75e7c8,1/25/24
+openrouter/anthropic/claude-3-opus,0,67.4,diff,aider --model openrouter/anthropic/claude-3-opus,0.31.2-dev,b02320b-dirty,5/4/24
+,,,,,,,
+,,,,,,,
+,,,,,,,
+,,,,,,,
+,,,,,,,
+,,,,,,,
+,,,,,,,
+,,,,,,,
+,,,,,,,
+,,,,,,,
+,,,,,,,
\ No newline at end of file
diff --git a/docs/leaderboard.md b/docs/leaderboard.md
index 4fb26775e..aa0a0cb01 100644
--- a/docs/leaderboard.md
+++ b/docs/leaderboard.md
@@ -1,18 +1,21 @@
-# Aider's LLM leaderboard
+# Aider's LLM leaderboards
 
 Aider works best with LLMs which are good at *editing* code, not just good at
 writing code.
 Aider works with the LLM to make changes to the existing code in your git repo,
 so the LLM needs to be capable of reliably specifying how to edit code.
 
-Aider uses a
-[code editing benchmark](https://aider.chat/docs/benchmarks.html#the-benchmark)
-to measure an LLM's code editing ability.
-This table reports the results from a number of popular LLMs,
+Aider uses two benchmarks
+to measure an LLM's code editing ability:
+
+- The [code editing benchmark](https://aider.chat/docs/benchmarks.html#the-benchmark) asks the LLM to edit python source files to complete 133 Exercism exercises.
+- The [refactoring benchmark](https://github.com/paul-gauthier/refactor-benchmark) asks the LLM to refactor large methods from a large python source file. This is a more challenging benchmark, which tests the model's ability to output long chunks of code without skipping sections.
+
+These leaderboards report the results from a number of popular LLMs,
 to help users select which models to use with aider.
 While [aider can connect to almost any LLM](https://aider.chat/docs/llms.html)
-it will work best with models that score well on the code editing benchmark.
+it will work best with models that score well on the benchmarks.
 
 ## Code editing leaderboard
 
@@ -26,7 +29,7 @@ it will work best with models that score well on the code editing benchmark.
     </tr>
   </thead>
   <tbody>
-    {% assign sorted = site.data.leaderboard | sort: 'second' | reverse %}
+    {% assign sorted = site.data.edit_leaderboard | sort: 'second' | reverse %}
     {% for row in sorted %}
       <tr>
         <td>{{ row.model }}</td>
@@ -46,7 +49,7 @@ it will work best with models that score well on the code editing benchmark.
   var leaderboardData = {
     labels: [],
     datasets: [{
-      label: 'Percent correct',
+      label: 'Percent correct on code editing tasks',
       data: [],
       backgroundColor: 'rgba(54, 162, 235, 0.2)',
       borderColor: 'rgba(54, 162, 235, 1)',
@@ -78,9 +81,73 @@ it will work best with models that score well on the code editing benchmark.
   });
 </script>
 
+## Code refactoring leaderboard
+
+<table>
+  <thead>
+    <tr>
+      <th>Model</th>
+      <th>Percent correct</th>
+      <th>Command</th>
+      <th>Edit format</th>
+    </tr>
+  </thead>
+  <tbody>
+    {% assign sorted = site.data.refactor_leaderboard | sort: 'first' | reverse %}
+    {% for row in sorted %}
+      <tr>
+        <td>{{ row.model }}</td>
+        <td>{{ row.first }}%</td>
+        <td>{{ row.command }}</td>
+        <td>{{ row.format }}</td>
+      </tr>
+    {% endfor %}
+  </tbody>
+</table>
+
-## Edit format
+
+## Notes on the edit format
 
 Aider uses different "edit formats" to collect code edits from different LLMs.
 The "whole" format is the easiest for an LLM to use, but it uses a lot of tokens
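A note on the mechanics of this change: Jekyll exposes every file in `_data/` under the `site.data` namespace, keyed by filename, which is why the rename from `_data/leaderboard.csv` to `_data/edit_leaderboard.csv` must land in the same commit as the Liquid change from `site.data.leaderboard` to `site.data.edit_leaderboard`. Below is a minimal sketch of how the new refactor table consumes the CSV; the `assign` line is taken from the patch, while the plain-text output line and the nil-field guard are illustrative assumptions, not lines from the patch:

```liquid
{% comment %}
  Illustrative sketch, not part of the patch: Jekyll maps
  _data/refactor_leaderboard.csv to site.data.refactor_leaderboard,
  one hash per CSV row, keyed by the header line
  (model, second, first, format, command, version, commits, date).
{% endcomment %}
{% assign sorted = site.data.refactor_leaderboard | sort: 'first' | reverse %}
{% for row in sorted %}
  {% if row.model %}
    {{ row.model }}: {{ row.first }}% correct using the {{ row.format }} edit format
  {% endif %}
{% endfor %}
```

The `{% if row.model %}` guard (an assumption here) skips the blank placeholder rows in the CSV, which parse with nil fields and are falsy in Liquid; the patch's own table has no such guard, so those rows presumably render as empty `<tr>` elements.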