Merge branch 'main' into swe-bench

This commit is contained in:
Paul Gauthier 2024-05-15 12:02:00 -07:00
commit df84bcf38b
15 changed files with 1945 additions and 8 deletions

3
.gitignore vendored
View file

@ -2,3 +2,6 @@
.aider* .aider*
aider_chat.egg-info/ aider_chat.egg-info/
build build
Gemfile.lock
_site
.jekyll-cache/

View file

@ -13,6 +13,19 @@ Please submit bug reports and feature requests as GitHub issues. This
helps us to keep track of them and discuss potential solutions or helps us to keep track of them and discuss potential solutions or
enhancements. enhancements.
LLM Benchmark Results
---------------------
Contributions of
[LLM benchmark results](https://aider.chat/docs/leaderboards/)
are welcome!
See the
[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
for information on running aider's code editing benchmarks.
Submit results by opening a PR with edits to the
[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
Pull Requests Pull Requests
------------- -------------

5
Gemfile Normal file
View file

@ -0,0 +1,5 @@
# Fetch gems from rubygems.org
source "https://rubygems.org"

# Jekyll and the Jekyll-related gems used to build the site
gem "jekyll"
gem "jekyll-theme-cayman"
gem "jekyll-redirect-from"
gem "jekyll-sitemap"

View file

@ -11,3 +11,6 @@ defaults:
type: "pages" type: "pages"
values: values:
description: "A command-line chat tool for coding with GPT" description: "A command-line chat tool for coding with GPT"
exclude:
- tmp.benchmarks

View file

@ -1,6 +1,7 @@
- dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence - dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence
test_cases: 133 test_cases: 133
model: claude-3-opus-20240229 model: claude-3-opus-20240229
released: 2024-02-29
edit_format: diff edit_format: diff
commit_hash: f4b1797-dirty, f4b1797 commit_hash: f4b1797-dirty, f4b1797
pass_rate_1: 53.4 pass_rate_1: 53.4
@ -19,9 +20,11 @@
versions: 0.30.2-dev versions: 0.30.2-dev
seconds_per_case: 32.4 seconds_per_case: 32.4
total_cost: 13.8395 total_cost: 13.8395
- dirname: 2024-03-06-16-42-00--claude3-sonnet-whole - dirname: 2024-03-06-16-42-00--claude3-sonnet-whole
test_cases: 133 test_cases: 133
model: claude-3-sonnet-20240229 model: claude-3-sonnet-20240229
released: 2024-02-29
edit_format: whole edit_format: whole
commit_hash: a5f8076-dirty commit_hash: a5f8076-dirty
pass_rate_1: 43.6 pass_rate_1: 43.6
@ -40,9 +43,11 @@
versions: 0.25.1-dev versions: 0.25.1-dev
seconds_per_case: 23.1 seconds_per_case: 23.1
total_cost: 0.0000 total_cost: 0.0000
- dirname: 2024-04-29-19-17-28--deepseek-coder-whole - dirname: 2024-04-29-19-17-28--deepseek-coder-whole
test_cases: 132 test_cases: 132
model: deepseek-coder model: deepseek-coder
released: 2024-01-25
edit_format: whole edit_format: whole
commit_hash: c07f793-dirty commit_hash: c07f793-dirty
pass_rate_1: 47.0 pass_rate_1: 47.0
@ -61,6 +66,7 @@
versions: 0.30.2-dev versions: 0.30.2-dev
seconds_per_case: 26.7 seconds_per_case: 26.7
total_cost: 0.0000 total_cost: 0.0000
- dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced - dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced
test_cases: 133 test_cases: 133
model: gemini-1.5-pro-latest model: gemini-1.5-pro-latest
@ -86,6 +92,7 @@
- dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole - dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole
test_cases: 133 test_cases: 133
model: gpt-3.5-turbo-0125 model: gpt-3.5-turbo-0125
released: 2024-01-25
edit_format: whole edit_format: whole
commit_hash: 1d55f74 commit_hash: 1d55f74
pass_rate_1: 41.4 pass_rate_1: 41.4
@ -108,6 +115,7 @@
- dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301 - dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301
test_cases: 133 test_cases: 133
model: gpt-3.5-turbo-0301 model: gpt-3.5-turbo-0301
released: 2023-03-01
edit_format: whole edit_format: whole
commit_hash: 44388db-dirty commit_hash: 44388db-dirty
pass_rate_1: 50.4 pass_rate_1: 50.4
@ -126,9 +134,11 @@
versions: 0.16.4-dev versions: 0.16.4-dev
seconds_per_case: 6.5 seconds_per_case: 6.5
total_cost: 0.4822 total_cost: 0.4822
- dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613 - dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613
test_cases: 133 test_cases: 133
model: gpt-3.5-turbo-0613 model: gpt-3.5-turbo-0613
released: 2023-06-13
edit_format: whole edit_format: whole
commit_hash: 93aa497-dirty commit_hash: 93aa497-dirty
pass_rate_1: 38.3 pass_rate_1: 38.3
@ -168,9 +178,11 @@
versions: 0.30.2-dev versions: 0.30.2-dev
seconds_per_case: 5.3 seconds_per_case: 5.3
total_cost: 0.3261 total_cost: 0.3261
- dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff - dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff
test_cases: 133 test_cases: 133
model: gpt-4-0125-preview model: gpt-4-0125-preview
released: 2024-01-25
edit_format: udiff edit_format: udiff
commit_hash: edcf9b1 commit_hash: edcf9b1
pass_rate_1: 55.6 pass_rate_1: 55.6
@ -189,9 +201,11 @@
versions: 0.22.1-dev versions: 0.22.1-dev
seconds_per_case: 44.8 seconds_per_case: 44.8
total_cost: 14.6428 total_cost: 14.6428
- dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules - dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules
test_cases: 133 test_cases: 133
model: gpt-4-0314 model: gpt-4-0314
released: 2023-03-14
edit_format: diff edit_format: diff
commit_hash: 0d43468 commit_hash: 0d43468
pass_rate_1: 50.4 pass_rate_1: 50.4
@ -210,9 +224,11 @@
versions: 0.31.2-dev versions: 0.31.2-dev
seconds_per_case: 19.8 seconds_per_case: 19.8
total_cost: 16.2689 total_cost: 16.2689
- dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main - dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main
test_cases: 133 test_cases: 133
model: gpt-4-0613 model: gpt-4-0613
released: 2023-06-13
edit_format: diff edit_format: diff
commit_hash: 3aa17c4 commit_hash: 3aa17c4
pass_rate_1: 46.6 pass_rate_1: 46.6
@ -235,6 +251,7 @@
- dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff - dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff
test_cases: 133 test_cases: 133
model: gpt-4-1106-preview model: gpt-4-1106-preview
released: 2023-11-06
edit_format: udiff edit_format: udiff
commit_hash: 87664dc commit_hash: 87664dc
pass_rate_1: 51.9 pass_rate_1: 51.9
@ -256,7 +273,8 @@
- dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples - dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples
test_cases: 133 test_cases: 133
model: gpt-4-turbo-2024-04-09 model: gpt-4-turbo-2024-04-09 (udiff)
released: 2024-04-09
edit_format: udiff edit_format: udiff
commit_hash: e610e5b-dirty commit_hash: e610e5b-dirty
pass_rate_1: 48.1 pass_rate_1: 48.1
@ -275,9 +293,11 @@
versions: 0.30.2-dev versions: 0.30.2-dev
seconds_per_case: 22.8 seconds_per_case: 22.8
total_cost: 6.3337 total_cost: 6.3337
- dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg - dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg
test_cases: 132 test_cases: 132
model: llama3-70b-8192 model: llama3-70b-8192
released: 2024-04-18
edit_format: diff edit_format: diff
commit_hash: b5bb453 commit_hash: b5bb453
pass_rate_1: 38.6 pass_rate_1: 38.6
@ -296,9 +316,11 @@
versions: 0.31.2-dev versions: 0.31.2-dev
seconds_per_case: 14.5 seconds_per_case: 14.5
total_cost: 0.4311 total_cost: 0.4311
- dirname: 2024-05-06-18-31-08--command-r-plus-whole-final - dirname: 2024-05-06-18-31-08--command-r-plus-whole-final
test_cases: 133 test_cases: 133
model: command-r-plus model: command-r-plus
released: 2024-04-04
edit_format: whole edit_format: whole
commit_hash: fc3a43e-dirty commit_hash: fc3a43e-dirty
pass_rate_1: 21.8 pass_rate_1: 21.8
@ -317,6 +339,7 @@
versions: 0.31.2-dev versions: 0.31.2-dev
seconds_per_case: 22.9 seconds_per_case: 22.9
total_cost: 2.7494 total_cost: 2.7494
- dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole - dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole
test_cases: 133 test_cases: 133
model: deepseek-chat v2 (whole) model: deepseek-chat v2 (whole)
@ -342,6 +365,7 @@
- dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2 - dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2
test_cases: 133 test_cases: 133
model: deepseek-chat v2 (diff) model: deepseek-chat v2 (diff)
released: 2024-05-06
edit_format: diff edit_format: diff
commit_hash: 80a3f6d commit_hash: 80a3f6d
pass_rate_1: 44.4 pass_rate_1: 44.4
@ -364,6 +388,7 @@
- dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole - dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole
test_cases: 133 test_cases: 133
model: qwen1.5-110b-chat model: qwen1.5-110b-chat
released: 2024-02-04
edit_format: whole edit_format: whole
commit_hash: 70b1c0c commit_hash: 70b1c0c
pass_rate_1: 30.8 pass_rate_1: 30.8
@ -382,6 +407,7 @@
versions: 0.31.2-dev versions: 0.31.2-dev
seconds_per_case: 46.9 seconds_per_case: 46.9
total_cost: 0.0000 total_cost: 0.0000
- dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole - dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole
test_cases: 133 test_cases: 133
model: WizardLM-2 8x22B model: WizardLM-2 8x22B
@ -406,7 +432,8 @@
- dirname: 2024-05-13-17-39-05--gpt-4o-diff - dirname: 2024-05-13-17-39-05--gpt-4o-diff
test_cases: 133 test_cases: 133
model: openai/gpt-4o model: gpt-4o
released: 2024-05-13
edit_format: diff edit_format: diff
commit_hash: b6cd852 commit_hash: b6cd852
pass_rate_1: 60.2 pass_rate_1: 60.2
@ -426,3 +453,25 @@
seconds_per_case: 6.0 seconds_per_case: 6.0
total_cost: 0.0000 total_cost: 0.0000
- dirname: 2024-04-12-22-18-20--gpt-4-turbo-2024-04-09-plain-diff
test_cases: 33
model: gpt-4-turbo-2024-04-09 (diff)
edit_format: diff
commit_hash: 9b2e697-dirty
pass_rate_1: 48.5
pass_rate_2: 57.6
percent_cases_well_formed: 100.0
error_outputs: 15
num_malformed_responses: 0
user_asks: 15
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4-turbo-2024-04-09
date: 2024-04-12
versions: 0.28.1-dev
seconds_per_case: 17.6
total_cost: 1.6205

View file

@ -40,7 +40,7 @@
total_cost: 27.9176 total_cost: 27.9176
- dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09 - dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09
test_cases: 88 test_cases: 88
model: gpt-4-turbo-2024-04-09 model: gpt-4-turbo-2024-04-09 (udiff)
edit_format: udiff edit_format: udiff
commit_hash: b75fdb9 commit_hash: b75fdb9
pass_rate_1: 34.1 pass_rate_1: 34.1
@ -103,7 +103,7 @@
- dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff - dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff
test_cases: 89 test_cases: 89
model: openai/gpt-4o model: gpt-4o
edit_format: diff edit_format: diff
commit_hash: b6cd852 commit_hash: b6cd852
pass_rate_1: 62.9 pass_rate_1: 62.9
@ -121,3 +121,26 @@
versions: 0.34.1-dev versions: 0.34.1-dev
seconds_per_case: 27.8 seconds_per_case: 27.8
total_cost: 0.0000 total_cost: 0.0000
- dirname: 2024-04-10-13-26-18--refac-gpt-4-turbo-2024-04-09-diff
test_cases: 88
model: gpt-4-turbo-2024-04-09 (diff)
edit_format: diff
commit_hash: 7875418
pass_rate_1: 21.4
percent_cases_well_formed: 6.8
error_outputs: 247
num_malformed_responses: 82
user_asks: 1
lazy_comments: 2
syntax_errors: 3
indentation_errors: 8
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4-turbo-2024-04-09
date: 2024-04-10
versions: 0.28.1-dev
seconds_per_case: 67.8
total_cost: 20.4889

View file

@ -67,7 +67,7 @@ def get_parser(default_config_files, git_root):
const=gpt_4_model, const=gpt_4_model,
help=f"Use {gpt_4_model} model for the main chat", help=f"Use {gpt_4_model} model for the main chat",
) )
gpt_4o_model = "openai/gpt-4o" gpt_4o_model = "gpt-4o"
group.add_argument( group.add_argument(
"--4o", "--4o",
action="store_const", action="store_const",

View file

@ -11,7 +11,7 @@ from PIL import Image
from aider.dump import dump # noqa: F401 from aider.dump import dump # noqa: F401
from aider.litellm import litellm from aider.litellm import litellm
DEFAULT_MODEL_NAME = "openai/gpt-4o" DEFAULT_MODEL_NAME = "gpt-4o"
@dataclass @dataclass
@ -94,6 +94,16 @@ MODEL_SETTINGS = [
lazy=True, lazy=True,
reminder_as_sys_msg=True, reminder_as_sys_msg=True,
), ),
ModelSettings(
"gpt-4o",
"diff",
weak_model_name="gpt-3.5-turbo",
use_repo_map=True,
send_undo_reply=True,
accepts_images=True,
lazy=True,
reminder_as_sys_msg=True,
),
ModelSettings( ModelSettings(
"gpt-4-0125-preview", "gpt-4-0125-preview",
"udiff", "udiff",

1742
assets/models-over-time.svg Normal file

File diff suppressed because it is too large Load diff

After

Width:  |  Height:  |  Size: 53 KiB

57
benchmark/over_time.py Normal file
View file

@ -0,0 +1,57 @@
import matplotlib.pyplot as plt
import yaml
from imgcat import imgcat
from matplotlib import rc
def plot_over_time(yaml_file):
    """Plot benchmark pass rate against model release date.

    Loads the entries from ``yaml_file`` and keeps those that have both a
    ``released`` and a ``pass_rate_2`` key.  Each kept entry becomes one
    labelled point on a scatter plot.  The figure is written to
    ``tmp_over_time.png`` and ``tmp_over_time.svg`` in the current working
    directory and then handed to ``imgcat``.

    Args:
        yaml_file: Path of the YAML file to read.
    """
    with open(yaml_file, "r") as fh:
        entries = yaml.safe_load(fh)

    # One (release date, pass rate, label) triple per plottable entry.
    points = [
        (
            item["released"],
            item["pass_rate_2"],
            # Keep only the text before the first "(" as the label,
            # e.g. "some-model (diff)" -> "some-model".
            item["model"].split("(")[0].strip(),
        )
        for item in entries
        if "released" in item and "pass_rate_2" in item
    ]
    dates = [point[0] for point in points]
    pass_rates = [point[1] for point in points]
    models = [point[2] for point in points]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    font_settings = {"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10}
    rc("font", **font_settings)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.grid(axis="y", zorder=0, lw=0.2)
    for border in ax.spines.values():
        border.set_edgecolor("#DDDDDD")
        border.set_linewidth(0.5)

    def pick_color(name):
        # "gpt-4" is checked first, then "gpt-3.5"; everything else is blue.
        if "gpt-4" in name:
            return "red"
        if "gpt-3.5" in name:
            return "green"
        return "blue"

    ax.scatter(
        dates,
        pass_rates,
        c=[pick_color(name) for name in models],
        alpha=0.5,
        s=120,
    )

    # Label every point with its model name, nudged slightly up and right.
    for label, released, rate in zip(models, dates, pass_rates):
        ax.annotate(
            label,
            (released, rate),
            fontsize=12,
            alpha=0.75,
            xytext=(5, 5),
            textcoords="offset points",
        )

    ax.set_xlabel("Model release date", fontsize=18)
    ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18)
    ax.set_title("LLM code editing skill by model release date", fontsize=20)

    plt.tight_layout()
    for out_path in ("tmp_over_time.png", "tmp_over_time.svg"):
        plt.savefig(out_path)
    imgcat(fig)
# Example usage: build the plot only when this file is run as a script, so
# that importing the module does not read the data file or write image files.
if __name__ == "__main__":
    plot_over_time("_data/edit_leaderboard.yml")

View file

@ -185,6 +185,10 @@ Therefore, results are available for fewer models.
</script> </script>
## LLM code editing skill by model release date
[![LLM code editing skill by model release date](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg)
## Notes on benchmarking results ## Notes on benchmarking results

View file

@ -1,4 +1,4 @@
[pytest] [pytest]
norecursedirs = tmp.* build benchmark norecursedirs = tmp.* build benchmark _site OLD
addopts = -p no:warnings addopts = -p no:warnings

20
scripts/Dockerfile.jekyll Normal file
View file

@ -0,0 +1,20 @@
# Base image: the official Jekyll image from Docker Hub
FROM jekyll/jekyll:latest

# Run all following instructions from the site directory
WORKDIR /srv/jekyll

# Copy the build context into the working directory (/srv/jekyll)
COPY . .

# Install the gems listed in the Gemfile
RUN bundle install

# The Jekyll server listens on port 4000
EXPOSE 4000

# Health check: probe the served site with curl
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s \
    CMD curl -f http://localhost:4000 || exit 1

# Default command: serve the site on all interfaces, port 4000
CMD ["jekyll", "serve", "--host", "0.0.0.0", "--port", "4000", "--verbose"]

4
scripts/jekyll_build.sh Executable file
View file

@ -0,0 +1,4 @@
#!/bin/bash

# Build the image tagged my-jekyll-site from scripts/Dockerfile.jekyll,
# using the current directory as the build context
docker build --tag my-jekyll-site --file scripts/Dockerfile.jekyll .

4
scripts/jekyll_run.sh Executable file
View file

@ -0,0 +1,4 @@
#!/bin/bash

# Start an interactive bash shell in the my-jekyll-site image, with the
# current directory mounted at /srv/jekyll and container port 4000 published
# on host port 4000.
# --network="host" is intentionally not used: Docker discards -p port
# mappings in host network mode, so the two options contradict each other.
docker run --rm -v "$PWD:/srv/jekyll" -p 4000:4000 --entrypoint /bin/bash -it my-jekyll-site