diff --git a/.gitignore b/.gitignore index 60a75225c..846614ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ .aider* aider_chat.egg-info/ build +Gemfile.lock +_site +.jekyll-cache/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2e71fe622..640b25c33 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,6 +13,19 @@ Please submit bug reports and feature requests as GitHub issues. This helps us to keep track of them and discuss potential solutions or enhancements. +LLM Benchmark Results +--------------------- + +Contributions of +[LLM benchmark results](https://aider.chat/docs/leaderboards/) +are welcome! +See the +[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md) +for information on running aider's code editing benchmarks. +Submit results by opening a PR with edits to the +[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/). + + Pull Requests ------------- diff --git a/Gemfile b/Gemfile new file mode 100644 index 000000000..4bbe4a2ef --- /dev/null +++ b/Gemfile @@ -0,0 +1,5 @@ +source 'https://rubygems.org' +gem 'jekyll' +gem 'jekyll-theme-cayman' +gem 'jekyll-redirect-from' +gem 'jekyll-sitemap' diff --git a/_config.yml b/_config.yml index 3779e1384..6aad2a4d0 100644 --- a/_config.yml +++ b/_config.yml @@ -11,3 +11,6 @@ defaults: type: "pages" values: description: "A command-line chat tool for coding with GPT" + +exclude: + - tmp.benchmarks diff --git a/_data/edit_leaderboard.yml b/_data/edit_leaderboard.yml index 8570970f2..0d90394b3 100644 --- a/_data/edit_leaderboard.yml +++ b/_data/edit_leaderboard.yml @@ -1,6 +1,7 @@ - dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence test_cases: 133 model: claude-3-opus-20240229 + released: 2024-02-29 edit_format: diff commit_hash: f4b1797-dirty, f4b1797 pass_rate_1: 53.4 @@ -19,9 +20,11 @@ versions: 0.30.2-dev seconds_per_case: 32.4 total_cost: 13.8395 + - dirname: 2024-03-06-16-42-00--claude3-sonnet-whole 
test_cases: 133 model: claude-3-sonnet-20240229 + released: 2024-02-29 edit_format: whole commit_hash: a5f8076-dirty pass_rate_1: 43.6 @@ -40,9 +43,11 @@ versions: 0.25.1-dev seconds_per_case: 23.1 total_cost: 0.0000 + - dirname: 2024-04-29-19-17-28--deepseek-coder-whole test_cases: 132 model: deepseek-coder + released: 2024-01-25 edit_format: whole commit_hash: c07f793-dirty pass_rate_1: 47.0 @@ -61,6 +66,7 @@ versions: 0.30.2-dev seconds_per_case: 26.7 total_cost: 0.0000 + - dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced test_cases: 133 model: gemini-1.5-pro-latest @@ -86,6 +92,7 @@ - dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole test_cases: 133 model: gpt-3.5-turbo-0125 + released: 2024-01-25 edit_format: whole commit_hash: 1d55f74 pass_rate_1: 41.4 @@ -108,6 +115,7 @@ - dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301 test_cases: 133 model: gpt-3.5-turbo-0301 + released: 2023-03-01 edit_format: whole commit_hash: 44388db-dirty pass_rate_1: 50.4 @@ -126,9 +134,11 @@ versions: 0.16.4-dev seconds_per_case: 6.5 total_cost: 0.4822 + - dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613 test_cases: 133 model: gpt-3.5-turbo-0613 + released: 2023-06-13 edit_format: whole commit_hash: 93aa497-dirty pass_rate_1: 38.3 @@ -168,9 +178,11 @@ versions: 0.30.2-dev seconds_per_case: 5.3 total_cost: 0.3261 + - dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff test_cases: 133 model: gpt-4-0125-preview + released: 2024-01-25 edit_format: udiff commit_hash: edcf9b1 pass_rate_1: 55.6 @@ -189,9 +201,11 @@ versions: 0.22.1-dev seconds_per_case: 44.8 total_cost: 14.6428 + - dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules test_cases: 133 model: gpt-4-0314 + released: 2023-03-14 edit_format: diff commit_hash: 0d43468 pass_rate_1: 50.4 @@ -210,9 +224,11 @@ versions: 0.31.2-dev seconds_per_case: 19.8 total_cost: 16.2689 + - dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main test_cases: 133 model: gpt-4-0613 + released: 
2023-06-13 edit_format: diff commit_hash: 3aa17c4 pass_rate_1: 46.6 @@ -235,6 +251,7 @@ - dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff test_cases: 133 model: gpt-4-1106-preview + released: 2023-11-06 edit_format: udiff commit_hash: 87664dc pass_rate_1: 51.9 @@ -256,7 +273,8 @@ - dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples test_cases: 133 - model: gpt-4-turbo-2024-04-09 + model: gpt-4-turbo-2024-04-09 (udiff) + released: 2024-04-09 edit_format: udiff commit_hash: e610e5b-dirty pass_rate_1: 48.1 @@ -275,9 +293,11 @@ versions: 0.30.2-dev seconds_per_case: 22.8 total_cost: 6.3337 + - dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg test_cases: 132 model: llama3-70b-8192 + released: 2024-04-18 edit_format: diff commit_hash: b5bb453 pass_rate_1: 38.6 @@ -296,9 +316,11 @@ versions: 0.31.2-dev seconds_per_case: 14.5 total_cost: 0.4311 + - dirname: 2024-05-06-18-31-08--command-r-plus-whole-final test_cases: 133 model: command-r-plus + released: 2024-04-04 edit_format: whole commit_hash: fc3a43e-dirty pass_rate_1: 21.8 @@ -317,6 +339,7 @@ versions: 0.31.2-dev seconds_per_case: 22.9 total_cost: 2.7494 + - dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole test_cases: 133 model: deepseek-chat v2 (whole) @@ -342,6 +365,7 @@ - dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2 test_cases: 133 model: deepseek-chat v2 (diff) + released: 2024-05-06 edit_format: diff commit_hash: 80a3f6d pass_rate_1: 44.4 @@ -364,6 +388,7 @@ - dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole test_cases: 133 model: qwen1.5-110b-chat + released: 2024-02-04 edit_format: whole commit_hash: 70b1c0c pass_rate_1: 30.8 @@ -382,6 +407,7 @@ versions: 0.31.2-dev seconds_per_case: 46.9 total_cost: 0.0000 + - dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole test_cases: 133 model: WizardLM-2 8x22B @@ -406,7 +432,8 @@ - dirname: 2024-05-13-17-39-05--gpt-4o-diff test_cases: 133 - model: openai/gpt-4o + model: gpt-4o + released: 
2024-05-13 edit_format: diff commit_hash: b6cd852 pass_rate_1: 60.2 @@ -426,3 +453,25 @@ seconds_per_case: 6.0 total_cost: 0.0000 +- dirname: 2024-04-12-22-18-20--gpt-4-turbo-2024-04-09-plain-diff + test_cases: 33 + model: gpt-4-turbo-2024-04-09 (diff) + edit_format: diff + commit_hash: 9b2e697-dirty + pass_rate_1: 48.5 + pass_rate_2: 57.6 + percent_cases_well_formed: 100.0 + error_outputs: 15 + num_malformed_responses: 0 + user_asks: 15 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4-turbo-2024-04-09 + date: 2024-04-12 + versions: 0.28.1-dev + seconds_per_case: 17.6 + total_cost: 1.6205 + \ No newline at end of file diff --git a/_data/refactor_leaderboard.yml b/_data/refactor_leaderboard.yml index f1232d34e..db4d3483f 100644 --- a/_data/refactor_leaderboard.yml +++ b/_data/refactor_leaderboard.yml @@ -40,7 +40,7 @@ total_cost: 27.9176 - dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09 test_cases: 88 - model: gpt-4-turbo-2024-04-09 + model: gpt-4-turbo-2024-04-09 (udiff) edit_format: udiff commit_hash: b75fdb9 pass_rate_1: 34.1 @@ -103,7 +103,7 @@ - dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff test_cases: 89 - model: openai/gpt-4o + model: gpt-4o edit_format: diff commit_hash: b6cd852 pass_rate_1: 62.9 @@ -120,4 +120,27 @@ date: 2024-05-13 versions: 0.34.1-dev seconds_per_case: 27.8 - total_cost: 0.0000 \ No newline at end of file + total_cost: 0.0000 + +- dirname: 2024-04-10-13-26-18--refac-gpt-4-turbo-2024-04-09-diff + test_cases: 88 + model: gpt-4-turbo-2024-04-09 (diff) + edit_format: diff + commit_hash: 7875418 + pass_rate_1: 21.4 + percent_cases_well_formed: 6.8 + error_outputs: 247 + num_malformed_responses: 82 + user_asks: 1 + lazy_comments: 2 + syntax_errors: 3 + indentation_errors: 8 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4-turbo-2024-04-09 + date: 2024-04-10 + versions: 0.28.1-dev + seconds_per_case: 67.8 + 
total_cost: 20.4889 + + \ No newline at end of file diff --git a/aider/args.py b/aider/args.py index f21e7f297..3b42d074d 100644 --- a/aider/args.py +++ b/aider/args.py @@ -67,7 +67,7 @@ def get_parser(default_config_files, git_root): const=gpt_4_model, help=f"Use {gpt_4_model} model for the main chat", ) - gpt_4o_model = "openai/gpt-4o" + gpt_4o_model = "gpt-4o" group.add_argument( "--4o", action="store_const", diff --git a/aider/models.py b/aider/models.py index 975367061..c32840d82 100644 --- a/aider/models.py +++ b/aider/models.py @@ -11,7 +11,7 @@ from PIL import Image from aider.dump import dump # noqa: F401 from aider.litellm import litellm -DEFAULT_MODEL_NAME = "openai/gpt-4o" +DEFAULT_MODEL_NAME = "gpt-4o" @dataclass @@ -94,6 +94,16 @@ MODEL_SETTINGS = [ lazy=True, reminder_as_sys_msg=True, ), + ModelSettings( + "gpt-4o", + "diff", + weak_model_name="gpt-3.5-turbo", + use_repo_map=True, + send_undo_reply=True, + accepts_images=True, + lazy=True, + reminder_as_sys_msg=True, + ), ModelSettings( "gpt-4-0125-preview", "udiff", diff --git a/assets/models-over-time.svg b/assets/models-over-time.svg new file mode 100644 index 000000000..994dab6ae --- /dev/null +++ b/assets/models-over-time.svg @@ -0,0 +1,1742 @@ + + + + + + + + 2024-05-15T11:52:34.512395 + image/svg+xml + + + Matplotlib v3.8.4, https://matplotlib.orgdiff --git a/benchmark/over_time.py b/benchmark/over_time.py new file mode 100644 index 000000000..33e80e67e --- /dev/null +++ b/benchmark/over_time.py @@ -0,0 +1,57 @@ +import matplotlib.pyplot as plt +import yaml +from imgcat import imgcat +from matplotlib import rc + + +def plot_over_time(yaml_file): + with open(yaml_file, "r") as file: + data = yaml.safe_load(file) + + dates = [] + pass_rates = [] + models = [] + + for entry in data: + if "released" in entry and "pass_rate_2" in entry: + dates.append(entry["released"]) + pass_rates.append(entry["pass_rate_2"]) + models.append(entry["model"].split("(")[0].strip()) + + 
plt.rcParams["hatch.linewidth"] = 0.5 + plt.rcParams["hatch.color"] = "#444444" + + rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10}) + + fig, ax = plt.subplots(figsize=(10, 5)) + ax.grid(axis="y", zorder=0, lw=0.2) + for spine in ax.spines.values(): + spine.set_edgecolor("#DDDDDD") + spine.set_linewidth(0.5) + + colors = [ + "red" if "gpt-4" in model else "green" if "gpt-3.5" in model else "blue" for model in models + ] + ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120) + + for i, model in enumerate(models): + ax.annotate( + model, + (dates[i], pass_rates[i]), + fontsize=12, + alpha=0.75, + xytext=(5, 5), + textcoords="offset points", + ) + + ax.set_xlabel("Model release date", fontsize=18) + ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18) + ax.set_title("LLM code editing skill by model release date", fontsize=20) + plt.tight_layout() + plt.savefig("tmp_over_time.png") + plt.savefig("tmp_over_time.svg") + imgcat(fig) + + +# Example usage +plot_over_time("_data/edit_leaderboard.yml") diff --git a/docs/leaderboards/index.md b/docs/leaderboards/index.md index 6f356ac6e..d33cc6a54 100644 --- a/docs/leaderboards/index.md +++ b/docs/leaderboards/index.md @@ -185,6 +185,10 @@ Therefore, results are available for fewer models. 
+## LLM code editing skill by model release date + +[![LLM code editing skill by model release date](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg) + ## Notes on benchmarking results diff --git a/pytest.ini b/pytest.ini index 8fa4e613f..d0da0980d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] -norecursedirs = tmp.* build benchmark +norecursedirs = tmp.* build benchmark _site OLD addopts = -p no:warnings diff --git a/scripts/Dockerfile.jekyll b/scripts/Dockerfile.jekyll new file mode 100644 index 000000000..506f090b7 --- /dev/null +++ b/scripts/Dockerfile.jekyll @@ -0,0 +1,20 @@ +# Use the official Jekyll image from Docker Hub +FROM jekyll/jekyll:latest + +# Set the working directory +WORKDIR /srv/jekyll + +# Copy the current directory contents into the container at /srv/jekyll +COPY . /srv/jekyll + +# Install any needed packages specified in Gemfile +RUN bundle install + +# Document that the server listens on port 4000 (EXPOSE does not publish it; use -p at run time) +EXPOSE 4000 + +# Health check to ensure the server is running +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s CMD curl -f http://localhost:4000 || exit 1 + +# Run Jekyll server +CMD ["jekyll", "serve", "--host", "0.0.0.0", "--port", "4000", "--verbose"] diff --git a/scripts/jekyll_build.sh b/scripts/jekyll_build.sh new file mode 100755 index 000000000..bc41c66ca --- /dev/null +++ b/scripts/jekyll_build.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Build the Docker image +docker build -t my-jekyll-site -f scripts/Dockerfile.jekyll . diff --git a/scripts/jekyll_run.sh b/scripts/jekyll_run.sh new file mode 100755 index 000000000..d31ac0c65 --- /dev/null +++ b/scripts/jekyll_run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Open a shell in the container with the repo mounted and port 4000 published (-p is discarded under --network host, so bridge networking is used) +docker run --rm -v "$PWD:/srv/jekyll" -p 4000:4000 --entrypoint /bin/bash -it my-jekyll-site