Merge branch 'main' into swe-bench

2025-05-30 17:24:59 +00:00 · 2024-05-15 12:02:00 -07:00 · 2024-05-15 12:02:00 -07:00 · df84bcf38b
commit df84bcf38b
parent ecc46bd3e3 74c0df8df8
15 changed files with 1945 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,6 @@
 .aider*
 aider_chat.egg-info/
 build
 Gemfile.lock
 _site
 .jekyll-cache/
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -13,6 +13,19 @@ Please submit bug reports and feature requests as GitHub issues. This
 helps us to keep track of them and discuss potential solutions or
 enhancements.
 LLM Benchmark Results
 ---------------------
 Contributions of
 [LLM benchmark results](https://aider.chat/docs/leaderboards/)
 are welcome!
 See the
 [benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
 for information on running aider's code editing benchmarks.
 Submit results by opening a PR with edits to the
 [benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
 Pull Requests
 -------------
--- a/Gemfile
+++ b/Gemfile
@ -0,0 +1,5 @@
 source 'https://rubygems.org'
 gem 'jekyll'
 gem 'jekyll-theme-cayman'
 gem 'jekyll-redirect-from'
 gem 'jekyll-sitemap'
--- a/_config.yml
+++ b/_config.yml
@ -11,3 +11,6 @@ defaults:
      type: "pages"
    values:
      description: "A command-line chat tool for coding with GPT"
 exclude:
  - tmp.benchmarks
--- a/_data/edit_leaderboard.yml
+++ b/_data/edit_leaderboard.yml
@ -1,6 +1,7 @@
 - dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence
  test_cases: 133
  model: claude-3-opus-20240229
  released: 2024-02-29
  edit_format: diff
  commit_hash: f4b1797-dirty, f4b1797
  pass_rate_1: 53.4
@ -19,9 +20,11 @@
  versions: 0.30.2-dev
  seconds_per_case: 32.4
  total_cost: 13.8395
 - dirname: 2024-03-06-16-42-00--claude3-sonnet-whole
  test_cases: 133
  model: claude-3-sonnet-20240229
  released: 2024-02-29
  edit_format: whole
  commit_hash: a5f8076-dirty
  pass_rate_1: 43.6
@ -40,9 +43,11 @@
  versions: 0.25.1-dev
  seconds_per_case: 23.1
  total_cost: 0.0000
 - dirname: 2024-04-29-19-17-28--deepseek-coder-whole
  test_cases: 132
  model: deepseek-coder
  released: 2024-01-25
  edit_format: whole
  commit_hash: c07f793-dirty
  pass_rate_1: 47.0
@ -61,6 +66,7 @@
  versions: 0.30.2-dev
  seconds_per_case: 26.7
  total_cost: 0.0000
 - dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced
  test_cases: 133
  model: gemini-1.5-pro-latest
@ -86,6 +92,7 @@
 - dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole
  test_cases: 133
  model: gpt-3.5-turbo-0125
  released: 2024-01-25
  edit_format: whole
  commit_hash: 1d55f74
  pass_rate_1: 41.4
@ -108,6 +115,7 @@
 - dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301
  test_cases: 133
  model: gpt-3.5-turbo-0301
  released: 2023-03-01
  edit_format: whole
  commit_hash: 44388db-dirty
  pass_rate_1: 50.4
@ -126,9 +134,11 @@
  versions: 0.16.4-dev
  seconds_per_case: 6.5
  total_cost: 0.4822
 - dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613
  test_cases: 133
  model: gpt-3.5-turbo-0613
  released: 2023-06-13
  edit_format: whole
  commit_hash: 93aa497-dirty
  pass_rate_1: 38.3
@ -168,9 +178,11 @@
  versions: 0.30.2-dev
  seconds_per_case: 5.3
  total_cost: 0.3261
 - dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff
  test_cases: 133
  model: gpt-4-0125-preview
  released: 2024-01-25
  edit_format: udiff
  commit_hash: edcf9b1
  pass_rate_1: 55.6
@ -189,9 +201,11 @@
  versions: 0.22.1-dev
  seconds_per_case: 44.8
  total_cost: 14.6428
 - dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules
  test_cases: 133
  model: gpt-4-0314
  released: 2023-03-14
  edit_format: diff
  commit_hash: 0d43468
  pass_rate_1: 50.4
@ -210,9 +224,11 @@
  versions: 0.31.2-dev
  seconds_per_case: 19.8
  total_cost: 16.2689
 - dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main
  test_cases: 133
  model: gpt-4-0613
  released: 2023-06-13
  edit_format: diff
  commit_hash: 3aa17c4
  pass_rate_1: 46.6
@ -235,6 +251,7 @@
 - dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff
  test_cases: 133
  model: gpt-4-1106-preview
  released: 2023-11-06  
  edit_format: udiff
  commit_hash: 87664dc
  pass_rate_1: 51.9
@ -256,7 +273,8 @@
 - dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples
  test_cases: 133
-  model: gpt-4-turbo-2024-04-09
+  model: gpt-4-turbo-2024-04-09 (udiff)
  released: 2024-04-09
  edit_format: udiff
  commit_hash: e610e5b-dirty
  pass_rate_1: 48.1
@ -275,9 +293,11 @@
  versions: 0.30.2-dev
  seconds_per_case: 22.8
  total_cost: 6.3337
 - dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg
  test_cases: 132
  model: llama3-70b-8192
  released: 2024-04-18
  edit_format: diff
  commit_hash: b5bb453
  pass_rate_1: 38.6
@ -296,9 +316,11 @@
  versions: 0.31.2-dev
  seconds_per_case: 14.5
  total_cost: 0.4311
 - dirname: 2024-05-06-18-31-08--command-r-plus-whole-final
  test_cases: 133
  model: command-r-plus
  released: 2024-04-04
  edit_format: whole
  commit_hash: fc3a43e-dirty
  pass_rate_1: 21.8
@ -317,6 +339,7 @@
  versions: 0.31.2-dev
  seconds_per_case: 22.9
  total_cost: 2.7494
 - dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole
  test_cases: 133
  model: deepseek-chat v2 (whole)
@ -342,6 +365,7 @@
 - dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2
  test_cases: 133
  model: deepseek-chat v2 (diff)
  released: 2024-05-06
  edit_format: diff
  commit_hash: 80a3f6d
  pass_rate_1: 44.4
@ -364,6 +388,7 @@
 - dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole
  test_cases: 133
  model: qwen1.5-110b-chat
  released: 2024-02-04  
  edit_format: whole
  commit_hash: 70b1c0c
  pass_rate_1: 30.8
@ -382,6 +407,7 @@
  versions: 0.31.2-dev
  seconds_per_case: 46.9
  total_cost: 0.0000
 - dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole
  test_cases: 133
  model: WizardLM-2 8x22B
@ -406,7 +432,8 @@
 - dirname: 2024-05-13-17-39-05--gpt-4o-diff
  test_cases: 133
-  model: openai/gpt-4o
+  model: gpt-4o
  released: 2024-05-13
  edit_format: diff
  commit_hash: b6cd852
  pass_rate_1: 60.2
@ -426,3 +453,25 @@
  seconds_per_case: 6.0
  total_cost: 0.0000
 - dirname: 2024-04-12-22-18-20--gpt-4-turbo-2024-04-09-plain-diff
  test_cases: 33
  model: gpt-4-turbo-2024-04-09 (diff)
  edit_format: diff
  commit_hash: 9b2e697-dirty
  pass_rate_1: 48.5
  pass_rate_2: 57.6
  percent_cases_well_formed: 100.0
  error_outputs: 15
  num_malformed_responses: 0
  user_asks: 15
  lazy_comments: 0
  syntax_errors: 0
  indentation_errors: 0
  exhausted_context_windows: 0
  test_timeouts: 0
  command: aider --model gpt-4-turbo-2024-04-09
  date: 2024-04-12
  versions: 0.28.1-dev
  seconds_per_case: 17.6
  total_cost: 1.6205
--- a/_data/refactor_leaderboard.yml
+++ b/_data/refactor_leaderboard.yml
@ -40,7 +40,7 @@
  total_cost: 27.9176
 - dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09
  test_cases: 88
-  model: gpt-4-turbo-2024-04-09
+  model: gpt-4-turbo-2024-04-09 (udiff)
  edit_format: udiff
  commit_hash: b75fdb9
  pass_rate_1: 34.1
@ -103,7 +103,7 @@
 - dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff
  test_cases: 89
-  model: openai/gpt-4o
+  model: gpt-4o
  edit_format: diff
  commit_hash: b6cd852
  pass_rate_1: 62.9
@ -121,3 +121,26 @@
  versions: 0.34.1-dev
  seconds_per_case: 27.8
  total_cost: 0.0000
 - dirname: 2024-04-10-13-26-18--refac-gpt-4-turbo-2024-04-09-diff
  test_cases: 88
  model: gpt-4-turbo-2024-04-09 (diff)
  edit_format: diff
  commit_hash: 7875418
  pass_rate_1: 21.4
  percent_cases_well_formed: 6.8
  error_outputs: 247
  num_malformed_responses: 82
  user_asks: 1
  lazy_comments: 2
  syntax_errors: 3
  indentation_errors: 8
  exhausted_context_windows: 0
  test_timeouts: 0
  command: aider --model gpt-4-turbo-2024-04-09
  date: 2024-04-10
  versions: 0.28.1-dev
  seconds_per_case: 67.8
  total_cost: 20.4889
--- a/aider/args.py
+++ b/aider/args.py
@ -67,7 +67,7 @@ def get_parser(default_config_files, git_root):
        const=gpt_4_model,
        help=f"Use {gpt_4_model} model for the main chat",
    )
-    gpt_4o_model = "openai/gpt-4o"
+    gpt_4o_model = "gpt-4o"
    group.add_argument(
        "--4o",
        action="store_const",
--- a/aider/models.py
+++ b/aider/models.py
@ -11,7 +11,7 @@ from PIL import Image
 from aider.dump import dump  # noqa: F401
 from aider.litellm import litellm
-DEFAULT_MODEL_NAME = "openai/gpt-4o"
+DEFAULT_MODEL_NAME = "gpt-4o"
@dataclass
@ -94,6 +94,16 @@ MODEL_SETTINGS = [
        lazy=True,
        reminder_as_sys_msg=True,
    ),
    ModelSettings(
        "gpt-4o",
        "diff",
        weak_model_name="gpt-3.5-turbo",
        use_repo_map=True,
        send_undo_reply=True,
        accepts_images=True,
        lazy=True,
        reminder_as_sys_msg=True,
    ),
    ModelSettings(
        "gpt-4-0125-preview",
        "udiff",
--- a/assets/models-over-time.svg
+++ b/assets/models-over-time.svg
--- a/benchmark/over_time.py
+++ b/benchmark/over_time.py
@ -0,0 +1,57 @@
 import matplotlib.pyplot as plt
 import yaml
 from imgcat import imgcat
 from matplotlib import rc
 def plot_over_time(yaml_file):
    """Scatter-plot benchmark pass rates against model release dates.

    Loads the YAML file with ``yaml.safe_load``, iterates its entries and
    plots one point per entry that has both a ``released`` and a
    ``pass_rate_2`` key; entries missing either key are skipped.

    Side effects: writes ``tmp_over_time.png`` and ``tmp_over_time.svg``
    (relative paths) and passes the figure to ``imgcat``. Also mutates
    global matplotlib rc settings. Returns nothing.

    Args:
        yaml_file: path handed to ``open()`` for reading.
    """
    with open(yaml_file, "r") as file:
        data = yaml.safe_load(file)
    # Parallel lists: index i of each list describes the same plotted entry.
    dates = []
    pass_rates = []
    models = []
    for entry in data:
        # Skip entries that lack either key needed to place a point.
        if "released" in entry and "pass_rate_2" in entry:
            dates.append(entry["released"])
            pass_rates.append(entry["pass_rate_2"])
            # Keep only the text before the first "(", so a name such as
            # "gpt-4-turbo-2024-04-09 (udiff)" is labelled "gpt-4-turbo-2024-04-09".
            models.append(entry["model"].split("(")[0].strip())
    # Global matplotlib styling: hatch settings and a 10pt sans-serif font.
    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
    fig, ax = plt.subplots(figsize=(10, 5))
    # Thin grid lines on the y axis only.
    ax.grid(axis="y", zorder=0, lw=0.2)
    # Lighten and thin the axes border.
    for spine in ax.spines.values():
        spine.set_edgecolor("#DDDDDD")
        spine.set_linewidth(0.5)
    # Colour by substring match on the model name. "gpt-4" is tested first,
    # so any name containing it (including "gpt-4o") is red; names containing
    # "gpt-3.5" are green; everything else is blue.
    colors = [
        "red" if "gpt-4" in model else "green" if "gpt-3.5" in model else "blue" for model in models
    ]
    ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120)
    # Label every point with its model name, offset by (5, 5) points.
    for i, model in enumerate(models):
        ax.annotate(
            model,
            (dates[i], pass_rates[i]),
            fontsize=12,
            alpha=0.75,
            xytext=(5, 5),
            textcoords="offset points",
        )
    ax.set_xlabel("Model release date", fontsize=18)
    ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18)
    ax.set_title("LLM code editing skill by model release date", fontsize=20)
    plt.tight_layout()
    # Relative paths: both files land in the current working directory.
    plt.savefig("tmp_over_time.png")
    plt.savefig("tmp_over_time.svg")
    # NOTE(review): presumably renders the figure inline in the terminal;
    # confirm against imgcat's documentation.
    imgcat(fig)
 # Example usage: plot the code-editing leaderboard data.
 # This call is at module level with no `if __name__ == "__main__":` guard,
 # so it runs whenever the file is executed or imported. The path is relative
 # to the current working directory.
 plot_over_time("_data/edit_leaderboard.yml")
--- a/docs/leaderboards/index.md
+++ b/docs/leaderboards/index.md
@ -185,6 +185,10 @@ Therefore, results are available for fewer models.
 </script>
 ## LLM code editing skill by model release date
 [![LLM code editing skill by model release date](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg)
 ## Notes on benchmarking results
--- a/pytest.ini
+++ b/pytest.ini
@ -1,4 +1,4 @@
 [pytest]
-norecursedirs = tmp.* build benchmark
+norecursedirs = tmp.* build benchmark _site OLD
 addopts = -p no:warnings
--- a/scripts/Dockerfile.jekyll
+++ b/scripts/Dockerfile.jekyll
@ -0,0 +1,20 @@
 # Use the official Jekyll image from Docker Hub
 # NOTE(review): the `latest` tag is not pinned, so the base image can change
 # between builds.
 FROM jekyll/jekyll:latest
 # Set the working directory
 WORKDIR /srv/jekyll
 # Copy the current directory contents into the container at /srv/jekyll
 COPY . /srv/jekyll
 # Install any needed packages specified in Gemfile
 RUN bundle install
 # Expose port 4000 to the host
 EXPOSE 4000
 # Health check to ensure the server is running
 # Probes http://localhost:4000 every 30s with a 10s timeout, after a 5s
 # start period; a failed curl makes the check exit 1.
 # NOTE(review): assumes `curl` is installed in the base image; confirm.
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s CMD curl -f http://localhost:4000 || exit 1
 # Run Jekyll server
 # Binds to 0.0.0.0 on port 4000 (the port exposed above) with verbose output.
 CMD ["jekyll", "serve", "--host", "0.0.0.0", "--port", "4000", "--verbose"]
--- a/scripts/jekyll_build.sh
+++ b/scripts/jekyll_build.sh
@ -0,0 +1,4 @@
 #!/bin/bash
 # Build the Docker image
 # Tags the image `my-jekyll-site`. Both the Dockerfile path
 # (scripts/Dockerfile.jekyll) and the build context (.) are relative, so the
 # command must be run from the directory that contains `scripts/`.
 docker build -t my-jekyll-site -f scripts/Dockerfile.jekyll .
--- a/scripts/jekyll_run.sh
+++ b/scripts/jekyll_run.sh
@ -0,0 +1,4 @@
 #!/bin/bash
 # Run the Docker container
 # Starts `my-jekyll-site` interactively (-it) with the entrypoint overridden
 # to /bin/bash, bind-mounts the current directory at /srv/jekyll, and removes
 # the container on exit (--rm).
 # NOTE(review): `-p 4000:4000` is combined with `--network="host"`; Docker
 # documents that published ports are discarded in host network mode, so the
 # -p flag is likely redundant here. Confirm which of the two is intended.
 docker run --rm --network="host" -v "$PWD:/srv/jekyll" -p 4000:4000 --entrypoint /bin/bash -it my-jekyll-site