Merge branch 'main' into swe-bench

2025-05-29 08:44:59 +00:00 · 2024-05-15 12:02:00 -07:00 · 2024-05-15 12:02:00 -07:00 · df84bcf38b
commit df84bcf38b
parent ecc46bd3e3 74c0df8df8
15 changed files with 1945 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,6 @@
 .aider*
 aider_chat.egg-info/
 build
+Gemfile.lock
+_site
+.jekyll-cache/
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -13,6 +13,19 @@ Please submit bug reports and feature requests as GitHub issues. This
 helps us to keep track of them and discuss potential solutions or
 enhancements.

+LLM Benchmark Results
+---------------------
+
+Contributions of
+[LLM benchmark results](https://aider.chat/docs/leaderboards/)
+are welcome!
+See the
+[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
+for information on running aider's code editing benchmarks.
+Submit results by opening a PR with edits to the
+[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
+
+
 Pull Requests
 -------------

--- a/Gemfile
+++ b/Gemfile
@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+gem 'jekyll'
+gem 'jekyll-theme-cayman'
+gem 'jekyll-redirect-from'
+gem 'jekyll-sitemap'
--- a/_config.yml
+++ b/_config.yml
@ -11,3 +11,6 @@ defaults:
      type: "pages"
    values:
      description: "A command-line chat tool for coding with GPT"
+
+exclude:
+  - tmp.benchmarks
--- a/_data/edit_leaderboard.yml
+++ b/_data/edit_leaderboard.yml
@ -1,6 +1,7 @@
 - dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence
  test_cases: 133
  model: claude-3-opus-20240229
+  released: 2024-02-29
  edit_format: diff
  commit_hash: f4b1797-dirty, f4b1797
  pass_rate_1: 53.4
@ -19,9 +20,11 @@
  versions: 0.30.2-dev
  seconds_per_case: 32.4
  total_cost: 13.8395
+  
 - dirname: 2024-03-06-16-42-00--claude3-sonnet-whole
  test_cases: 133
  model: claude-3-sonnet-20240229
+  released: 2024-02-29
  edit_format: whole
  commit_hash: a5f8076-dirty
  pass_rate_1: 43.6
@ -40,9 +43,11 @@
  versions: 0.25.1-dev
  seconds_per_case: 23.1
  total_cost: 0.0000
+  
 - dirname: 2024-04-29-19-17-28--deepseek-coder-whole
  test_cases: 132
  model: deepseek-coder
+  released: 2024-01-25
  edit_format: whole
  commit_hash: c07f793-dirty
  pass_rate_1: 47.0
@ -61,6 +66,7 @@
  versions: 0.30.2-dev
  seconds_per_case: 26.7
  total_cost: 0.0000
+  
 - dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced
  test_cases: 133
  model: gemini-1.5-pro-latest
@ -86,6 +92,7 @@
 - dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole
  test_cases: 133
  model: gpt-3.5-turbo-0125
+  released: 2024-01-25
  edit_format: whole
  commit_hash: 1d55f74
  pass_rate_1: 41.4
@ -108,6 +115,7 @@
 - dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301
  test_cases: 133
  model: gpt-3.5-turbo-0301
+  released: 2023-03-01
  edit_format: whole
  commit_hash: 44388db-dirty
  pass_rate_1: 50.4
@ -126,9 +134,11 @@
  versions: 0.16.4-dev
  seconds_per_case: 6.5
  total_cost: 0.4822
+  
 - dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613
  test_cases: 133
  model: gpt-3.5-turbo-0613
+  released: 2023-06-13
  edit_format: whole
  commit_hash: 93aa497-dirty
  pass_rate_1: 38.3
@ -168,9 +178,11 @@
  versions: 0.30.2-dev
  seconds_per_case: 5.3
  total_cost: 0.3261
+  
 - dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff
  test_cases: 133
  model: gpt-4-0125-preview
+  released: 2024-01-25
  edit_format: udiff
  commit_hash: edcf9b1
  pass_rate_1: 55.6
@ -189,9 +201,11 @@
  versions: 0.22.1-dev
  seconds_per_case: 44.8
  total_cost: 14.6428
+  
 - dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules
  test_cases: 133
  model: gpt-4-0314
+  released: 2023-03-14
  edit_format: diff
  commit_hash: 0d43468
  pass_rate_1: 50.4
@ -210,9 +224,11 @@
  versions: 0.31.2-dev
  seconds_per_case: 19.8
  total_cost: 16.2689
+  
 - dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main
  test_cases: 133
  model: gpt-4-0613
+  released: 2023-06-13
  edit_format: diff
  commit_hash: 3aa17c4
  pass_rate_1: 46.6
@ -235,6 +251,7 @@
 - dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff
  test_cases: 133
  model: gpt-4-1106-preview
+  released: 2023-11-06
  edit_format: udiff
  commit_hash: 87664dc
  pass_rate_1: 51.9
@ -256,7 +273,8 @@
  
 - dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples
  test_cases: 133
-  model: gpt-4-turbo-2024-04-09
+  model: gpt-4-turbo-2024-04-09 (udiff)
+  released: 2024-04-09
  edit_format: udiff
  commit_hash: e610e5b-dirty
  pass_rate_1: 48.1
@ -275,9 +293,11 @@
  versions: 0.30.2-dev
  seconds_per_case: 22.8
  total_cost: 6.3337
+  
 - dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg
  test_cases: 132
  model: llama3-70b-8192
+  released: 2024-04-18
  edit_format: diff
  commit_hash: b5bb453
  pass_rate_1: 38.6
@ -296,9 +316,11 @@
  versions: 0.31.2-dev
  seconds_per_case: 14.5
  total_cost: 0.4311
+  
 - dirname: 2024-05-06-18-31-08--command-r-plus-whole-final
  test_cases: 133
  model: command-r-plus
+  released: 2024-04-04
  edit_format: whole
  commit_hash: fc3a43e-dirty
  pass_rate_1: 21.8
@ -317,6 +339,7 @@
  versions: 0.31.2-dev
  seconds_per_case: 22.9
  total_cost: 2.7494
+  
 - dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole
  test_cases: 133
  model: deepseek-chat v2 (whole)
@ -342,6 +365,7 @@
 - dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2
  test_cases: 133
  model: deepseek-chat v2 (diff)
+  released: 2024-05-06
  edit_format: diff
  commit_hash: 80a3f6d
  pass_rate_1: 44.4
@ -364,6 +388,7 @@
 - dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole
  test_cases: 133
  model: qwen1.5-110b-chat
+  released: 2024-02-04
  edit_format: whole
  commit_hash: 70b1c0c
  pass_rate_1: 30.8
@ -382,6 +407,7 @@
  versions: 0.31.2-dev
  seconds_per_case: 46.9
  total_cost: 0.0000
+  
 - dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole
  test_cases: 133
  model: WizardLM-2 8x22B
@ -406,7 +432,8 @@

 - dirname: 2024-05-13-17-39-05--gpt-4o-diff
  test_cases: 133
-  model: openai/gpt-4o
+  model: gpt-4o
+  released: 2024-05-13
  edit_format: diff
  commit_hash: b6cd852
  pass_rate_1: 60.2
@ -426,3 +453,25 @@
  seconds_per_case: 6.0
  total_cost: 0.0000

+- dirname: 2024-04-12-22-18-20--gpt-4-turbo-2024-04-09-plain-diff
+  test_cases: 33
+  model: gpt-4-turbo-2024-04-09 (diff)
+  edit_format: diff
+  commit_hash: 9b2e697-dirty
+  pass_rate_1: 48.5
+  pass_rate_2: 57.6
+  percent_cases_well_formed: 100.0
+  error_outputs: 15
+  num_malformed_responses: 0
+  user_asks: 15
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 0
+  test_timeouts: 0
+  command: aider --model gpt-4-turbo-2024-04-09
+  date: 2024-04-12
+  versions: 0.28.1-dev
+  seconds_per_case: 17.6
+  total_cost: 1.6205
+  
--- a/_data/refactor_leaderboard.yml
+++ b/_data/refactor_leaderboard.yml
@ -40,7 +40,7 @@
  total_cost: 27.9176
 - dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09
  test_cases: 88
-  model: gpt-4-turbo-2024-04-09
+  model: gpt-4-turbo-2024-04-09 (udiff)
  edit_format: udiff
  commit_hash: b75fdb9
  pass_rate_1: 34.1
@ -103,7 +103,7 @@

 - dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff
  test_cases: 89
-  model: openai/gpt-4o
+  model: gpt-4o
  edit_format: diff
  commit_hash: b6cd852
  pass_rate_1: 62.9
@ -120,4 +120,27 @@
  date: 2024-05-13
  versions: 0.34.1-dev
  seconds_per_case: 27.8
-  total_cost: 0.0000
+  total_cost: 0.0000
+
+- dirname: 2024-04-10-13-26-18--refac-gpt-4-turbo-2024-04-09-diff
+  test_cases: 88
+  model: gpt-4-turbo-2024-04-09 (diff)
+  edit_format: diff
+  commit_hash: 7875418
+  pass_rate_1: 21.4
+  percent_cases_well_formed: 6.8
+  error_outputs: 247
+  num_malformed_responses: 82
+  user_asks: 1
+  lazy_comments: 2
+  syntax_errors: 3
+  indentation_errors: 8
+  exhausted_context_windows: 0
+  test_timeouts: 0
+  command: aider --model gpt-4-turbo-2024-04-09
+  date: 2024-04-10
+  versions: 0.28.1-dev
+  seconds_per_case: 67.8
+  total_cost: 20.4889
+
+  
--- a/aider/args.py
+++ b/aider/args.py
@ -67,7 +67,7 @@ def get_parser(default_config_files, git_root):
        const=gpt_4_model,
        help=f"Use {gpt_4_model} model for the main chat",
    )
-    gpt_4o_model = "openai/gpt-4o"
+    gpt_4o_model = "gpt-4o"
    group.add_argument(
        "--4o",
        action="store_const",
--- a/aider/models.py
+++ b/aider/models.py
@ -11,7 +11,7 @@ from PIL import Image
 from aider.dump import dump  # noqa: F401
 from aider.litellm import litellm

-DEFAULT_MODEL_NAME = "openai/gpt-4o"
+DEFAULT_MODEL_NAME = "gpt-4o"


@dataclass
@ -94,6 +94,16 @@ MODEL_SETTINGS = [
        lazy=True,
        reminder_as_sys_msg=True,
    ),
+    ModelSettings(
+        "gpt-4o",
+        "diff",
+        weak_model_name="gpt-3.5-turbo",
+        use_repo_map=True,
+        send_undo_reply=True,
+        accepts_images=True,
+        lazy=True,
+        reminder_as_sys_msg=True,
+    ),
    ModelSettings(
        "gpt-4-0125-preview",
        "udiff",
--- a/assets/models-over-time.svg
+++ b/assets/models-over-time.svg
--- a/benchmark/over_time.py
+++ b/benchmark/over_time.py
@ -0,0 +1,57 @@
+import matplotlib.pyplot as plt
+import yaml
+from imgcat import imgcat
+from matplotlib import rc
+
+
+def plot_over_time(yaml_file):  # Scatter pass_rate_2 vs. release date from a leaderboard YAML.
+    with open(yaml_file, "r") as file:
+        data = yaml.safe_load(file)
+
+    dates = []
+    pass_rates = []
+    models = []
+
+    for entry in data:
+        if "released" in entry and "pass_rate_2" in entry:  # skip entries missing either key
+            dates.append(entry["released"])
+            pass_rates.append(entry["pass_rate_2"])
+            models.append(entry["model"].split("(")[0].strip())  # drop any "(...)" suffix
+
+    plt.rcParams["hatch.linewidth"] = 0.5
+    plt.rcParams["hatch.color"] = "#444444"
+
+    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
+
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ax.grid(axis="y", zorder=0, lw=0.2)
+    for spine in ax.spines.values():
+        spine.set_edgecolor("#DDDDDD")
+        spine.set_linewidth(0.5)
+
+    colors = [  # name contains "gpt-4" -> red, "gpt-3.5" -> green, anything else -> blue
+        "red" if "gpt-4" in model else "green" if "gpt-3.5" in model else "blue" for model in models
+    ]
+    ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120)
+
+    for i, model in enumerate(models):  # label every point with its model name
+        ax.annotate(
+            model,
+            (dates[i], pass_rates[i]),
+            fontsize=12,
+            alpha=0.75,
+            xytext=(5, 5),
+            textcoords="offset points",
+        )
+
+    ax.set_xlabel("Model release date", fontsize=18)
+    ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18)
+    ax.set_title("LLM code editing skill by model release date", fontsize=20)
+    plt.tight_layout()
+    plt.savefig("tmp_over_time.png")  # both files are written relative to the current directory
+    plt.savefig("tmp_over_time.svg")
+    imgcat(fig)  # presumably shows the figure inline in the terminal; confirm with imgcat docs
+
+
+if __name__ == "__main__":  # don't plot as a side effect of importing this module
+    plot_over_time("_data/edit_leaderboard.yml")
--- a/docs/leaderboards/index.md
+++ b/docs/leaderboards/index.md
@ -185,6 +185,10 @@ Therefore, results are available for fewer models.
 </script>


+## LLM code editing skill by model release date
+
+[![LLM code editing skill by model release date](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg)
+

 ## Notes on benchmarking results

--- a/pytest.ini
+++ b/pytest.ini
@ -1,4 +1,4 @@
 [pytest]
-norecursedirs = tmp.* build benchmark
+norecursedirs = tmp.* build benchmark _site OLD
 addopts = -p no:warnings

--- a/scripts/Dockerfile.jekyll
+++ b/scripts/Dockerfile.jekyll
@ -0,0 +1,20 @@
+# Use the official Jekyll image from Docker Hub
+FROM jekyll/jekyll:latest
+
+# Set the working directory
+WORKDIR /srv/jekyll
+
+# Copy the build context into the container at /srv/jekyll
+COPY . /srv/jekyll
+
+# Install any needed packages specified in Gemfile
+RUN bundle install
+
+# Document that the server listens on port 4000 (publishing it is done by `docker run -p`)
+EXPOSE 4000
+
+# Health check: every 30s, fail unless curl can fetch http://localhost:4000 within 10s
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s CMD curl -f http://localhost:4000 || exit 1
+
+# Run the Jekyll server on all interfaces, port 4000, with verbose logging
+CMD ["jekyll", "serve", "--host", "0.0.0.0", "--port", "4000", "--verbose"]
--- a/scripts/jekyll_build.sh
+++ b/scripts/jekyll_build.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+
+# Build the Docker image; run from the repo root (Dockerfile path and build context are relative)
+docker build -t my-jekyll-site -f scripts/Dockerfile.jekyll .
--- a/scripts/jekyll_run.sh
+++ b/scripts/jekyll_run.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+
+# Open an interactive shell in the container: $PWD mounted at /srv/jekyll, port 4000 published
+docker run --rm -v "$PWD:/srv/jekyll" -p 4000:4000 --entrypoint /bin/bash -it my-jekyll-site