Merge branch 'main' into swe-bench

This commit is contained in:
Paul Gauthier 2024-05-15 12:02:00 -07:00
commit df84bcf38b
15 changed files with 1945 additions and 8 deletions

3
.gitignore vendored
View file

@ -2,3 +2,6 @@
.aider*
aider_chat.egg-info/
build
Gemfile.lock
_site
.jekyll-cache/

View file

@ -13,6 +13,19 @@ Please submit bug reports and feature requests as GitHub issues. This
helps us to keep track of them and discuss potential solutions or
enhancements.
LLM Benchmark Results
---------------------
Contributions of
[LLM benchmark results](https://aider.chat/docs/leaderboards/)
are welcome!
See the
[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
for information on running aider's code editing benchmarks.
Submit results by opening a PR with edits to the
[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
Pull Requests
-------------

5
Gemfile Normal file
View file

@ -0,0 +1,5 @@
source 'https://rubygems.org'
gem 'jekyll'
gem 'jekyll-theme-cayman'
gem 'jekyll-redirect-from'
gem 'jekyll-sitemap'

View file

@ -11,3 +11,6 @@ defaults:
type: "pages"
values:
description: "A command-line chat tool for coding with GPT"
exclude:
- tmp.benchmarks

View file

@ -1,6 +1,7 @@
- dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence
test_cases: 133
model: claude-3-opus-20240229
released: 2024-02-29
edit_format: diff
commit_hash: f4b1797-dirty, f4b1797
pass_rate_1: 53.4
@ -19,9 +20,11 @@
versions: 0.30.2-dev
seconds_per_case: 32.4
total_cost: 13.8395
- dirname: 2024-03-06-16-42-00--claude3-sonnet-whole
test_cases: 133
model: claude-3-sonnet-20240229
released: 2024-02-29
edit_format: whole
commit_hash: a5f8076-dirty
pass_rate_1: 43.6
@ -40,9 +43,11 @@
versions: 0.25.1-dev
seconds_per_case: 23.1
total_cost: 0.0000
- dirname: 2024-04-29-19-17-28--deepseek-coder-whole
test_cases: 132
model: deepseek-coder
released: 2024-01-25
edit_format: whole
commit_hash: c07f793-dirty
pass_rate_1: 47.0
@ -61,6 +66,7 @@
versions: 0.30.2-dev
seconds_per_case: 26.7
total_cost: 0.0000
- dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced
test_cases: 133
model: gemini-1.5-pro-latest
@ -86,6 +92,7 @@
- dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole
test_cases: 133
model: gpt-3.5-turbo-0125
released: 2024-01-25
edit_format: whole
commit_hash: 1d55f74
pass_rate_1: 41.4
@ -108,6 +115,7 @@
- dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301
test_cases: 133
model: gpt-3.5-turbo-0301
released: 2023-03-01
edit_format: whole
commit_hash: 44388db-dirty
pass_rate_1: 50.4
@ -126,9 +134,11 @@
versions: 0.16.4-dev
seconds_per_case: 6.5
total_cost: 0.4822
- dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613
test_cases: 133
model: gpt-3.5-turbo-0613
released: 2023-06-13
edit_format: whole
commit_hash: 93aa497-dirty
pass_rate_1: 38.3
@ -168,9 +178,11 @@
versions: 0.30.2-dev
seconds_per_case: 5.3
total_cost: 0.3261
- dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff
test_cases: 133
model: gpt-4-0125-preview
released: 2024-01-25
edit_format: udiff
commit_hash: edcf9b1
pass_rate_1: 55.6
@ -189,9 +201,11 @@
versions: 0.22.1-dev
seconds_per_case: 44.8
total_cost: 14.6428
- dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules
test_cases: 133
model: gpt-4-0314
released: 2023-03-14
edit_format: diff
commit_hash: 0d43468
pass_rate_1: 50.4
@ -210,9 +224,11 @@
versions: 0.31.2-dev
seconds_per_case: 19.8
total_cost: 16.2689
- dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main
test_cases: 133
model: gpt-4-0613
released: 2023-06-13
edit_format: diff
commit_hash: 3aa17c4
pass_rate_1: 46.6
@ -235,6 +251,7 @@
- dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff
test_cases: 133
model: gpt-4-1106-preview
released: 2023-11-06
edit_format: udiff
commit_hash: 87664dc
pass_rate_1: 51.9
@ -256,7 +273,8 @@
- dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples
test_cases: 133
model: gpt-4-turbo-2024-04-09
model: gpt-4-turbo-2024-04-09 (udiff)
released: 2024-04-09
edit_format: udiff
commit_hash: e610e5b-dirty
pass_rate_1: 48.1
@ -275,9 +293,11 @@
versions: 0.30.2-dev
seconds_per_case: 22.8
total_cost: 6.3337
- dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg
test_cases: 132
model: llama3-70b-8192
released: 2024-04-18
edit_format: diff
commit_hash: b5bb453
pass_rate_1: 38.6
@ -296,9 +316,11 @@
versions: 0.31.2-dev
seconds_per_case: 14.5
total_cost: 0.4311
- dirname: 2024-05-06-18-31-08--command-r-plus-whole-final
test_cases: 133
model: command-r-plus
released: 2024-04-04
edit_format: whole
commit_hash: fc3a43e-dirty
pass_rate_1: 21.8
@ -317,6 +339,7 @@
versions: 0.31.2-dev
seconds_per_case: 22.9
total_cost: 2.7494
- dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole
test_cases: 133
model: deepseek-chat v2 (whole)
@ -342,6 +365,7 @@
- dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2
test_cases: 133
model: deepseek-chat v2 (diff)
released: 2024-05-06
edit_format: diff
commit_hash: 80a3f6d
pass_rate_1: 44.4
@ -364,6 +388,7 @@
- dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole
test_cases: 133
model: qwen1.5-110b-chat
released: 2024-02-04
edit_format: whole
commit_hash: 70b1c0c
pass_rate_1: 30.8
@ -382,6 +407,7 @@
versions: 0.31.2-dev
seconds_per_case: 46.9
total_cost: 0.0000
- dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole
test_cases: 133
model: WizardLM-2 8x22B
@ -406,7 +432,8 @@
- dirname: 2024-05-13-17-39-05--gpt-4o-diff
test_cases: 133
model: openai/gpt-4o
model: gpt-4o
released: 2024-05-13
edit_format: diff
commit_hash: b6cd852
pass_rate_1: 60.2
@ -426,3 +453,25 @@
seconds_per_case: 6.0
total_cost: 0.0000
- dirname: 2024-04-12-22-18-20--gpt-4-turbo-2024-04-09-plain-diff
test_cases: 33
model: gpt-4-turbo-2024-04-09 (diff)
edit_format: diff
commit_hash: 9b2e697-dirty
pass_rate_1: 48.5
pass_rate_2: 57.6
percent_cases_well_formed: 100.0
error_outputs: 15
num_malformed_responses: 0
user_asks: 15
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4-turbo-2024-04-09
date: 2024-04-12
versions: 0.28.1-dev
seconds_per_case: 17.6
total_cost: 1.6205

View file

@ -40,7 +40,7 @@
total_cost: 27.9176
- dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09
test_cases: 88
model: gpt-4-turbo-2024-04-09
model: gpt-4-turbo-2024-04-09 (udiff)
edit_format: udiff
commit_hash: b75fdb9
pass_rate_1: 34.1
@ -103,7 +103,7 @@
- dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff
test_cases: 89
model: openai/gpt-4o
model: gpt-4o
edit_format: diff
commit_hash: b6cd852
pass_rate_1: 62.9
@ -120,4 +120,27 @@
date: 2024-05-13
versions: 0.34.1-dev
seconds_per_case: 27.8
total_cost: 0.0000
total_cost: 0.0000
- dirname: 2024-04-10-13-26-18--refac-gpt-4-turbo-2024-04-09-diff
test_cases: 88
model: gpt-4-turbo-2024-04-09 (diff)
edit_format: diff
commit_hash: 7875418
pass_rate_1: 21.4
percent_cases_well_formed: 6.8
error_outputs: 247
num_malformed_responses: 82
user_asks: 1
lazy_comments: 2
syntax_errors: 3
indentation_errors: 8
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4-turbo-2024-04-09
date: 2024-04-10
versions: 0.28.1-dev
seconds_per_case: 67.8
total_cost: 20.4889

View file

@ -67,7 +67,7 @@ def get_parser(default_config_files, git_root):
const=gpt_4_model,
help=f"Use {gpt_4_model} model for the main chat",
)
gpt_4o_model = "openai/gpt-4o"
gpt_4o_model = "gpt-4o"
group.add_argument(
"--4o",
action="store_const",

View file

@ -11,7 +11,7 @@ from PIL import Image
from aider.dump import dump # noqa: F401
from aider.litellm import litellm
DEFAULT_MODEL_NAME = "openai/gpt-4o"
DEFAULT_MODEL_NAME = "gpt-4o"
@dataclass
@ -94,6 +94,16 @@ MODEL_SETTINGS = [
lazy=True,
reminder_as_sys_msg=True,
),
ModelSettings(
"gpt-4o",
"diff",
weak_model_name="gpt-3.5-turbo",
use_repo_map=True,
send_undo_reply=True,
accepts_images=True,
lazy=True,
reminder_as_sys_msg=True,
),
ModelSettings(
"gpt-4-0125-preview",
"udiff",

1742
assets/models-over-time.svg Normal file

File diff suppressed because it is too large Load diff

After

Width:  |  Height:  |  Size: 53 KiB

57
benchmark/over_time.py Normal file
View file

@ -0,0 +1,57 @@
import matplotlib.pyplot as plt
import yaml
from imgcat import imgcat
from matplotlib import rc
def plot_over_time(yaml_file):
    """Scatter-plot benchmark pass rate against model release date.

    Reads a YAML list of benchmark result entries from ``yaml_file`` and plots
    every entry that has both a ``released`` and a ``pass_rate_2`` key. Each
    point is labelled with the entry's ``model`` value, truncated at the first
    "(" so suffixes such as "(diff)" are dropped. Points are red when the
    label contains "gpt-4", green when it contains "gpt-3.5", blue otherwise.

    The figure is written to ``tmp_over_time.png`` and ``tmp_over_time.svg``
    (paths relative to the current working directory) and then passed to
    ``imgcat``.

    Args:
        yaml_file: Path to the YAML results file.
    """
    with open(yaml_file, "r") as file:
        # safe_load returns None for an empty document; treat that as
        # "no entries" instead of failing when iterating below.
        data = yaml.safe_load(file) or []

    dates = []
    pass_rates = []
    models = []

    for entry in data:
        # Only entries with a known release date and a final pass rate are plotted.
        if "released" in entry and "pass_rate_2" in entry:
            dates.append(entry["released"])
            pass_rates.append(entry["pass_rate_2"])
            # Drop any parenthesised suffix, e.g. "gpt-4-turbo-2024-04-09 (udiff)".
            models.append(entry["model"].split("(")[0].strip())

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.grid(axis="y", zorder=0, lw=0.2)
    for spine in ax.spines.values():
        spine.set_edgecolor("#DDDDDD")
        spine.set_linewidth(0.5)

    # "gpt-4" is tested first, so e.g. "gpt-4o" is red rather than blue.
    colors = [
        "red" if "gpt-4" in model else "green" if "gpt-3.5" in model else "blue" for model in models
    ]
    ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120)

    # Label each point with its model name, nudged up and to the right.
    for model, released, pass_rate in zip(models, dates, pass_rates):
        ax.annotate(
            model,
            (released, pass_rate),
            fontsize=12,
            alpha=0.75,
            xytext=(5, 5),
            textcoords="offset points",
        )

    ax.set_xlabel("Model release date", fontsize=18)
    ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18)
    ax.set_title("LLM code editing skill by model release date", fontsize=20)

    plt.tight_layout()
    plt.savefig("tmp_over_time.png")
    plt.savefig("tmp_over_time.svg")
    imgcat(fig)
# Script entry point. There is no `__main__` guard, so this call runs whenever
# the module is loaded, including on import.
# NOTE(review): the path is relative, so this presumably expects to be run from
# the repository root - confirm.
plot_over_time("_data/edit_leaderboard.yml")

View file

@ -185,6 +185,10 @@ Therefore, results are available for fewer models.
</script>
## LLM code editing skill by model release date
[![LLM code editing skill by model release date](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg)
## Notes on benchmarking results

View file

@ -1,4 +1,4 @@
[pytest]
norecursedirs = tmp.* build benchmark
norecursedirs = tmp.* build benchmark _site OLD
addopts = -p no:warnings

20
scripts/Dockerfile.jekyll Normal file
View file

@ -0,0 +1,20 @@
# Image for building and serving the Jekyll site locally.
# Build from the repository root so the Gemfile and site sources are in the
# build context (see scripts/jekyll_build.sh).

# Use the official Jekyll image from Docker Hub
FROM jekyll/jekyll:latest

# Set the working directory
WORKDIR /srv/jekyll

# Copy only the dependency manifest first, so the gem-install layer below is
# reused from cache until the Gemfile (or an optional Gemfile.lock) changes.
# The trailing "*" lets the COPY succeed when no Gemfile.lock exists.
COPY Gemfile Gemfile.lock* /srv/jekyll/

# Install any needed packages specified in Gemfile
RUN bundle install

# Copy the rest of the build context into the container at /srv/jekyll
COPY . /srv/jekyll

# Expose port 4000 to the host
EXPOSE 4000

# Health check to ensure the server is running
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s CMD curl -f http://localhost:4000 || exit 1

# Run Jekyll server
CMD ["jekyll", "serve", "--host", "0.0.0.0", "--port", "4000", "--verbose"]

4
scripts/jekyll_build.sh Executable file
View file

@ -0,0 +1,4 @@
#!/bin/bash

# Build the image tagged "my-jekyll-site" from scripts/Dockerfile.jekyll,
# using the current directory as the build context.
docker build \
    --tag my-jekyll-site \
    --file scripts/Dockerfile.jekyll \
    .

4
scripts/jekyll_run.sh Executable file
View file

@ -0,0 +1,4 @@
#!/bin/bash

# Start a throwaway (--rm) interactive container from the "my-jekyll-site"
# image with the current directory mounted at /srv/jekyll. The entrypoint is
# overridden with /bin/bash, so this opens a shell in the container rather
# than running the image's default command.
# NOTE(review): host networking and --publish are both requested; Docker is
# documented to discard published ports in host network mode - confirm which
# of the two is actually intended.
docker run \
    --rm \
    --network host \
    --volume "$PWD:/srv/jekyll" \
    --publish 4000:4000 \
    --entrypoint /bin/bash \
    --interactive \
    --tty \
    my-jekyll-site