mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-29 08:44:59 +00:00
Merge branch 'main' into swe-bench
This commit is contained in:
commit
df84bcf38b
15 changed files with 1945 additions and 8 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -2,3 +2,6 @@
|
|||
.aider*
|
||||
aider_chat.egg-info/
|
||||
build
|
||||
Gemfile.lock
|
||||
_site
|
||||
.jekyll-cache/
|
||||
|
|
|
@ -13,6 +13,19 @@ Please submit bug reports and feature requests as GitHub issues. This
|
|||
helps us to keep track of them and discuss potential solutions or
|
||||
enhancements.
|
||||
|
||||
LLM Benchmark Results
|
||||
---------------------
|
||||
|
||||
Contributions of
|
||||
[LLM benchmark results](https://aider.chat/docs/leaderboards/)
|
||||
are welcome!
|
||||
See the
|
||||
[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
|
||||
for information on running aider's code editing benchmarks.
|
||||
Submit results by opening a PR with edits to the
|
||||
[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
|
||||
|
||||
|
||||
Pull Requests
|
||||
-------------
|
||||
|
||||
|
|
5
Gemfile
Normal file
5
Gemfile
Normal file
|
@ -0,0 +1,5 @@
|
|||
source 'https://rubygems.org'
|
||||
gem 'jekyll'
|
||||
gem 'jekyll-theme-cayman'
|
||||
gem 'jekyll-redirect-from'
|
||||
gem 'jekyll-sitemap'
|
|
@ -11,3 +11,6 @@ defaults:
|
|||
type: "pages"
|
||||
values:
|
||||
description: "A command-line chat tool for coding with GPT"
|
||||
|
||||
exclude:
|
||||
- tmp.benchmarks
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
- dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence
|
||||
test_cases: 133
|
||||
model: claude-3-opus-20240229
|
||||
released: 2024-02-29
|
||||
edit_format: diff
|
||||
commit_hash: f4b1797-dirty, f4b1797
|
||||
pass_rate_1: 53.4
|
||||
|
@ -19,9 +20,11 @@
|
|||
versions: 0.30.2-dev
|
||||
seconds_per_case: 32.4
|
||||
total_cost: 13.8395
|
||||
|
||||
- dirname: 2024-03-06-16-42-00--claude3-sonnet-whole
|
||||
test_cases: 133
|
||||
model: claude-3-sonnet-20240229
|
||||
released: 2024-02-29
|
||||
edit_format: whole
|
||||
commit_hash: a5f8076-dirty
|
||||
pass_rate_1: 43.6
|
||||
|
@ -40,9 +43,11 @@
|
|||
versions: 0.25.1-dev
|
||||
seconds_per_case: 23.1
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-04-29-19-17-28--deepseek-coder-whole
|
||||
test_cases: 132
|
||||
model: deepseek-coder
|
||||
released: 2024-01-25
|
||||
edit_format: whole
|
||||
commit_hash: c07f793-dirty
|
||||
pass_rate_1: 47.0
|
||||
|
@ -61,6 +66,7 @@
|
|||
versions: 0.30.2-dev
|
||||
seconds_per_case: 26.7
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced
|
||||
test_cases: 133
|
||||
model: gemini-1.5-pro-latest
|
||||
|
@ -86,6 +92,7 @@
|
|||
- dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole
|
||||
test_cases: 133
|
||||
model: gpt-3.5-turbo-0125
|
||||
released: 2024-01-25
|
||||
edit_format: whole
|
||||
commit_hash: 1d55f74
|
||||
pass_rate_1: 41.4
|
||||
|
@ -108,6 +115,7 @@
|
|||
- dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301
|
||||
test_cases: 133
|
||||
model: gpt-3.5-turbo-0301
|
||||
released: 2023-03-01
|
||||
edit_format: whole
|
||||
commit_hash: 44388db-dirty
|
||||
pass_rate_1: 50.4
|
||||
|
@ -126,9 +134,11 @@
|
|||
versions: 0.16.4-dev
|
||||
seconds_per_case: 6.5
|
||||
total_cost: 0.4822
|
||||
|
||||
- dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613
|
||||
test_cases: 133
|
||||
model: gpt-3.5-turbo-0613
|
||||
released: 2023-06-13
|
||||
edit_format: whole
|
||||
commit_hash: 93aa497-dirty
|
||||
pass_rate_1: 38.3
|
||||
|
@ -168,9 +178,11 @@
|
|||
versions: 0.30.2-dev
|
||||
seconds_per_case: 5.3
|
||||
total_cost: 0.3261
|
||||
|
||||
- dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff
|
||||
test_cases: 133
|
||||
model: gpt-4-0125-preview
|
||||
released: 2024-01-25
|
||||
edit_format: udiff
|
||||
commit_hash: edcf9b1
|
||||
pass_rate_1: 55.6
|
||||
|
@ -189,9 +201,11 @@
|
|||
versions: 0.22.1-dev
|
||||
seconds_per_case: 44.8
|
||||
total_cost: 14.6428
|
||||
|
||||
- dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules
|
||||
test_cases: 133
|
||||
model: gpt-4-0314
|
||||
released: 2023-03-14
|
||||
edit_format: diff
|
||||
commit_hash: 0d43468
|
||||
pass_rate_1: 50.4
|
||||
|
@ -210,9 +224,11 @@
|
|||
versions: 0.31.2-dev
|
||||
seconds_per_case: 19.8
|
||||
total_cost: 16.2689
|
||||
|
||||
- dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main
|
||||
test_cases: 133
|
||||
model: gpt-4-0613
|
||||
released: 2023-06-13
|
||||
edit_format: diff
|
||||
commit_hash: 3aa17c4
|
||||
pass_rate_1: 46.6
|
||||
|
@ -235,6 +251,7 @@
|
|||
- dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff
|
||||
test_cases: 133
|
||||
model: gpt-4-1106-preview
|
||||
released: 2023-11-06
|
||||
edit_format: udiff
|
||||
commit_hash: 87664dc
|
||||
pass_rate_1: 51.9
|
||||
|
@ -256,7 +273,8 @@
|
|||
|
||||
- dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples
|
||||
test_cases: 133
|
||||
model: gpt-4-turbo-2024-04-09
|
||||
model: gpt-4-turbo-2024-04-09 (udiff)
|
||||
released: 2024-04-09
|
||||
edit_format: udiff
|
||||
commit_hash: e610e5b-dirty
|
||||
pass_rate_1: 48.1
|
||||
|
@ -275,9 +293,11 @@
|
|||
versions: 0.30.2-dev
|
||||
seconds_per_case: 22.8
|
||||
total_cost: 6.3337
|
||||
|
||||
- dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg
|
||||
test_cases: 132
|
||||
model: llama3-70b-8192
|
||||
released: 2024-04-18
|
||||
edit_format: diff
|
||||
commit_hash: b5bb453
|
||||
pass_rate_1: 38.6
|
||||
|
@ -296,9 +316,11 @@
|
|||
versions: 0.31.2-dev
|
||||
seconds_per_case: 14.5
|
||||
total_cost: 0.4311
|
||||
|
||||
- dirname: 2024-05-06-18-31-08--command-r-plus-whole-final
|
||||
test_cases: 133
|
||||
model: command-r-plus
|
||||
released: 2024-04-04
|
||||
edit_format: whole
|
||||
commit_hash: fc3a43e-dirty
|
||||
pass_rate_1: 21.8
|
||||
|
@ -317,6 +339,7 @@
|
|||
versions: 0.31.2-dev
|
||||
seconds_per_case: 22.9
|
||||
total_cost: 2.7494
|
||||
|
||||
- dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole
|
||||
test_cases: 133
|
||||
model: deepseek-chat v2 (whole)
|
||||
|
@ -342,6 +365,7 @@
|
|||
- dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2
|
||||
test_cases: 133
|
||||
model: deepseek-chat v2 (diff)
|
||||
released: 2024-05-06
|
||||
edit_format: diff
|
||||
commit_hash: 80a3f6d
|
||||
pass_rate_1: 44.4
|
||||
|
@ -364,6 +388,7 @@
|
|||
- dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole
|
||||
test_cases: 133
|
||||
model: qwen1.5-110b-chat
|
||||
released: 2024-02-04
|
||||
edit_format: whole
|
||||
commit_hash: 70b1c0c
|
||||
pass_rate_1: 30.8
|
||||
|
@ -382,6 +407,7 @@
|
|||
versions: 0.31.2-dev
|
||||
seconds_per_case: 46.9
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole
|
||||
test_cases: 133
|
||||
model: WizardLM-2 8x22B
|
||||
|
@ -406,7 +432,8 @@
|
|||
|
||||
- dirname: 2024-05-13-17-39-05--gpt-4o-diff
|
||||
test_cases: 133
|
||||
model: openai/gpt-4o
|
||||
model: gpt-4o
|
||||
released: 2024-05-13
|
||||
edit_format: diff
|
||||
commit_hash: b6cd852
|
||||
pass_rate_1: 60.2
|
||||
|
@ -426,3 +453,25 @@
|
|||
seconds_per_case: 6.0
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-04-12-22-18-20--gpt-4-turbo-2024-04-09-plain-diff
|
||||
test_cases: 33
|
||||
model: gpt-4-turbo-2024-04-09 (diff)
|
||||
edit_format: diff
|
||||
commit_hash: 9b2e697-dirty
|
||||
pass_rate_1: 48.5
|
||||
pass_rate_2: 57.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 15
|
||||
num_malformed_responses: 0
|
||||
user_asks: 15
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4-turbo-2024-04-09
|
||||
date: 2024-04-12
|
||||
versions: 0.28.1-dev
|
||||
seconds_per_case: 17.6
|
||||
total_cost: 1.6205
|
||||
|
|
@ -40,7 +40,7 @@
|
|||
total_cost: 27.9176
|
||||
- dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09
|
||||
test_cases: 88
|
||||
model: gpt-4-turbo-2024-04-09
|
||||
model: gpt-4-turbo-2024-04-09 (udiff)
|
||||
edit_format: udiff
|
||||
commit_hash: b75fdb9
|
||||
pass_rate_1: 34.1
|
||||
|
@ -103,7 +103,7 @@
|
|||
|
||||
- dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff
|
||||
test_cases: 89
|
||||
model: openai/gpt-4o
|
||||
model: gpt-4o
|
||||
edit_format: diff
|
||||
commit_hash: b6cd852
|
||||
pass_rate_1: 62.9
|
||||
|
@ -121,3 +121,26 @@
|
|||
versions: 0.34.1-dev
|
||||
seconds_per_case: 27.8
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-04-10-13-26-18--refac-gpt-4-turbo-2024-04-09-diff
|
||||
test_cases: 88
|
||||
model: gpt-4-turbo-2024-04-09 (diff)
|
||||
edit_format: diff
|
||||
commit_hash: 7875418
|
||||
pass_rate_1: 21.4
|
||||
percent_cases_well_formed: 6.8
|
||||
error_outputs: 247
|
||||
num_malformed_responses: 82
|
||||
user_asks: 1
|
||||
lazy_comments: 2
|
||||
syntax_errors: 3
|
||||
indentation_errors: 8
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4-turbo-2024-04-09
|
||||
date: 2024-04-10
|
||||
versions: 0.28.1-dev
|
||||
seconds_per_case: 67.8
|
||||
total_cost: 20.4889
|
||||
|
||||
|
|
@ -67,7 +67,7 @@ def get_parser(default_config_files, git_root):
|
|||
const=gpt_4_model,
|
||||
help=f"Use {gpt_4_model} model for the main chat",
|
||||
)
|
||||
gpt_4o_model = "openai/gpt-4o"
|
||||
gpt_4o_model = "gpt-4o"
|
||||
group.add_argument(
|
||||
"--4o",
|
||||
action="store_const",
|
||||
|
|
|
@ -11,7 +11,7 @@ from PIL import Image
|
|||
from aider.dump import dump # noqa: F401
|
||||
from aider.litellm import litellm
|
||||
|
||||
DEFAULT_MODEL_NAME = "openai/gpt-4o"
|
||||
DEFAULT_MODEL_NAME = "gpt-4o"
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -94,6 +94,16 @@ MODEL_SETTINGS = [
|
|||
lazy=True,
|
||||
reminder_as_sys_msg=True,
|
||||
),
|
||||
ModelSettings(
|
||||
"gpt-4o",
|
||||
"diff",
|
||||
weak_model_name="gpt-3.5-turbo",
|
||||
use_repo_map=True,
|
||||
send_undo_reply=True,
|
||||
accepts_images=True,
|
||||
lazy=True,
|
||||
reminder_as_sys_msg=True,
|
||||
),
|
||||
ModelSettings(
|
||||
"gpt-4-0125-preview",
|
||||
"udiff",
|
||||
|
|
1742
assets/models-over-time.svg
Normal file
1742
assets/models-over-time.svg
Normal file
File diff suppressed because it is too large
Load diff
After Width: | Height: | Size: 53 KiB |
57
benchmark/over_time.py
Normal file
57
benchmark/over_time.py
Normal file
|
@ -0,0 +1,57 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import yaml
|
||||
from imgcat import imgcat
|
||||
from matplotlib import rc
|
||||
|
||||
|
||||
def plot_over_time(yaml_file):
    """Scatter-plot benchmark pass rates against model release dates.

    Loads the YAML document at ``yaml_file`` and keeps only the entries that
    carry both a ``released`` and a ``pass_rate_2`` key. Each kept entry is
    drawn as one labelled point. The figure is written to
    ``tmp_over_time.png`` and ``tmp_over_time.svg`` in the current working
    directory and then handed to ``imgcat``.
    """
    with open(yaml_file, "r") as handle:
        entries = yaml.safe_load(handle)

    # Entries lacking either key are left out of the plot entirely.
    plotted = [item for item in entries if "released" in item and "pass_rate_2" in item]
    dates = [item["released"] for item in plotted]
    pass_rates = [item["pass_rate_2"] for item in plotted]
    # Labels keep only the text before the first "(", e.g. "name (diff)" -> "name".
    models = [item["model"].split("(")[0].strip() for item in plotted]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    font_settings = {"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10}
    rc("font", **font_settings)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.grid(axis="y", zorder=0, lw=0.2)
    for border in ax.spines.values():
        border.set_edgecolor("#DDDDDD")
        border.set_linewidth(0.5)

    def _color_for(name):
        # "gpt-4" is tested first, then "gpt-3.5"; anything else is blue.
        if "gpt-4" in name:
            return "red"
        if "gpt-3.5" in name:
            return "green"
        return "blue"

    colors = [_color_for(name) for name in models]
    ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120)

    # Put each model name slightly up and to the right of its point.
    for label, released, rate in zip(models, dates, pass_rates):
        ax.annotate(
            label,
            (released, rate),
            fontsize=12,
            alpha=0.75,
            xytext=(5, 5),
            textcoords="offset points",
        )

    ax.set_xlabel("Model release date", fontsize=18)
    ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18)
    ax.set_title("LLM code editing skill by model release date", fontsize=20)
    plt.tight_layout()
    plt.savefig("tmp_over_time.png")
    plt.savefig("tmp_over_time.svg")
    imgcat(fig)
|
||||
|
||||
|
||||
# Example usage
|
||||
plot_over_time("_data/edit_leaderboard.yml")
|
|
@ -185,6 +185,10 @@ Therefore, results are available for fewer models.
|
|||
</script>
|
||||
|
||||
|
||||
## LLM code editing skill by model release date
|
||||
|
||||
[![LLM code editing skill by model release date](https://aider.chat/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg)
|
||||
|
||||
|
||||
## Notes on benchmarking results
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
[pytest]
|
||||
norecursedirs = tmp.* build benchmark
|
||||
norecursedirs = tmp.* build benchmark _site OLD
|
||||
addopts = -p no:warnings
|
||||
|
||||
|
|
20
scripts/Dockerfile.jekyll
Normal file
20
scripts/Dockerfile.jekyll
Normal file
|
@ -0,0 +1,20 @@
|
|||
# Use the official Jekyll image from Docker Hub
|
||||
FROM jekyll/jekyll:latest
|
||||
|
||||
# Set the working directory
|
||||
WORKDIR /srv/jekyll
|
||||
|
||||
# Copy the current directory contents into the container at /srv/jekyll
|
||||
COPY . /srv/jekyll
|
||||
|
||||
# Install any needed packages specified in Gemfile
|
||||
RUN bundle install
|
||||
|
||||
# Expose port 4000 to the host
|
||||
EXPOSE 4000
|
||||
|
||||
# Health check to ensure the server is running
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s CMD curl -f http://localhost:4000 || exit 1
|
||||
|
||||
# Run Jekyll server
|
||||
CMD ["jekyll", "serve", "--host", "0.0.0.0", "--port", "4000", "--verbose"]
|
4
scripts/jekyll_build.sh
Executable file
4
scripts/jekyll_build.sh
Executable file
|
@ -0,0 +1,4 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Build the Docker image
|
||||
docker build -t my-jekyll-site -f scripts/Dockerfile.jekyll .
|
4
scripts/jekyll_run.sh
Executable file
4
scripts/jekyll_run.sh
Executable file
|
@ -0,0 +1,4 @@
|
|||
#!/bin/bash

# Run the Docker container with an interactive bash shell, mounting the
# current directory at /srv/jekyll and publishing container port 4000 on the
# host.
#
# --network="host" is intentionally not used: Docker ignores -p/--publish
# under host networking (it only prints a warning), so combining the two
# left the port mapping without effect.
docker run --rm -v "$PWD:/srv/jekyll" -p 4000:4000 --entrypoint /bin/bash -it my-jekyll-site
|
Loading…
Add table
Add a link
Reference in a new issue