mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-30 01:04:59 +00:00
Merge branch 'main' into swe-bench
This commit is contained in:
commit
df84bcf38b
15 changed files with 1945 additions and 8 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -2,3 +2,6 @@
|
||||||
.aider*
|
.aider*
|
||||||
aider_chat.egg-info/
|
aider_chat.egg-info/
|
||||||
build
|
build
|
||||||
|
Gemfile.lock
|
||||||
|
_site
|
||||||
|
.jekyll-cache/
|
||||||
|
|
|
@ -13,6 +13,19 @@ Please submit bug reports and feature requests as GitHub issues. This
|
||||||
helps us to keep track of them and discuss potential solutions or
|
helps us to keep track of them and discuss potential solutions or
|
||||||
enhancements.
|
enhancements.
|
||||||
|
|
||||||
|
LLM Benchmark Results
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
Contributions of
|
||||||
|
[LLM benchmark results](https://aider.chat/docs/leaderboards/)
|
||||||
|
are welcome!
|
||||||
|
See the
|
||||||
|
[benchmark README](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md)
|
||||||
|
for information on running aider's code editing benchmarks.
|
||||||
|
Submit results by opening a PR with edits to the
|
||||||
|
[benchmark results data files](https://github.com/paul-gauthier/aider/blob/main/_data/).
|
||||||
|
|
||||||
|
|
||||||
Pull Requests
|
Pull Requests
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
|
|
5
Gemfile
Normal file
5
Gemfile
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
source 'https://rubygems.org'
|
||||||
|
gem 'jekyll'
|
||||||
|
gem 'jekyll-theme-cayman'
|
||||||
|
gem 'jekyll-redirect-from'
|
||||||
|
gem 'jekyll-sitemap'
|
|
@ -11,3 +11,6 @@ defaults:
|
||||||
type: "pages"
|
type: "pages"
|
||||||
values:
|
values:
|
||||||
description: "A command-line chat tool for coding with GPT"
|
description: "A command-line chat tool for coding with GPT"
|
||||||
|
|
||||||
|
exclude:
|
||||||
|
- tmp.benchmarks
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
- dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence
|
- dirname: 2024-05-01-20-05-59--direct-opus-filenames-outside-fence
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: claude-3-opus-20240229
|
model: claude-3-opus-20240229
|
||||||
|
released: 2024-02-29
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: f4b1797-dirty, f4b1797
|
commit_hash: f4b1797-dirty, f4b1797
|
||||||
pass_rate_1: 53.4
|
pass_rate_1: 53.4
|
||||||
|
@ -19,9 +20,11 @@
|
||||||
versions: 0.30.2-dev
|
versions: 0.30.2-dev
|
||||||
seconds_per_case: 32.4
|
seconds_per_case: 32.4
|
||||||
total_cost: 13.8395
|
total_cost: 13.8395
|
||||||
|
|
||||||
- dirname: 2024-03-06-16-42-00--claude3-sonnet-whole
|
- dirname: 2024-03-06-16-42-00--claude3-sonnet-whole
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: claude-3-sonnet-20240229
|
model: claude-3-sonnet-20240229
|
||||||
|
released: 2024-02-29
|
||||||
edit_format: whole
|
edit_format: whole
|
||||||
commit_hash: a5f8076-dirty
|
commit_hash: a5f8076-dirty
|
||||||
pass_rate_1: 43.6
|
pass_rate_1: 43.6
|
||||||
|
@ -40,9 +43,11 @@
|
||||||
versions: 0.25.1-dev
|
versions: 0.25.1-dev
|
||||||
seconds_per_case: 23.1
|
seconds_per_case: 23.1
|
||||||
total_cost: 0.0000
|
total_cost: 0.0000
|
||||||
|
|
||||||
- dirname: 2024-04-29-19-17-28--deepseek-coder-whole
|
- dirname: 2024-04-29-19-17-28--deepseek-coder-whole
|
||||||
test_cases: 132
|
test_cases: 132
|
||||||
model: deepseek-coder
|
model: deepseek-coder
|
||||||
|
released: 2024-01-25
|
||||||
edit_format: whole
|
edit_format: whole
|
||||||
commit_hash: c07f793-dirty
|
commit_hash: c07f793-dirty
|
||||||
pass_rate_1: 47.0
|
pass_rate_1: 47.0
|
||||||
|
@ -61,6 +66,7 @@
|
||||||
versions: 0.30.2-dev
|
versions: 0.30.2-dev
|
||||||
seconds_per_case: 26.7
|
seconds_per_case: 26.7
|
||||||
total_cost: 0.0000
|
total_cost: 0.0000
|
||||||
|
|
||||||
- dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced
|
- dirname: 2024-05-03-20-47-24--gemini-1.5-pro-diff-fenced
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gemini-1.5-pro-latest
|
model: gemini-1.5-pro-latest
|
||||||
|
@ -86,6 +92,7 @@
|
||||||
- dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole
|
- dirname: 2024-05-08-20-59-15--may-gpt-3.5-turbo-whole
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gpt-3.5-turbo-0125
|
model: gpt-3.5-turbo-0125
|
||||||
|
released: 2024-01-25
|
||||||
edit_format: whole
|
edit_format: whole
|
||||||
commit_hash: 1d55f74
|
commit_hash: 1d55f74
|
||||||
pass_rate_1: 41.4
|
pass_rate_1: 41.4
|
||||||
|
@ -108,6 +115,7 @@
|
||||||
- dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301
|
- dirname: 2023-11-06-21-23-59--gpt-3.5-turbo-0301
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gpt-3.5-turbo-0301
|
model: gpt-3.5-turbo-0301
|
||||||
|
released: 2023-03-01
|
||||||
edit_format: whole
|
edit_format: whole
|
||||||
commit_hash: 44388db-dirty
|
commit_hash: 44388db-dirty
|
||||||
pass_rate_1: 50.4
|
pass_rate_1: 50.4
|
||||||
|
@ -126,9 +134,11 @@
|
||||||
versions: 0.16.4-dev
|
versions: 0.16.4-dev
|
||||||
seconds_per_case: 6.5
|
seconds_per_case: 6.5
|
||||||
total_cost: 0.4822
|
total_cost: 0.4822
|
||||||
|
|
||||||
- dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613
|
- dirname: 2023-11-07-02-41-07--gpt-3.5-turbo-0613
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gpt-3.5-turbo-0613
|
model: gpt-3.5-turbo-0613
|
||||||
|
released: 2023-06-13
|
||||||
edit_format: whole
|
edit_format: whole
|
||||||
commit_hash: 93aa497-dirty
|
commit_hash: 93aa497-dirty
|
||||||
pass_rate_1: 38.3
|
pass_rate_1: 38.3
|
||||||
|
@ -168,9 +178,11 @@
|
||||||
versions: 0.30.2-dev
|
versions: 0.30.2-dev
|
||||||
seconds_per_case: 5.3
|
seconds_per_case: 5.3
|
||||||
total_cost: 0.3261
|
total_cost: 0.3261
|
||||||
|
|
||||||
- dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff
|
- dirname: 2024-01-25-23-37-15--jan-exercism-gpt-4-0125-preview-udiff
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gpt-4-0125-preview
|
model: gpt-4-0125-preview
|
||||||
|
released: 2024-01-25
|
||||||
edit_format: udiff
|
edit_format: udiff
|
||||||
commit_hash: edcf9b1
|
commit_hash: edcf9b1
|
||||||
pass_rate_1: 55.6
|
pass_rate_1: 55.6
|
||||||
|
@ -189,9 +201,11 @@
|
||||||
versions: 0.22.1-dev
|
versions: 0.22.1-dev
|
||||||
seconds_per_case: 44.8
|
seconds_per_case: 44.8
|
||||||
total_cost: 14.6428
|
total_cost: 14.6428
|
||||||
|
|
||||||
- dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules
|
- dirname: 2024-05-04-15-07-30--redo-gpt-4-0314-diff-reminder-rules
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gpt-4-0314
|
model: gpt-4-0314
|
||||||
|
released: 2023-03-14
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: 0d43468
|
commit_hash: 0d43468
|
||||||
pass_rate_1: 50.4
|
pass_rate_1: 50.4
|
||||||
|
@ -210,9 +224,11 @@
|
||||||
versions: 0.31.2-dev
|
versions: 0.31.2-dev
|
||||||
seconds_per_case: 19.8
|
seconds_per_case: 19.8
|
||||||
total_cost: 16.2689
|
total_cost: 16.2689
|
||||||
|
|
||||||
- dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main
|
- dirname: 2023-12-16-21-24-28--editblock-gpt-4-0613-actual-main
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gpt-4-0613
|
model: gpt-4-0613
|
||||||
|
released: 2023-06-13
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: 3aa17c4
|
commit_hash: 3aa17c4
|
||||||
pass_rate_1: 46.6
|
pass_rate_1: 46.6
|
||||||
|
@ -235,6 +251,7 @@
|
||||||
- dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff
|
- dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gpt-4-1106-preview
|
model: gpt-4-1106-preview
|
||||||
|
released: 2023-11-06
|
||||||
edit_format: udiff
|
edit_format: udiff
|
||||||
commit_hash: 87664dc
|
commit_hash: 87664dc
|
||||||
pass_rate_1: 51.9
|
pass_rate_1: 51.9
|
||||||
|
@ -256,7 +273,8 @@
|
||||||
|
|
||||||
- dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples
|
- dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: gpt-4-turbo-2024-04-09
|
model: gpt-4-turbo-2024-04-09 (udiff)
|
||||||
|
released: 2024-04-09
|
||||||
edit_format: udiff
|
edit_format: udiff
|
||||||
commit_hash: e610e5b-dirty
|
commit_hash: e610e5b-dirty
|
||||||
pass_rate_1: 48.1
|
pass_rate_1: 48.1
|
||||||
|
@ -275,9 +293,11 @@
|
||||||
versions: 0.30.2-dev
|
versions: 0.30.2-dev
|
||||||
seconds_per_case: 22.8
|
seconds_per_case: 22.8
|
||||||
total_cost: 6.3337
|
total_cost: 6.3337
|
||||||
|
|
||||||
- dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg
|
- dirname: 2024-05-03-22-24-48--openrouter--llama3-diff-examples-sys-msg
|
||||||
test_cases: 132
|
test_cases: 132
|
||||||
model: llama3-70b-8192
|
model: llama3-70b-8192
|
||||||
|
released: 2024-04-18
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: b5bb453
|
commit_hash: b5bb453
|
||||||
pass_rate_1: 38.6
|
pass_rate_1: 38.6
|
||||||
|
@ -296,9 +316,11 @@
|
||||||
versions: 0.31.2-dev
|
versions: 0.31.2-dev
|
||||||
seconds_per_case: 14.5
|
seconds_per_case: 14.5
|
||||||
total_cost: 0.4311
|
total_cost: 0.4311
|
||||||
|
|
||||||
- dirname: 2024-05-06-18-31-08--command-r-plus-whole-final
|
- dirname: 2024-05-06-18-31-08--command-r-plus-whole-final
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: command-r-plus
|
model: command-r-plus
|
||||||
|
released: 2024-04-04
|
||||||
edit_format: whole
|
edit_format: whole
|
||||||
commit_hash: fc3a43e-dirty
|
commit_hash: fc3a43e-dirty
|
||||||
pass_rate_1: 21.8
|
pass_rate_1: 21.8
|
||||||
|
@ -317,6 +339,7 @@
|
||||||
versions: 0.31.2-dev
|
versions: 0.31.2-dev
|
||||||
seconds_per_case: 22.9
|
seconds_per_case: 22.9
|
||||||
total_cost: 2.7494
|
total_cost: 2.7494
|
||||||
|
|
||||||
- dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole
|
- dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: deepseek-chat v2 (whole)
|
model: deepseek-chat v2 (whole)
|
||||||
|
@ -342,6 +365,7 @@
|
||||||
- dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2
|
- dirname: 2024-05-09-18-57-52--deepseek-chat-v2-diff-reverted-and-helpful-assistant2
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: deepseek-chat v2 (diff)
|
model: deepseek-chat v2 (diff)
|
||||||
|
released: 2024-05-06
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: 80a3f6d
|
commit_hash: 80a3f6d
|
||||||
pass_rate_1: 44.4
|
pass_rate_1: 44.4
|
||||||
|
@ -364,6 +388,7 @@
|
||||||
- dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole
|
- dirname: 2024-05-07-20-32-37--qwen1.5-110b-chat-whole
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: qwen1.5-110b-chat
|
model: qwen1.5-110b-chat
|
||||||
|
released: 2024-02-04
|
||||||
edit_format: whole
|
edit_format: whole
|
||||||
commit_hash: 70b1c0c
|
commit_hash: 70b1c0c
|
||||||
pass_rate_1: 30.8
|
pass_rate_1: 30.8
|
||||||
|
@ -382,6 +407,7 @@
|
||||||
versions: 0.31.2-dev
|
versions: 0.31.2-dev
|
||||||
seconds_per_case: 46.9
|
seconds_per_case: 46.9
|
||||||
total_cost: 0.0000
|
total_cost: 0.0000
|
||||||
|
|
||||||
- dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole
|
- dirname: 2024-05-07-20-57-04--wizardlm-2-8x22b-whole
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: WizardLM-2 8x22B
|
model: WizardLM-2 8x22B
|
||||||
|
@ -406,7 +432,8 @@
|
||||||
|
|
||||||
- dirname: 2024-05-13-17-39-05--gpt-4o-diff
|
- dirname: 2024-05-13-17-39-05--gpt-4o-diff
|
||||||
test_cases: 133
|
test_cases: 133
|
||||||
model: openai/gpt-4o
|
model: gpt-4o
|
||||||
|
released: 2024-05-13
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: b6cd852
|
commit_hash: b6cd852
|
||||||
pass_rate_1: 60.2
|
pass_rate_1: 60.2
|
||||||
|
@ -426,3 +453,25 @@
|
||||||
seconds_per_case: 6.0
|
seconds_per_case: 6.0
|
||||||
total_cost: 0.0000
|
total_cost: 0.0000
|
||||||
|
|
||||||
|
- dirname: 2024-04-12-22-18-20--gpt-4-turbo-2024-04-09-plain-diff
|
||||||
|
test_cases: 33
|
||||||
|
model: gpt-4-turbo-2024-04-09 (diff)
|
||||||
|
edit_format: diff
|
||||||
|
commit_hash: 9b2e697-dirty
|
||||||
|
pass_rate_1: 48.5
|
||||||
|
pass_rate_2: 57.6
|
||||||
|
percent_cases_well_formed: 100.0
|
||||||
|
error_outputs: 15
|
||||||
|
num_malformed_responses: 0
|
||||||
|
user_asks: 15
|
||||||
|
lazy_comments: 0
|
||||||
|
syntax_errors: 0
|
||||||
|
indentation_errors: 0
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 0
|
||||||
|
command: aider --model gpt-4-turbo-2024-04-09
|
||||||
|
date: 2024-04-12
|
||||||
|
versions: 0.28.1-dev
|
||||||
|
seconds_per_case: 17.6
|
||||||
|
total_cost: 1.6205
|
||||||
|
|
|
@ -40,7 +40,7 @@
|
||||||
total_cost: 27.9176
|
total_cost: 27.9176
|
||||||
- dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09
|
- dirname: 2024-04-09-21-49-54--refac-gpt-4-turbo-2024-04-09
|
||||||
test_cases: 88
|
test_cases: 88
|
||||||
model: gpt-4-turbo-2024-04-09
|
model: gpt-4-turbo-2024-04-09 (udiff)
|
||||||
edit_format: udiff
|
edit_format: udiff
|
||||||
commit_hash: b75fdb9
|
commit_hash: b75fdb9
|
||||||
pass_rate_1: 34.1
|
pass_rate_1: 34.1
|
||||||
|
@ -103,7 +103,7 @@
|
||||||
|
|
||||||
- dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff
|
- dirname: 2024-05-13-17-42-22--refac-gpt-4o-diff
|
||||||
test_cases: 89
|
test_cases: 89
|
||||||
model: openai/gpt-4o
|
model: gpt-4o
|
||||||
edit_format: diff
|
edit_format: diff
|
||||||
commit_hash: b6cd852
|
commit_hash: b6cd852
|
||||||
pass_rate_1: 62.9
|
pass_rate_1: 62.9
|
||||||
|
@ -121,3 +121,26 @@
|
||||||
versions: 0.34.1-dev
|
versions: 0.34.1-dev
|
||||||
seconds_per_case: 27.8
|
seconds_per_case: 27.8
|
||||||
total_cost: 0.0000
|
total_cost: 0.0000
|
||||||
|
|
||||||
|
- dirname: 2024-04-10-13-26-18--refac-gpt-4-turbo-2024-04-09-diff
|
||||||
|
test_cases: 88
|
||||||
|
model: gpt-4-turbo-2024-04-09 (diff)
|
||||||
|
edit_format: diff
|
||||||
|
commit_hash: 7875418
|
||||||
|
pass_rate_1: 21.4
|
||||||
|
percent_cases_well_formed: 6.8
|
||||||
|
error_outputs: 247
|
||||||
|
num_malformed_responses: 82
|
||||||
|
user_asks: 1
|
||||||
|
lazy_comments: 2
|
||||||
|
syntax_errors: 3
|
||||||
|
indentation_errors: 8
|
||||||
|
exhausted_context_windows: 0
|
||||||
|
test_timeouts: 0
|
||||||
|
command: aider --model gpt-4-turbo-2024-04-09
|
||||||
|
date: 2024-04-10
|
||||||
|
versions: 0.28.1-dev
|
||||||
|
seconds_per_case: 67.8
|
||||||
|
total_cost: 20.4889
|
||||||
|
|
||||||
|
|
|
@ -67,7 +67,7 @@ def get_parser(default_config_files, git_root):
|
||||||
const=gpt_4_model,
|
const=gpt_4_model,
|
||||||
help=f"Use {gpt_4_model} model for the main chat",
|
help=f"Use {gpt_4_model} model for the main chat",
|
||||||
)
|
)
|
||||||
gpt_4o_model = "openai/gpt-4o"
|
gpt_4o_model = "gpt-4o"
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
"--4o",
|
"--4o",
|
||||||
action="store_const",
|
action="store_const",
|
||||||
|
|
|
@ -11,7 +11,7 @@ from PIL import Image
|
||||||
from aider.dump import dump # noqa: F401
|
from aider.dump import dump # noqa: F401
|
||||||
from aider.litellm import litellm
|
from aider.litellm import litellm
|
||||||
|
|
||||||
DEFAULT_MODEL_NAME = "openai/gpt-4o"
|
DEFAULT_MODEL_NAME = "gpt-4o"
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -94,6 +94,16 @@ MODEL_SETTINGS = [
|
||||||
lazy=True,
|
lazy=True,
|
||||||
reminder_as_sys_msg=True,
|
reminder_as_sys_msg=True,
|
||||||
),
|
),
|
||||||
|
ModelSettings(
|
||||||
|
"gpt-4o",
|
||||||
|
"diff",
|
||||||
|
weak_model_name="gpt-3.5-turbo",
|
||||||
|
use_repo_map=True,
|
||||||
|
send_undo_reply=True,
|
||||||
|
accepts_images=True,
|
||||||
|
lazy=True,
|
||||||
|
reminder_as_sys_msg=True,
|
||||||
|
),
|
||||||
ModelSettings(
|
ModelSettings(
|
||||||
"gpt-4-0125-preview",
|
"gpt-4-0125-preview",
|
||||||
"udiff",
|
"udiff",
|
||||||
|
|
1742
assets/models-over-time.svg
Normal file
1742
assets/models-over-time.svg
Normal file
File diff suppressed because it is too large
Load diff
After Width: | Height: | Size: 53 KiB |
57
benchmark/over_time.py
Normal file
57
benchmark/over_time.py
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import yaml
|
||||||
|
from imgcat import imgcat
|
||||||
|
from matplotlib import rc
|
||||||
|
|
||||||
|
|
||||||
|
def plot_over_time(yaml_file):
    """Scatter-plot benchmark pass rate against model release date.

    Reads the list of result entries in ``yaml_file`` and plots one labelled
    point for every entry that has both a ``released`` and a ``pass_rate_2``
    key. The figure is written to ``tmp_over_time.png`` and
    ``tmp_over_time.svg`` (relative to the current working directory) and is
    then handed to ``imgcat``.

    Args:
        yaml_file: Path of the YAML results file to read.
    """
    with open(yaml_file, "r") as file:
        # safe_load() returns None for an empty document; fall back to an
        # empty list so the loop below does not raise TypeError.
        data = yaml.safe_load(file) or []

    dates = []
    pass_rates = []
    models = []

    for entry in data:
        # Only entries carrying both keys can be placed on the chart.
        if "released" in entry and "pass_rate_2" in entry:
            dates.append(entry["released"])
            pass_rates.append(entry["pass_rate_2"])
            # Drop any parenthesised suffix, e.g. "gpt-4-turbo-2024-04-09 (udiff)".
            models.append(entry["model"].split("(")[0].strip())

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"

    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.grid(axis="y", zorder=0, lw=0.2)
    for spine in ax.spines.values():
        spine.set_edgecolor("#DDDDDD")
        spine.set_linewidth(0.5)

    # Colour by model family: gpt-4* red, gpt-3.5* green, everything else blue.
    colors = [
        "red" if "gpt-4" in model else "green" if "gpt-3.5" in model else "blue"
        for model in models
    ]
    ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120)

    # Label each point with its model name, nudged up and to the right.
    for i, model in enumerate(models):
        ax.annotate(
            model,
            (dates[i], pass_rates[i]),
            fontsize=12,
            alpha=0.75,
            xytext=(5, 5),
            textcoords="offset points",
        )

    ax.set_xlabel("Model release date", fontsize=18)
    ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18)
    ax.set_title("LLM code editing skill by model release date", fontsize=20)
    plt.tight_layout()
    plt.savefig("tmp_over_time.png")
    plt.savefig("tmp_over_time.svg")
    imgcat(fig)
|
||||||
|
|
||||||
|
|
||||||
|
# Example usage
|
||||||
|
plot_over_time("_data/edit_leaderboard.yml")
|
|
@ -185,6 +185,10 @@ Therefore, results are available for fewer models.
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
|
||||||
|
## LLM code editing skill by model release date
|
||||||
|
|
||||||
|
[![LLM code editing skill by model release date](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg)
|
||||||
|
|
||||||
|
|
||||||
## Notes on benchmarking results
|
## Notes on benchmarking results
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
[pytest]
|
[pytest]
|
||||||
norecursedirs = tmp.* build benchmark
|
norecursedirs = tmp.* build benchmark _site OLD
|
||||||
addopts = -p no:warnings
|
addopts = -p no:warnings
|
||||||
|
|
||||||
|
|
20
scripts/Dockerfile.jekyll
Normal file
20
scripts/Dockerfile.jekyll
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# Use the official Jekyll image from Docker Hub
FROM jekyll/jekyll:latest

# Set the working directory
WORKDIR /srv/jekyll

# Copy only the Gemfile (and Gemfile.lock, if present) first, so the
# dependency layer below is rebuilt only when the gem list changes
COPY Gemfile* /srv/jekyll/

# Install any needed packages specified in Gemfile
RUN bundle install

# Copy the current directory contents into the container at /srv/jekyll
COPY . /srv/jekyll

# Document that the server listens on port 4000; EXPOSE does not publish
# the port, so use `docker run -p 4000:4000` to reach it from the host
EXPOSE 4000

# Health check to ensure the server is running
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s CMD curl -f http://localhost:4000 || exit 1

# Run Jekyll server
CMD ["jekyll", "serve", "--host", "0.0.0.0", "--port", "4000", "--verbose"]
|
4
scripts/jekyll_build.sh
Executable file
4
scripts/jekyll_build.sh
Executable file
|
@ -0,0 +1,4 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Build the Docker image; run this from the repository root so that "." is the build context
|
||||||
|
docker build -t my-jekyll-site -f scripts/Dockerfile.jekyll .
|
4
scripts/jekyll_run.sh
Executable file
4
scripts/jekyll_run.sh
Executable file
|
@ -0,0 +1,4 @@
|
||||||
|
#!/bin/bash

# Run the Docker container with an interactive bash shell, mounting the
# current directory at /srv/jekyll and publishing port 4000 to the host.
# Host networking is deliberately not used: Docker discards -p port
# mappings in host network mode, so the two options must not be combined.
docker run --rm -v "$PWD:/srv/jekyll" -p 4000:4000 --entrypoint /bin/bash -it my-jekyll-site
|
Loading…
Add table
Add a link
Reference in a new issue