mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 18:25:00 +00:00
Merge branch 'main' into mixpanel
This commit is contained in:
commit
93b8cb9cec
24 changed files with 2527 additions and 313 deletions
|
@ -1,6 +1,15 @@
|
|||
|
||||
# Release history
|
||||
|
||||
### main branch
|
||||
|
||||
- Improved editing performance on Jupyter Notebook `.ipynb` files.
|
||||
- Work around litellm tokenizer bug for images.
|
||||
|
||||
### Aider v0.50.1
|
||||
|
||||
- Bugfix for provider API exceptions.
|
||||
|
||||
### Aider v0.50.0
|
||||
|
||||
- Infinite output for DeepSeek Coder, Mistral models in addition to Anthropic's models.
|
||||
|
|
|
@ -1 +1 @@
|
|||
__version__ = "0.50.1-dev"
|
||||
__version__ = "0.50.2-dev"
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import locale
|
||||
|
@ -657,9 +658,11 @@ class Coder:
|
|||
image_messages = []
|
||||
for fname, content in self.get_abs_fnames_content():
|
||||
if is_image_file(fname):
|
||||
with open(fname, "rb") as image_file:
|
||||
encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
|
||||
mime_type, _ = mimetypes.guess_type(fname)
|
||||
if mime_type and mime_type.startswith("image/"):
|
||||
image_url = f"data:{mime_type};base64,{content}"
|
||||
image_url = f"data:{mime_type};base64,{encoded_string}"
|
||||
rel_fname = self.get_rel_fname(fname)
|
||||
image_messages += [
|
||||
{"type": "text", "text": f"Image file: {rel_fname}"},
|
||||
|
@ -1014,7 +1017,8 @@ class Coder:
|
|||
)
|
||||
except Exception as err:
|
||||
self.io.tool_error(f"Unexpected error: {err}")
|
||||
traceback.print_exc()
|
||||
lines = traceback.format_exception(type(err), err, err.__traceback__)
|
||||
self.io.tool_error("".join(lines))
|
||||
return
|
||||
finally:
|
||||
if self.mdstream:
|
||||
|
@ -1249,6 +1253,7 @@ class Coder:
|
|||
|
||||
self.io.log_llm_history("TO LLM", format_messages(messages))
|
||||
|
||||
completion = None
|
||||
try:
|
||||
hash_object, completion = send_completion(
|
||||
model.name,
|
||||
|
|
|
@ -125,8 +125,8 @@ Every *SEARCH/REPLACE block* must use this format:
|
|||
7. The end of the replace block: >>>>>>> REPLACE
|
||||
8. The closing fence: {fence[1]}
|
||||
|
||||
Every *SEARCH* section must *EXACTLY MATCH* the existing source code, character for character, including all comments, docstrings, etc.
|
||||
|
||||
Every *SEARCH* section must *EXACTLY MATCH* the existing file content, character for character, including all comments, docstrings, etc.
|
||||
If the file contains code or other data wrapped/escaped in json/xml/quotes or other containers, you need to propose edits to the literal contents of the file, including the container markup.
|
||||
|
||||
*SEARCH/REPLACE* blocks will replace *all* matching occurrences.
|
||||
Include enough lines to make the SEARCH blocks uniquely match the lines to change.
|
||||
|
|
|
@ -729,7 +729,7 @@ class Commands:
|
|||
add = result.returncode != 0
|
||||
else:
|
||||
response = self.io.prompt_ask(
|
||||
"Add the output to the chat?\n(y/n/instructions)", default=""
|
||||
"Add the output to the chat?\n(Y/n/instructions)", default=""
|
||||
).strip()
|
||||
|
||||
if response.lower() in ["yes", "y"]:
|
||||
|
|
|
@ -328,6 +328,17 @@ def main(argv=None, input=None, output=None, force_git_root=None, return_coder=F
|
|||
parser = get_parser(default_config_files, git_root)
|
||||
args, unknown = parser.parse_known_args(argv)
|
||||
|
||||
if args.verbose:
|
||||
print("Config files search order, if no --config:")
|
||||
for file in default_config_files:
|
||||
exists = "(exists)" if Path(file).exists() else ""
|
||||
print(f" - {file} {exists}")
|
||||
|
||||
default_config_files.reverse()
|
||||
|
||||
parser = get_parser(default_config_files, git_root)
|
||||
args, unknown = parser.parse_known_args(argv)
|
||||
|
||||
# Load the .env file specified in the arguments
|
||||
loaded_dotenvs = load_dotenv_files(git_root, args.env_file)
|
||||
|
||||
|
|
|
@ -516,7 +516,11 @@ class Model:
|
|||
|
||||
def token_count(self, messages):
|
||||
if type(messages) is list:
|
||||
return litellm.token_counter(model=self.name, messages=messages)
|
||||
try:
|
||||
return litellm.token_counter(model=self.name, messages=messages)
|
||||
except Exception as err:
|
||||
print(f"Unable to count tokens: {err}")
|
||||
return 0
|
||||
|
||||
if not self.tokenizer:
|
||||
return
|
||||
|
|
|
@ -16,6 +16,15 @@ cog.out(text)
|
|||
|
||||
# Release history
|
||||
|
||||
### main branch
|
||||
|
||||
- Improved editing performance on Jupyter Notebook `.ipynb` files.
|
||||
- Work around litellm tokenizer bug for images.
|
||||
|
||||
### Aider v0.50.1
|
||||
|
||||
- Bugfix for provider API exceptions.
|
||||
|
||||
### Aider v0.50.0
|
||||
|
||||
- Infinite output for DeepSeek Coder, Mistral models in addition to Anthropic's models.
|
||||
|
|
927
aider/website/_data/code-in-json.yml
Normal file
927
aider/website/_data/code-in-json.yml
Normal file
|
@ -0,0 +1,927 @@
|
|||
- dirname: 2024-08-15-13-17-11--json-no-lint-gpt-4o-2024-08-06-whole
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 4.3
|
||||
total_cost: 0.7965
|
||||
- dirname: 2024-08-15-13-18-36--json-no-lint-gpt-4o-2024-08-06-func
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 57.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 5.7
|
||||
total_cost: 0.8417
|
||||
- dirname: 2024-08-15-13-21-55--json-no-lint-gpt-4o-2024-05-13-func
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 7.1
|
||||
total_cost: 1.2285
|
||||
- dirname: 2024-08-15-13-23-33--json-no-lint-claude-3.5-sonnet-whole
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 10.5
|
||||
total_cost: 1.6714
|
||||
- dirname: 2024-08-15-13-26-38--json-no-lint-deepseek-coder-whole
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 59.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 2
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 27.9
|
||||
total_cost: 0.0438
|
||||
- dirname: 2024-08-15-13-50-03--json-no-lint-gpt-4o-2024-08-06-whole-2
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 61.7
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 4.2
|
||||
total_cost: 0.7946
|
||||
- dirname: 2024-08-15-13-51-36--json-no-lint-gpt-4o-2024-08-06-func-2
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 56.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 6.4
|
||||
total_cost: 0.8390
|
||||
- dirname: 2024-08-15-13-54-53--json-no-lint-gpt-4o-2024-05-13-func-2
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 7.7
|
||||
total_cost: 1.2210
|
||||
- dirname: 2024-08-15-13-56-21--json-no-lint-claude-3.5-sonnet-whole-2
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 16.5
|
||||
total_cost: 1.6556
|
||||
- dirname: 2024-08-15-14-06-12--json-no-lint-deepseek-coder-whole-2
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 1
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 25.8
|
||||
total_cost: 0.0439
|
||||
- dirname: 2024-08-15-14-11-45--json-no-lint-gpt-4o-2024-08-06-whole-3
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 4.3
|
||||
total_cost: 0.7945
|
||||
- dirname: 2024-08-15-14-13-11--json-no-lint-gpt-4o-2024-08-06-func-3
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 56.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 5.6
|
||||
total_cost: 0.8220
|
||||
- dirname: 2024-08-15-14-16-34--json-no-lint-gpt-4o-2024-05-13-func-3
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 58.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 8.7
|
||||
total_cost: 1.2064
|
||||
- dirname: 2024-08-15-14-17-51--json-no-lint-claude-3.5-sonnet-whole-3
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 11.0
|
||||
total_cost: 1.6555
|
||||
- dirname: 2024-08-15-14-21-06--json-no-lint-deepseek-coder-whole-3
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 61.7
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 3
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 2
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 24.4
|
||||
total_cost: 0.0439
|
||||
- dirname: 2024-08-15-14-27-17--json-no-lint-gpt-4o-2024-08-06-whole-4
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 4.3
|
||||
total_cost: 0.8015
|
||||
- dirname: 2024-08-15-14-28-58--json-no-lint-gpt-4o-2024-08-06-func-4
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 6.0
|
||||
total_cost: 0.8394
|
||||
- dirname: 2024-08-15-14-32-58--json-no-lint-gpt-4o-2024-05-13-func-4
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 59.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 2
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 11.1
|
||||
total_cost: 1.2120
|
||||
- dirname: 2024-08-15-14-34-39--json-no-lint-claude-3.5-sonnet-whole-4
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 11.3
|
||||
total_cost: 1.6635
|
||||
- dirname: 2024-08-15-14-38-35--json-no-lint-deepseek-coder-whole-4
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 59.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 2
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 24.5
|
||||
total_cost: 0.0438
|
||||
- dirname: 2024-08-15-14-44-11--json-no-lint-gpt-4o-2024-08-06-whole-5
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 4.6
|
||||
total_cost: 0.8023
|
||||
- dirname: 2024-08-15-14-45-40--json-no-lint-gpt-4o-2024-08-06-func-5
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 57.1
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 3
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 6.3
|
||||
total_cost: 0.8354
|
||||
- dirname: 2024-08-15-14-49-44--json-no-lint-gpt-4o-2024-05-13-func-5
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: JSON
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 59.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 4
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 10.5
|
||||
total_cost: 1.2099
|
||||
- dirname: 2024-08-15-14-51-18--json-no-lint-claude-3.5-sonnet-whole-5
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 11.4
|
||||
total_cost: 1.6685
|
||||
- dirname: 2024-08-15-14-54-41--json-no-lint-deepseek-coder-whole-5
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: Markdown
|
||||
commit_hash: bac04a2
|
||||
pass_rate_1: 61.7
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 2
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 2
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 24.5
|
||||
total_cost: 0.0439
|
||||
- dirname: 2024-08-15-15-12-55--json-no-lint-strict-gpt-4o-2024-08-06-func-2
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON (strict)
|
||||
commit_hash: bf2d5fe
|
||||
pass_rate_1: 57.1
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 5.9
|
||||
total_cost: 0.8216
|
||||
- dirname: 2024-08-15-15-14-31--json-no-lint-strict-gpt-4o-2024-08-06-func-3
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON (strict)
|
||||
commit_hash: bf2d5fe
|
||||
pass_rate_1: 54.1
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 2
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 6.3
|
||||
total_cost: 0.8410
|
||||
- dirname: 2024-08-15-15-16-14--json-no-lint-strict-gpt-4o-2024-08-06-func-4
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON (strict)
|
||||
commit_hash: bf2d5fe
|
||||
pass_rate_1: 59.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 5.9
|
||||
total_cost: 0.8203
|
||||
- dirname: 2024-08-15-15-17-50--json-no-lint-strict-gpt-4o-2024-08-06-func-5
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-08-06
|
||||
edit_format: JSON (strict)
|
||||
commit_hash: bf2d5fe
|
||||
pass_rate_1: 57.1
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-08-06
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 6.1
|
||||
total_cost: 0.8415
|
||||
- dirname: 2024-08-15-17-36-22--json-no-lint-again-gpt-4o-2024-05-13-whole-1
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: Markdown
|
||||
commit_hash: ed94379
|
||||
pass_rate_1: 60.2
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 7
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 6.8
|
||||
total_cost: 1.5110
|
||||
- dirname: 2024-08-15-17-38-13--json-no-lint-again-gpt-4o-2024-05-13-whole-2
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: Markdown
|
||||
commit_hash: ed94379
|
||||
pass_rate_1: 60.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 7.0
|
||||
total_cost: 1.4954
|
||||
- dirname: 2024-08-15-17-40-10--json-no-lint-again-gpt-4o-2024-05-13-whole-3
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: Markdown
|
||||
commit_hash: ed94379
|
||||
pass_rate_1: 60.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 6.8
|
||||
total_cost: 1.4999
|
||||
- dirname: 2024-08-15-17-41-30--json-no-lint-again-gpt-4o-2024-05-13-whole-4
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: Markdown
|
||||
commit_hash: ed94379
|
||||
pass_rate_1: 58.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 7.4
|
||||
total_cost: 1.4848
|
||||
- dirname: 2024-08-15-17-43-12--json-no-lint-again-gpt-4o-2024-05-13-whole-5
|
||||
test_cases: 133
|
||||
model: gpt-4o-2024-05-13
|
||||
edit_format: Markdown
|
||||
commit_hash: ed94379
|
||||
pass_rate_1: 59.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model gpt-4o-2024-05-13
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 7.6
|
||||
total_cost: 1.4948
|
||||
|
||||
- dirname: 2024-08-15-19-35-32--json-no-lint-again-deepseek-coder-func-1
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: JSON
|
||||
commit_hash: 3a2ac02-dirty
|
||||
pass_rate_1: 50.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 2
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 17.8
|
||||
total_cost: 0.0330
|
||||
- dirname: 2024-08-15-19-37-50--json-no-lint-again-deepseek-coder-func-2
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28
|
||||
pass_rate_1: 49.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 5
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 18.3
|
||||
total_cost: 0.0336
|
||||
- dirname: 2024-08-15-19-40-20--json-no-lint-again-deepseek-coder-func-3
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28
|
||||
pass_rate_1: 48.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 5
|
||||
indentation_errors: 1
|
||||
exhausted_context_windows: 1
|
||||
test_timeouts: 2
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 18.4
|
||||
total_cost: 0.0337
|
||||
- dirname: 2024-08-15-19-44-07--json-no-lint-again-deepseek-coder-func-4
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28
|
||||
pass_rate_1: 53.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 2
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 17.6
|
||||
total_cost: 0.0330
|
||||
- dirname: 2024-08-15-19-46-48--json-no-lint-again-deepseek-coder-func-5
|
||||
test_cases: 133
|
||||
model: deepseek-coder V2 0724
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28-dirty
|
||||
pass_rate_1: 53.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 11
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 2
|
||||
command: aider --model deepseek-coder
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 18.0
|
||||
total_cost: 0.0332
|
||||
|
||||
- dirname: 2024-08-15-20-07-59--json-no-lint-again-claude-3.5-sonnet-func-1
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28
|
||||
pass_rate_1: 54.1
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 9.5
|
||||
total_cost: 1.5789
|
||||
- dirname: 2024-08-15-20-09-39--json-no-lint-again-claude-3.5-sonnet-func-2
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28
|
||||
pass_rate_1: 55.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 9.2
|
||||
total_cost: 1.5916
|
||||
- dirname: 2024-08-15-20-11-39--json-no-lint-again-claude-3.5-sonnet-func-3
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28
|
||||
pass_rate_1: 53.4
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 10.3
|
||||
total_cost: 1.5896
|
||||
- dirname: 2024-08-15-20-13-44--json-no-lint-again-claude-3.5-sonnet-func-4
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28
|
||||
pass_rate_1: 55.6
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 9.2
|
||||
total_cost: 1.6000
|
||||
- dirname: 2024-08-15-20-15-51--json-no-lint-again-claude-3.5-sonnet-func-5
|
||||
test_cases: 133
|
||||
model: claude-3.5-sonnet
|
||||
edit_format: JSON
|
||||
commit_hash: 1a98c28
|
||||
pass_rate_1: 51.9
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 1
|
||||
command: aider --model claude-3.5-sonnet
|
||||
date: 2024-08-15
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 8.9
|
||||
total_cost: 1.5936
|
||||
|
|
@ -577,6 +577,7 @@
|
|||
pass_rate_2: 77.4
|
||||
percent_cases_well_formed: 99.2
|
||||
error_outputs: 23
|
||||
released: 2024-06-20
|
||||
num_malformed_responses: 4
|
||||
num_with_malformed_responses: 1
|
||||
user_asks: 2
|
||||
|
@ -603,6 +604,7 @@
|
|||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
released: 2024-03-13
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
|
@ -644,6 +646,7 @@
|
|||
commit_hash: d31eef3-dirty
|
||||
pass_rate_1: 40.6
|
||||
pass_rate_2: 55.6
|
||||
released: 2024-07-18
|
||||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 1
|
||||
num_malformed_responses: 0
|
||||
|
@ -668,6 +671,7 @@
|
|||
pass_rate_1: 60.9
|
||||
pass_rate_2: 69.9
|
||||
percent_cases_well_formed: 97.7
|
||||
released: 2024-06-28
|
||||
error_outputs: 58
|
||||
num_malformed_responses: 13
|
||||
num_with_malformed_responses: 3
|
||||
|
@ -690,6 +694,7 @@
|
|||
commit_hash: f7ce78b-dirty
|
||||
pass_rate_1: 46.6
|
||||
pass_rate_2: 63.9
|
||||
released: 2024-07-23
|
||||
percent_cases_well_formed: 92.5
|
||||
error_outputs: 84
|
||||
num_malformed_responses: 19
|
||||
|
@ -716,6 +721,7 @@
|
|||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 0
|
||||
num_malformed_responses: 0
|
||||
released: 2024-07-23
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
lazy_comments: 0
|
||||
|
@ -738,6 +744,7 @@
|
|||
pass_rate_2: 72.9
|
||||
percent_cases_well_formed: 97.7
|
||||
error_outputs: 13
|
||||
released: 2024-07-24
|
||||
num_malformed_responses: 3
|
||||
num_with_malformed_responses: 3
|
||||
user_asks: 1
|
||||
|
@ -763,6 +770,7 @@
|
|||
error_outputs: 3
|
||||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
released: 2024-07-24
|
||||
user_asks: 3
|
||||
lazy_comments: 0
|
||||
syntax_errors: 1
|
||||
|
@ -785,6 +793,7 @@
|
|||
percent_cases_well_formed: 100.0
|
||||
error_outputs: 27
|
||||
num_malformed_responses: 0
|
||||
released: 2024-07-23
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 23
|
||||
lazy_comments: 8
|
||||
|
@ -810,6 +819,7 @@
|
|||
num_malformed_responses: 0
|
||||
num_with_malformed_responses: 0
|
||||
user_asks: 0
|
||||
released: 2024-07-23
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
|
@ -838,9 +848,34 @@
|
|||
indentation_errors: 2
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 5
|
||||
released: 2024-08-06
|
||||
command: aider --model openai/gpt-4o-2024-08-06
|
||||
date: 2024-08-06
|
||||
versions: 0.48.1-dev
|
||||
seconds_per_case: 6.5
|
||||
total_cost: 0.0000
|
||||
|
||||
- dirname: 2024-08-14-13-07-12--chatgpt-4o-latest-diff
|
||||
test_cases: 133
|
||||
model: chatgpt-4o-latest
|
||||
edit_format: diff
|
||||
commit_hash: b1c3769
|
||||
pass_rate_1: 53.4
|
||||
pass_rate_2: 69.2
|
||||
percent_cases_well_formed: 97.7
|
||||
error_outputs: 27
|
||||
num_malformed_responses: 5
|
||||
num_with_malformed_responses: 3
|
||||
user_asks: 7
|
||||
lazy_comments: 0
|
||||
syntax_errors: 0
|
||||
indentation_errors: 0
|
||||
exhausted_context_windows: 0
|
||||
test_timeouts: 0
|
||||
command: aider --model openai/chatgpt-4o-latest
|
||||
date: 2024-08-14
|
||||
released: 2024-08-08
|
||||
versions: 0.50.2-dev
|
||||
seconds_per_case: 26.3
|
||||
total_cost: 3.6113
|
||||
|
|
@ -1,90 +1,126 @@
|
|||
<canvas id="blameChart" width="800" height="450" style="margin-top: 20px"></canvas>
|
||||
<canvas id="blameChart" width="800" height="360" style="margin-top: 20px"></canvas>
|
||||
<canvas id="linesChart" width="800" height="360" style="margin-top: 20px"></canvas>
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/moment"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/chartjs-adapter-moment"></script>
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
var ctx = document.getElementById('blameChart').getContext('2d');
|
||||
var blameCtx = document.getElementById('blameChart').getContext('2d');
|
||||
var linesCtx = document.getElementById('linesChart').getContext('2d');
|
||||
|
||||
var labels = [{% for row in site.data.blame %}'{{ row.end_tag }}',{% endfor %}];
|
||||
|
||||
var blameData = {
|
||||
labels: labels,
|
||||
datasets: [{
|
||||
label: 'Aider\'s Contribution to Each Release',
|
||||
data: [
|
||||
{% for row in site.data.blame %}
|
||||
{
|
||||
x: '{{ row.end_date }}',
|
||||
y: {{ row.aider_percentage }},
|
||||
r: Math.sqrt({{ row.aider_total }}) * 1.5,
|
||||
label: '{{ row.end_tag }}',
|
||||
percentage: {{ row.aider_percentage }},
|
||||
lines: {{ row.aider_total }}
|
||||
},
|
||||
{% endfor %}
|
||||
],
|
||||
backgroundColor: 'rgba(54, 162, 235, 0.2)',
|
||||
label: 'Aider\'s percent of new code by release',
|
||||
data: [{% for row in site.data.blame %}{ x: '{{ row.end_tag }}', y: {{ row.aider_percentage }}, lines: {{ row.aider_total }} },{% endfor %}],
|
||||
backgroundColor: 'rgba(54, 162, 235, 0.8)',
|
||||
borderColor: 'rgba(54, 162, 235, 1)',
|
||||
borderWidth: 1
|
||||
}]
|
||||
};
|
||||
|
||||
var blameChart = new Chart(ctx, {
|
||||
type: 'bubble',
|
||||
var linesData = {
|
||||
labels: labels,
|
||||
datasets: [{
|
||||
label: 'Aider\'s lines of new code',
|
||||
data: [{% for row in site.data.blame %}{ x: '{{ row.end_tag }}', y: {{ row.aider_total }} },{% endfor %}],
|
||||
backgroundColor: 'rgba(255, 99, 132, 0.8)',
|
||||
borderColor: 'rgba(255, 99, 132, 1)',
|
||||
borderWidth: 1
|
||||
}]
|
||||
};
|
||||
|
||||
var blameChart = new Chart(blameCtx, {
|
||||
type: 'bar',
|
||||
data: blameData,
|
||||
options: {
|
||||
scales: {
|
||||
x: {
|
||||
type: 'time',
|
||||
time: {
|
||||
unit: 'month',
|
||||
displayFormats: {
|
||||
month: 'MMM YYYY'
|
||||
}
|
||||
},
|
||||
type: 'category',
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Release date'
|
||||
text: 'Version'
|
||||
},
|
||||
ticks: {
|
||||
maxRotation: 45,
|
||||
minRotation: 45
|
||||
},
|
||||
min: moment('{{ site.data.blame | first | map: "end_date" | first }}').subtract(1, 'month'),
|
||||
max: moment('{{ site.data.blame | last | map: "end_date" | first }}').add(1, 'month')
|
||||
}
|
||||
},
|
||||
y: {
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Aider Contribution (% of code)'
|
||||
text: 'Percent of new code'
|
||||
},
|
||||
beginAtZero: true
|
||||
}
|
||||
},
|
||||
plugins: {
|
||||
legend: {
|
||||
display: false
|
||||
},
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: function(context) {
|
||||
return `${context.raw.label}: ${Math.round(context.raw.percentage)}% (${context.raw.lines} lines)`;
|
||||
}
|
||||
}
|
||||
},
|
||||
legend: {
|
||||
display: true,
|
||||
position: 'top',
|
||||
labels: {
|
||||
generateLabels: function(chart) {
|
||||
return [{
|
||||
text: 'Y-axis is percent of code, bubble size is lines of code',
|
||||
fillStyle: 'rgba(54, 162, 235, 0.2)',
|
||||
strokeStyle: 'rgba(54, 162, 235, 1)',
|
||||
lineWidth: 1,
|
||||
hidden: false,
|
||||
index: 0
|
||||
}];
|
||||
var label = 'Aider\'s contribution';
|
||||
var value = context.parsed.y || 0;
|
||||
var lines = context.raw.lines || 0;
|
||||
return `${label}: ${Math.round(value)}% (${lines} lines)`;
|
||||
}
|
||||
}
|
||||
},
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Aider\'s Contribution to Each Release',
|
||||
text: 'Percent of new code written by aider, by release',
|
||||
font: {
|
||||
size: 16
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
var linesChart = new Chart(linesCtx, {
|
||||
type: 'bar',
|
||||
data: linesData,
|
||||
options: {
|
||||
scales: {
|
||||
x: {
|
||||
type: 'category',
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Version'
|
||||
},
|
||||
ticks: {
|
||||
maxRotation: 45,
|
||||
minRotation: 45
|
||||
}
|
||||
},
|
||||
y: {
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Lines of new code'
|
||||
},
|
||||
beginAtZero: true
|
||||
}
|
||||
},
|
||||
plugins: {
|
||||
legend: {
|
||||
display: false
|
||||
},
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: function(context) {
|
||||
var label = 'New lines of code by aider';
|
||||
var value = context.parsed.y || 0;
|
||||
return `${label}: ${value}`;
|
||||
}
|
||||
}
|
||||
},
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Lines of new code written by aider, by release',
|
||||
font: {
|
||||
size: 16
|
||||
}
|
||||
|
|
165
aider/website/_includes/code-in-json-benchmark.js
Normal file
165
aider/website/_includes/code-in-json-benchmark.js
Normal file
|
@ -0,0 +1,165 @@
|
|||
<style>
|
||||
.chart-container {
|
||||
position: relative;
|
||||
width: 100%;
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
</style>
|
||||
|
||||
<div class="chart-container">
|
||||
<canvas id="passRateChart"></canvas>
|
||||
</div>
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
var ctx = document.getElementById('passRateChart').getContext('2d');
|
||||
var chartContainer = document.querySelector('.chart-container');
|
||||
|
||||
var yamlData = {{ site.data.code-in-json | jsonify }};
|
||||
|
||||
var models = [...new Set(yamlData.map(item => item.model))].sort();
|
||||
var editFormats = [...new Set(yamlData.map(item => item.edit_format))];
|
||||
|
||||
var datasets = editFormats.map(format => ({
|
||||
label: format,
|
||||
data: models.map(model => {
|
||||
var items = yamlData.filter(d => d.model === model && d.edit_format === format);
|
||||
if (items.length === 0) return null;
|
||||
var average = items.reduce((sum, item) => sum + item.pass_rate_1, 0) / items.length;
|
||||
return parseFloat(average.toFixed(1));
|
||||
}),
|
||||
backgroundColor: function(context) {
|
||||
const format = context.dataset.label;
|
||||
if (format === 'Markdown') {
|
||||
return 'rgba(54, 162, 235, 0.8)';
|
||||
} else if (format.startsWith('JSON')) {
|
||||
const ctx = context.chart.ctx;
|
||||
const gradient = ctx.createPattern(createStripedCanvas(format === 'JSON (strict)'), 'repeat');
|
||||
return gradient;
|
||||
} else {
|
||||
return 'rgba(75, 192, 192, 0.8)';
|
||||
}
|
||||
},
|
||||
}));
|
||||
|
||||
var data = {
|
||||
labels: models,
|
||||
datasets: datasets
|
||||
};
|
||||
|
||||
function getAspectRatio() {
|
||||
var width = chartContainer.offsetWidth;
|
||||
// Gradually change aspect ratio from 2 (landscape) to 1 (square)
|
||||
return Math.max(1, Math.min(2, width / 300));
|
||||
}
|
||||
|
||||
var config = {
|
||||
type: 'bar',
|
||||
data: data,
|
||||
options: {
|
||||
responsive: true,
|
||||
maintainAspectRatio: true,
|
||||
aspectRatio: getAspectRatio(),
|
||||
scales: {
|
||||
x: {
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Model'
|
||||
}
|
||||
},
|
||||
y: {
|
||||
beginAtZero: true,
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Pass Rate (%, average of 5 runs)'
|
||||
},
|
||||
max: 70
|
||||
}
|
||||
},
|
||||
plugins: {
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Coding skill by model and code wrapping strategy',
|
||||
font: {
|
||||
size: 16
|
||||
}
|
||||
},
|
||||
legend: {
|
||||
position: 'top',
|
||||
},
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: function(context) {
|
||||
let label = context.dataset.label || '';
|
||||
if (label) {
|
||||
label += ': ';
|
||||
}
|
||||
if (context.parsed.y !== null) {
|
||||
label += context.parsed.y.toFixed(1) + '%';
|
||||
}
|
||||
return label;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
plugins: [{
|
||||
afterDraw: function(chart) {
|
||||
var ctx = chart.ctx;
|
||||
var isWideScreen = window.innerWidth > 768; // Assuming 768px as the breakpoint for wide screens
|
||||
if (isWideScreen) {
|
||||
chart.data.datasets.forEach(function(dataset, i) {
|
||||
var meta = chart.getDatasetMeta(i);
|
||||
meta.data.forEach(function(bar, index) {
|
||||
var data = dataset.data[index];
|
||||
if (data !== null) {
|
||||
ctx.fillStyle = '#000000';
|
||||
ctx.textAlign = 'center';
|
||||
ctx.textBaseline = 'bottom';
|
||||
var displayText = data.toFixed(1) + '%';
|
||||
ctx.fillText(displayText, bar.x, bar.y - 5);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
}]
|
||||
};
|
||||
|
||||
var chart = new Chart(ctx, config);
|
||||
|
||||
function resizeChart() {
|
||||
chart.options.aspectRatio = getAspectRatio();
|
||||
chart.resize();
|
||||
}
|
||||
|
||||
window.addEventListener('resize', resizeChart);
|
||||
|
||||
// Initial resize to set correct size
|
||||
resizeChart();
|
||||
});
|
||||
|
||||
function createStripedCanvas(isStrict) {
|
||||
const patternCanvas = document.createElement('canvas');
|
||||
const patternContext = patternCanvas.getContext('2d');
|
||||
const size = 10;
|
||||
patternCanvas.width = size;
|
||||
patternCanvas.height = size;
|
||||
|
||||
patternContext.fillStyle = 'rgba(255, 99, 132, 0.8)';
|
||||
patternContext.fillRect(0, 0, size, size);
|
||||
|
||||
if (isStrict) {
|
||||
patternContext.strokeStyle = 'rgba(255, 255, 255, 0.8)';
|
||||
patternContext.lineWidth = 0.75;
|
||||
patternContext.beginPath();
|
||||
patternContext.moveTo(0, 0);
|
||||
patternContext.lineTo(size, size);
|
||||
patternContext.stroke();
|
||||
}
|
||||
|
||||
return patternCanvas;
|
||||
}
|
||||
</script>
|
139
aider/website/_includes/code-in-json-syntax.js
Normal file
139
aider/website/_includes/code-in-json-syntax.js
Normal file
|
@ -0,0 +1,139 @@
|
|||
<style>
|
||||
.chart-container {
|
||||
position: relative;
|
||||
width: 100%;
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
</style>
|
||||
|
||||
<div class="chart-container">
|
||||
<canvas id="syntaxErrorsChart"></canvas>
|
||||
</div>
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
var ctx = document.getElementById('syntaxErrorsChart').getContext('2d');
|
||||
var chartContainer = document.querySelector('.chart-container');
|
||||
|
||||
var yamlData = {{ site.data.code-in-json | jsonify }};
|
||||
|
||||
var models = [...new Set(yamlData.map(item => item.model))].sort();
|
||||
var editFormats = [...new Set(yamlData.map(item => item.edit_format))];
|
||||
|
||||
var datasets = editFormats.map(format => ({
|
||||
label: format,
|
||||
data: models.map(model => {
|
||||
var items = yamlData.filter(d => d.model === model && d.edit_format === format);
|
||||
if (items.length === 0) return null;
|
||||
var totalErrors = items.reduce((sum, item) => sum + item.syntax_errors + item.indentation_errors, 0);
|
||||
return totalErrors;
|
||||
}),
|
||||
backgroundColor: function(context) {
|
||||
const format = context.dataset.label;
|
||||
if (format === 'Markdown') {
|
||||
return 'rgba(54, 162, 235, 0.8)';
|
||||
} else if (format.startsWith('JSON')) {
|
||||
const ctx = context.chart.ctx;
|
||||
const gradient = ctx.createPattern(createStripedCanvas(format === 'JSON (strict)'), 'repeat');
|
||||
return gradient;
|
||||
} else {
|
||||
return 'rgba(75, 192, 192, 0.8)';
|
||||
}
|
||||
},
|
||||
}));
|
||||
|
||||
var data = {
|
||||
labels: models,
|
||||
datasets: datasets
|
||||
};
|
||||
|
||||
function getAspectRatio() {
|
||||
var width = chartContainer.offsetWidth;
|
||||
// Gradually change aspect ratio from 2 (landscape) to 1 (square)
|
||||
return Math.max(1, Math.min(2, width / 300));
|
||||
}
|
||||
|
||||
var config = {
|
||||
type: 'bar',
|
||||
data: data,
|
||||
options: {
|
||||
responsive: true,
|
||||
maintainAspectRatio: true,
|
||||
aspectRatio: getAspectRatio(),
|
||||
scales: {
|
||||
x: {
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Model'
|
||||
}
|
||||
},
|
||||
y: {
|
||||
beginAtZero: true,
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Total syntax errors from 5 runs'
|
||||
},
|
||||
max: 35
|
||||
}
|
||||
},
|
||||
plugins: {
|
||||
title: {
|
||||
display: true,
|
||||
text: 'Syntax errors by model and code wrapping strategy',
|
||||
font: {
|
||||
size: 16
|
||||
}
|
||||
},
|
||||
legend: {
|
||||
position: 'top',
|
||||
},
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: function(context) {
|
||||
let label = context.dataset.label || '';
|
||||
if (label) {
|
||||
label += ': ';
|
||||
}
|
||||
if (context.parsed.y !== null) {
|
||||
label += context.parsed.y;
|
||||
}
|
||||
return label;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
plugins: [{
|
||||
afterDraw: function(chart) {
|
||||
var ctx = chart.ctx;
|
||||
chart.data.datasets.forEach(function(dataset, i) {
|
||||
var meta = chart.getDatasetMeta(i);
|
||||
meta.data.forEach(function(bar, index) {
|
||||
var data = dataset.data[index];
|
||||
if (data !== null) {
|
||||
ctx.fillStyle = '#000000';
|
||||
ctx.textAlign = 'center';
|
||||
ctx.textBaseline = 'bottom';
|
||||
ctx.fillText(data, bar.x, bar.y - 5);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
}]
|
||||
};
|
||||
|
||||
var chart = new Chart(ctx, config);
|
||||
|
||||
function resizeChart() {
|
||||
chart.options.aspectRatio = getAspectRatio();
|
||||
chart.resize();
|
||||
}
|
||||
|
||||
window.addEventListener('resize', resizeChart);
|
||||
|
||||
// Initial resize to set correct size
|
||||
resizeChart();
|
||||
});
|
||||
</script>
|
248
aider/website/_posts/2024-08-14-code-in-json.md
Normal file
248
aider/website/_posts/2024-08-14-code-in-json.md
Normal file
|
@ -0,0 +1,248 @@
|
|||
---
|
||||
title: LLMs are bad at returning code in JSON
|
||||
excerpt: LLMs write worse code if you ask them to return the code wrapped in JSON via a tool function call.
|
||||
highlight_image: /assets/code-in-json.jpg
|
||||
nav_exclude: true
|
||||
---
|
||||
{% if page.date %}
|
||||
<p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
|
||||
{% endif %}
|
||||
|
||||
# LLMs are bad at returning code in JSON
|
||||
|
||||
|
||||
LLMs produce lower quality code if they’re asked to return it as part of a structured JSON response. This seems to be true for many top models, including those with specialized support for JSON. Benchmarks show that models struggle with syntactic issues related to quoting and escaping.
|
||||
The benchmark results also imply a decreased capacity for solving coding problems due to the burden of JSON formatting.
|
||||
|
||||
{% include code-in-json-benchmark.js %}
|
||||
|
||||
> Figure 1: Aider coding benchmark scores of models using either plain markdown text or JSON to return code.
|
||||
> Pass rate (%) averaged over 5 runs.
|
||||
> Models produce better code when they return it as markdown text,
|
||||
> as compared to returning code in a structured JSON response.
|
||||
|
||||
|
||||
## Background
|
||||
|
||||
People often ask why aider uses a plain text format for LLMs to specify code edits (below),
|
||||
rather than relying on LLM tools and structured JSON responses.
|
||||
|
||||
```python
|
||||
greeting.py
|
||||
<<<<<<< SEARCH
|
||||
def greeting():
|
||||
print("Hello")
|
||||
=======
|
||||
def greeting():
|
||||
print("Goodbye")
|
||||
>>>>>>> REPLACE
|
||||
```
|
||||
|
||||
People expect that it would be easier and more reliable to use tool calls,
|
||||
which would involve a structured JSON response more like this:
|
||||
|
||||
```json
|
||||
{
|
||||
"filename": "greeting.py",
|
||||
"search": "def greeting():\n print(\"Hello\")\n",
|
||||
"replace": "def greeting():\n print(\"Goodbye\")\n"
|
||||
}
|
||||
```
|
||||
|
||||
This question becomes increasingly relevant as LLM providers
|
||||
continue to improve their tooling for reliably generating JSON.
|
||||
For example,
|
||||
[OpenAI recently announced](https://openai.com/index/introducing-structured-outputs-in-the-api/)
|
||||
the ability to
|
||||
strictly enforce that JSON responses will be syntactically correct
|
||||
and conform to a specified schema.
|
||||
|
||||
But just producing valid JSON is not sufficient for AI code generation --
|
||||
the code inside the JSON matters too.
|
||||
It has to be high quality code that solves the assigned coding task without errors or bugs.
|
||||
Unfortunately,
|
||||
LLMs write worse code when they're asked to
|
||||
wrap it in JSON.
|
||||
|
||||
In some sense this shouldn't be surprising.
|
||||
Just look at the very simple
|
||||
JSON example above, with the escaped
|
||||
quotes `\"` and
|
||||
newlines `\n`
|
||||
mixed into the code.
|
||||
Imagine the additional
|
||||
complexity
|
||||
if the code itself contained quoted strings
|
||||
with their
|
||||
own escape sequences.
|
||||
|
||||
Would *you* write better code by
|
||||
typing it out normally
|
||||
or typing it as a properly escaped
|
||||
JSON string?
|
||||
|
||||
|
||||
## Quantifying the benefits of plain text
|
||||
|
||||
Previous [aider benchmark results](/2023/07/02/benchmarks.html)
|
||||
showed
|
||||
the superiority of returning code
|
||||
as plain text compared to JSON-wrapped function calls.
|
||||
Those results were obtained
|
||||
over a year ago, against models far less capable than those available today.
|
||||
OpenAI's newly announced support for "strict" JSON
|
||||
suggests the possibility that modern models might be able
|
||||
to return quality code inside a structured JSON response.
|
||||
|
||||
The results presented here are based on
|
||||
the
|
||||
[aider "code editing" benchmark](/2023/07/02/benchmarks.html#the-benchmark)
|
||||
of 133 practice exercises from the Exercism python repository.
|
||||
The benchmark was simplified somewhat to focus on the differences between
|
||||
plain text and JSON responses.
|
||||
In particular, models were
|
||||
restricted to a single attempt to solve each task
|
||||
without a second try to fix errors.
|
||||
|
||||
The performance of each model was compared across different strategies for returning code:
|
||||
|
||||
- **Markdown** -- the model returned the whole source code file in standard markdown triple-backtick fences.
|
||||
- **JSON** -- the model used a tool function call to return the whole source code file in a structured JSON response.
|
||||
- **JSON (strict)** -- the same as the "JSON" strategy, but with `strict=True`. Only gpt-4o-2024-08-06 supported this setting.
|
||||
|
||||
The markdown strategy was the same as
|
||||
aider's "whole" edit format, where the
|
||||
LLM returns an entire updated copy of the source file like this:
|
||||
|
||||
````
|
||||
Here is the program you asked for which prints "Hello":
|
||||
|
||||
greeting.py
|
||||
```
|
||||
def greeting():
|
||||
print("Hello")
|
||||
```
|
||||
````
|
||||
|
||||
Both JSON strategies required the LLM to call the `write_file` function with
|
||||
an explanation/plan and
|
||||
the entire updated copy of the source file.
|
||||
The LLM didn't have to specify the filename,
|
||||
since the benchmark operates on one source file at a time.
|
||||
|
||||
```json
|
||||
{
|
||||
"explanation": "Here is the program you asked for which prints \"Hello\"",
|
||||
"content": "def greeting():\n print(\"Hello\")\n"
|
||||
}
|
||||
```
|
||||
|
||||
This experimental setup was designed to quantify
|
||||
the effects of JSON-wrapping on the LLM's ability to write code to solve a task.
|
||||
|
||||
## Results
|
||||
|
||||
Four of the strongest code editing models were benchmarked
|
||||
to assess the impact of JSON-wrapping code:
|
||||
|
||||
- claude-3-5-sonnet-20240620
|
||||
- deepseek-coder (V2 0724)
|
||||
- gpt-4o-2024-05-13
|
||||
- gpt-4o-2024-08-06
|
||||
|
||||
Each combination of model and code wrapping strategy was benchmarked 5 times.
|
||||
|
||||
### Overall coding skill
|
||||
|
||||
As shown in Figure 1,
|
||||
all of the models did worse on the benchmark when asked to
|
||||
return code in a structured JSON response.
|
||||
Most did significantly worse, performing well below
|
||||
their result with the markdown strategy.
|
||||
|
||||
Some noteworthy observations:
|
||||
|
||||
- OpenAI's gpt-4o-2024-05-13 was the only model where the markdown and JSON results were
|
||||
close. Using JSON dropped the score by only 0.4 percentage points, a difference which is
|
||||
within the margin of error for 5 trials.
|
||||
- The use of OpenAI's new strict mode offered no improvement
|
||||
as compared to non-strict JSON.
|
||||
Both JSON results were well below the markdown result.
|
||||
- The results from Sonnet and DeepSeek Coder suffered the worst harm from JSON wrapping.
|
||||
|
||||
### Syntax errors
|
||||
|
||||
Models tend to make more syntax errors when asked to wrap code in JSON.
|
||||
Figure 2 shows the number of syntax errors found in the code produced by each
|
||||
model and code wrapping strategy.
|
||||
It totals up the `SyntaxError` and `IndentationError` errors from all 5 runs,
|
||||
for each model and strategy combination.
|
||||
|
||||
Below is an example of a `SyntaxError` created by gpt-4o-2024-05-13 using the
|
||||
JSON code wrapping strategy.
|
||||
It appears that the model got confused about escaping and quoting while trying
|
||||
to format the JSON response.
|
||||
|
||||
```python
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
File "bottle-song/bottle_song.py", line 9
|
||||
lyrics.append(f'There'll be {i - 1} green bottles hanging on the wall.')
|
||||
^
|
||||
SyntaxError: unterminated string literal (detected at line 9)
|
||||
```
|
||||
|
||||
The problematic line of code contains a single-quoted string which also
|
||||
contains a single-quote character.
|
||||
It should have been output as the following chunk of JSON, with
|
||||
a double backslash in `There\\'ll`.
|
||||
That is needed to JSON-escape the `\` so that it survives
|
||||
JSON-decoding to
|
||||
produce `There\'ll` in the resulting code.
|
||||
That would correctly escape the single-quote inside the single-quoted string.
|
||||
|
||||
```
|
||||
...lyrics.append(f'There\\'ll be {i - 1} green bottles hanging on the wall.')\n...
|
||||
```
|
||||
|
||||
|
||||
|
||||
{% include code-in-json-syntax.js %}
|
||||
|
||||
> Figure 2: Number of `SyntaxError` and `IndentationError` errors found in model generated code,
|
||||
> totaled from 5 runs.
|
||||
> Models tend to make more syntax and formatting errors when asked to wrap code in JSON.
|
||||
|
||||
### Beyond syntax errors
|
||||
|
||||
Sonnet's results seem to indicate that the negative effects of JSON-wrapping
|
||||
go beyond just syntactic difficulties.
|
||||
Sonnet avoided syntax errors regardless of the code wrapping strategy,
|
||||
but its benchmark scores in Figure 1 were nonetheless lower with JSON.
|
||||
This implies that JSON-wrapping may distract or challenge models in a way that
|
||||
reduces their ability to reason about solving coding problems.
|
||||
|
||||
|
||||
|
||||
## Conclusions
|
||||
|
||||
While the specific results differ from the similar
|
||||
[July 2023 experiments](/2023/07/02/benchmarks.html),
|
||||
the conclusion remains unchanged: LLMs are bad at returning code in
|
||||
structured JSON responses.
|
||||
|
||||
OpenAI appears to be making progress in allowing LLMs to
|
||||
return JSON-wrapped code
|
||||
without harming the code quality.
|
||||
But it seems premature to consider switching from plain text
|
||||
to JSON-wrapped code at this time.
|
||||
|
||||
---------
|
||||
|
||||
#### Notes on the aider leaderboard
|
||||
|
||||
*The results presented here are not directly comparable to results
|
||||
from the main
|
||||
[aider LLM leaderboard](https://aider.chat/docs/leaderboards/).
|
||||
A number of settings were changed to simplify the benchmark
|
||||
in order to focus on comparing plain text and JSON-wrapped code.*
|
Binary file not shown.
Before Width: | Height: | Size: 64 KiB After Width: | Height: | Size: 158 KiB |
BIN
aider/website/assets/code-in-json.jpg
Normal file
BIN
aider/website/assets/code-in-json.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 158 KiB |
File diff suppressed because it is too large
Load diff
Before Width: | Height: | Size: 53 KiB After Width: | Height: | Size: 74 KiB |
|
@ -28,7 +28,7 @@ Using a `.aider.conf.yml` file:
|
|||
dark-mode: true
|
||||
```
|
||||
|
||||
By setting an environgment variable:
|
||||
By setting an environment variable:
|
||||
|
||||
```
|
||||
export AIDER_DARK_MODE=true
|
||||
|
|
|
@ -27,7 +27,7 @@ The json file should be a dictionary with an entry for each model, as follows:
|
|||
|
||||
```
|
||||
{
|
||||
"deepseek-chat": {
|
||||
"deepseek/deepseek-chat": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 32000,
|
||||
"max_output_tokens": 4096,
|
||||
|
@ -42,6 +42,11 @@ The json file should be a dictionary with an entry for each model, as follows:
|
|||
See
|
||||
[litellm's model_prices_and_context_window.json file](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) for more examples.
|
||||
|
||||
{: .tip }
|
||||
Use a fully qualified model name with a `provider/` at the front
|
||||
in the `.aider.model.metadata.json` file.
|
||||
For example, use `deepseek/deepseek-chat`, not just `deepseek-chat`.
|
||||
|
||||
## Model settings
|
||||
|
||||
Aider has a number of settings that control how it works with
|
||||
|
|
|
@ -321,6 +321,6 @@ mod_dates = [get_last_modified_date(file) for file in files]
|
|||
latest_mod_date = max(mod_dates)
|
||||
cog.out(f"{latest_mod_date.strftime('%B %d, %Y.')}")
|
||||
]]]-->
|
||||
August 10, 2024.
|
||||
August 14, 2024.
|
||||
<!--[[[end]]]-->
|
||||
</p>
|
||||
|
|
|
@ -23,7 +23,7 @@ You can add images to the chat just like you would
|
|||
add any other file:
|
||||
|
||||
- Use `/add <image-filename>` from within the chat
|
||||
- Use `/add-clipboard-image` to paste an image from your clipboard into the chat.
|
||||
- Use `/clipboard` to paste an image from your clipboard into the chat.
|
||||
- Launch aider with image filenames on the command line: `aider <image-filename>` along with any other command line arguments you need.
|
||||
|
||||
## Web pages
|
||||
|
|
|
@ -28,8 +28,6 @@ from aider.coders import Coder
|
|||
from aider.dump import dump # noqa: F401
|
||||
from aider.io import InputOutput
|
||||
|
||||
load_dotenv()
|
||||
|
||||
BENCHMARK_DNAME = Path(os.environ.get("AIDER_BENCHMARK_DIR", "tmp.benchmarks"))
|
||||
|
||||
EXERCISES_DIR_DEFAULT = "exercism-python"
|
||||
|
@ -39,6 +37,8 @@ app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
|
|||
|
||||
NUM_TESTS = (89, 133)
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
def show_stats(dirnames, graphs):
|
||||
raw_rows = []
|
||||
|
@ -378,7 +378,7 @@ def summarize_results(dirname):
|
|||
pass_rate = 100 * passed_tests[i] / res.completed_tests
|
||||
percents[i] = pass_rate
|
||||
# console.print(f"{pass_rate:.1f}% correct after try {i+1}")
|
||||
setattr(res, f"pass_rate_{i+1}", f"{pass_rate:.1f}")
|
||||
setattr(res, f"pass_rate_{i + 1}", f"{pass_rate:.1f}")
|
||||
|
||||
print(f"- dirname: {dirname.name}")
|
||||
style = None if res.completed_tests in NUM_TESTS else "red"
|
||||
|
@ -393,10 +393,10 @@ def summarize_results(dirname):
|
|||
console.print(f" {key}: {val}", style=style)
|
||||
|
||||
for i in range(tries):
|
||||
print(f" pass_rate_{i+1}: {percents[i]:.1f}")
|
||||
print(f" pass_rate_{i + 1}: {percents[i]:.1f}")
|
||||
|
||||
pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests
|
||||
print(f" percent_cases_well_formed: {pct_well_formed*100:.1f}")
|
||||
print(f" percent_cases_well_formed: {pct_well_formed * 100:.1f}")
|
||||
|
||||
show("error_outputs")
|
||||
show("num_malformed_responses")
|
||||
|
@ -564,7 +564,6 @@ def run_test_real(
|
|||
fnames=fnames,
|
||||
use_git=False,
|
||||
stream=False,
|
||||
pretty=False,
|
||||
verbose=verbose,
|
||||
)
|
||||
coder.max_apply_update_errors = max_apply_update_errors
|
||||
|
@ -591,7 +590,7 @@ def run_test_real(
|
|||
|
||||
coder.apply_updates()
|
||||
else:
|
||||
response = coder.run(with_message=instructions)
|
||||
response = coder.run(with_message=instructions, preproc=False)
|
||||
dur += time.time() - start
|
||||
|
||||
if not no_aider:
|
||||
|
|
|
@ -3,6 +3,26 @@ import yaml
|
|||
from imgcat import imgcat
|
||||
from matplotlib import rc
|
||||
|
||||
from aider.dump import dump # noqa: 401
|
||||
|
||||
|
||||
def get_model_color(model):
    """Return the plot color used for a model name.

    Args:
        model: Model name string, e.g. "gpt-4o" or "gpt-3.5-turbo".

    Returns:
        "purple" for names containing "-4o", "red" for other names
        containing "gpt-4", "green" for names containing "gpt-3.5", and
        "lightblue" for everything else, including the exact name
        "gpt-4o-mini".
    """
    default = "lightblue"

    # "gpt-4o-mini" also contains "-4o", so the exact-name check has to run
    # before the substring checks to keep it on the default color.
    if model == "gpt-4o-mini":
        return default

    # Order matters: "-4o" must be tested before "gpt-4", because names such
    # as "gpt-4o" contain both markers.
    marker_colors = (
        ("-4o", "purple"),
        ("gpt-4", "red"),
        ("gpt-3.5", "green"),
    )
    for marker, color in marker_colors:
        if marker in model:
            return color

    return default
|
||||
|
||||
|
||||
def plot_over_time(yaml_file):
|
||||
with open(yaml_file, "r") as file:
|
||||
|
@ -12,49 +32,90 @@ def plot_over_time(yaml_file):
|
|||
pass_rates = []
|
||||
models = []
|
||||
|
||||
print("Debug: Raw data from YAML file:")
|
||||
print(data)
|
||||
|
||||
for entry in data:
|
||||
if "released" in entry and "pass_rate_2" in entry:
|
||||
dates.append(entry["released"])
|
||||
pass_rates.append(entry["pass_rate_2"])
|
||||
models.append(entry["model"].split("(")[0].strip())
|
||||
|
||||
print("Debug: Processed data:")
|
||||
print("Dates:", dates)
|
||||
print("Pass rates:", pass_rates)
|
||||
print("Models:", models)
|
||||
|
||||
if not dates or not pass_rates:
|
||||
print(
|
||||
"Error: No data to plot. Check if the YAML file is empty or if the data is in the"
|
||||
" expected format."
|
||||
)
|
||||
return
|
||||
|
||||
plt.rcParams["hatch.linewidth"] = 0.5
|
||||
plt.rcParams["hatch.color"] = "#444444"
|
||||
|
||||
rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
|
||||
plt.rcParams["text.color"] = "#444444"
|
||||
|
||||
fig, ax = plt.subplots(figsize=(10, 5))
|
||||
fig, ax = plt.subplots(figsize=(12, 6)) # Increase figure size for better visibility
|
||||
|
||||
print("Debug: Figure created. Plotting data...")
|
||||
ax.grid(axis="y", zorder=0, lw=0.2)
|
||||
for spine in ax.spines.values():
|
||||
spine.set_edgecolor("#DDDDDD")
|
||||
spine.set_linewidth(0.5)
|
||||
|
||||
colors = [
|
||||
"red" if "gpt-4" in model else "green" if "gpt-3.5" in model else "blue" for model in models
|
||||
]
|
||||
colors = [get_model_color(model) for model in models]
|
||||
|
||||
# Separate data points by color
|
||||
purple_points = [(d, r) for d, r, c in zip(dates, pass_rates, colors) if c == "purple"]
|
||||
red_points = [(d, r) for d, r, c in zip(dates, pass_rates, colors) if c == "red"]
|
||||
green_points = [(d, r) for d, r, c in zip(dates, pass_rates, colors) if c == "green"]
|
||||
|
||||
# Plot lines for purple, red, and green points
|
||||
if purple_points:
|
||||
purple_dates, purple_rates = zip(*sorted(purple_points))
|
||||
ax.plot(purple_dates, purple_rates, c="purple", alpha=0.5, linewidth=1)
|
||||
if red_points:
|
||||
red_dates, red_rates = zip(*sorted(red_points))
|
||||
ax.plot(red_dates, red_rates, c="red", alpha=0.5, linewidth=1)
|
||||
if green_points:
|
||||
green_dates, green_rates = zip(*sorted(green_points))
|
||||
ax.plot(green_dates, green_rates, c="green", alpha=0.5, linewidth=1)
|
||||
|
||||
# Plot all points
|
||||
ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120)
|
||||
|
||||
for i, model in enumerate(models):
|
||||
ax.annotate(
|
||||
model,
|
||||
(dates[i], pass_rates[i]),
|
||||
fontsize=12,
|
||||
fontsize=8,
|
||||
alpha=0.75,
|
||||
xytext=(5, 5),
|
||||
textcoords="offset points",
|
||||
)
|
||||
|
||||
ax.set_xlabel("Model release date", fontsize=18, color="#555")
|
||||
ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18, color="#555")
|
||||
ax.set_ylabel(
|
||||
"Aider code editing benchmark,\npercent completed correctly", fontsize=18, color="#555"
|
||||
)
|
||||
ax.set_title("LLM code editing skill by model release date", fontsize=20)
|
||||
ax.set_ylim(0, 30)
|
||||
plt.xticks(fontsize=14)
|
||||
ax.set_ylim(0, 100) # Adjust y-axis limit to accommodate higher values
|
||||
plt.xticks(fontsize=14, rotation=45, ha="right") # Rotate x-axis labels for better readability
|
||||
plt.tight_layout(pad=3.0)
|
||||
|
||||
print("Debug: Saving figures...")
|
||||
plt.savefig("tmp_over_time.png")
|
||||
plt.savefig("tmp_over_time.svg")
|
||||
|
||||
print("Debug: Displaying figure with imgcat...")
|
||||
imgcat(fig)
|
||||
|
||||
print("Debug: Figure generation complete.")
|
||||
|
||||
|
||||
# Example usage
|
||||
plot_over_time("_data/edit_leaderboard.yml")
|
||||
plot_over_time("aider/website/_data/edit_leaderboard.yml")
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
@ -226,9 +227,10 @@ class TestMain(TestCase):
|
|||
|
||||
def test_main_exit_calls_version_check(self):
|
||||
with GitTemporaryDirectory():
|
||||
with patch("aider.main.check_version") as mock_check_version, patch(
|
||||
"aider.main.InputOutput"
|
||||
) as mock_input_output:
|
||||
with (
|
||||
patch("aider.main.check_version") as mock_check_version,
|
||||
patch("aider.main.InputOutput") as mock_input_output,
|
||||
):
|
||||
main(["--exit"], input=DummyInput(), output=DummyOutput())
|
||||
mock_check_version.assert_called_once()
|
||||
mock_input_output.assert_called_once()
|
||||
|
@ -373,6 +375,67 @@ class TestMain(TestCase):
|
|||
self.assertRegex(relevant_output, r"AIDER_DARK_MODE:\s+on")
|
||||
self.assertRegex(relevant_output, r"dark_mode:\s+True")
|
||||
|
||||
def test_yaml_config_file_loading(self):
    """Check which `.aider.conf.yml` is used when several candidates exist.

    Config files with distinct `model` / `map-tokens` values are written to
    a fake home directory, the git root, a subdirectory used as the working
    directory, and an explicitly named file. The test expects `--config` to
    select the named file; without it, the working-directory file is used,
    then the git-root file once that is removed, then the home file.
    """
    with GitTemporaryDirectory() as git_dir:
        git_dir = Path(git_dir)

        # Create fake home directory
        fake_home = git_dir / "fake_home"
        fake_home.mkdir()

        # Create subdirectory as current working directory
        cwd = git_dir / "subdir"
        cwd.mkdir()
        os.chdir(cwd)

        # Create .aider.conf.yml files in different locations
        home_config = fake_home / ".aider.conf.yml"
        git_config = git_dir / ".aider.conf.yml"
        cwd_config = cwd / ".aider.conf.yml"
        named_config = git_dir / "named.aider.conf.yml"

        cwd_config.write_text("model: gpt-4-32k\nmap-tokens: 4096\n")
        git_config.write_text("model: gpt-4\nmap-tokens: 2048\n")
        home_config.write_text("model: gpt-3.5-turbo\nmap-tokens: 1024\n")
        named_config.write_text("model: gpt-4-1106-preview\nmap-tokens: 8192\n")

        with (
            # Scope the HOME override to this block so the environment is
            # restored on exit instead of leaking into later tests.
            patch.dict(os.environ, {"HOME": str(fake_home)}),
            patch("pathlib.Path.home", return_value=fake_home),
            patch("aider.coders.Coder.create") as MockCoder,
        ):
            # Test loading from specified config file
            main(
                ["--yes", "--exit", "--config", str(named_config)],
                input=DummyInput(),
                output=DummyOutput(),
            )
            _, kwargs = MockCoder.call_args
            self.assertEqual(kwargs["main_model"].name, "gpt-4-1106-preview")
            self.assertEqual(kwargs["map_tokens"], 8192)

            # Test loading from current working directory
            main(["--yes", "--exit"], input=DummyInput(), output=DummyOutput())
            _, kwargs = MockCoder.call_args
            self.assertIn("main_model", kwargs, "main_model key not found in kwargs")
            self.assertEqual(kwargs["main_model"].name, "gpt-4-32k")
            self.assertEqual(kwargs["map_tokens"], 4096)

            # Test loading from git root
            cwd_config.unlink()
            main(["--yes", "--exit"], input=DummyInput(), output=DummyOutput())
            _, kwargs = MockCoder.call_args
            self.assertEqual(kwargs["main_model"].name, "gpt-4")
            self.assertEqual(kwargs["map_tokens"], 2048)

            # Test loading from home directory
            git_config.unlink()
            main(["--yes", "--exit"], input=DummyInput(), output=DummyOutput())
            _, kwargs = MockCoder.call_args
            self.assertEqual(kwargs["main_model"].name, "gpt-3.5-turbo")
            self.assertEqual(kwargs["map_tokens"], 1024)
|
||||
|
||||
def test_map_tokens_option(self):
|
||||
with GitTemporaryDirectory():
|
||||
with patch("aider.coders.base_coder.RepoMap") as MockRepoMap:
|
||||
|
@ -427,3 +490,27 @@ class TestMain(TestCase):
|
|||
self.assertIn(real_external_file_path, coder.abs_read_only_fnames)
|
||||
finally:
|
||||
os.unlink(external_file_path)
|
||||
|
||||
def test_model_metadata_file(self):
    """Check that `--model-metadata-file` values reach the coder's model info.

    Writes a metadata JSON file declaring `max_input_tokens` for one model,
    runs `main` with that model and file, and asserts the returned coder's
    `main_model.info` reports the same value.
    """
    with GitTemporaryDirectory():
        # must be a fully qualified model name: provider/...
        model_name = "deepseek/deepseek-chat"
        token_limit = 1234

        metadata_path = Path(".aider.model.metadata.json")
        metadata_json = json.dumps({model_name: {"max_input_tokens": token_limit}})
        metadata_path.write_text(metadata_json)

        argv = [
            "--model",
            model_name,
            "--model-metadata-file",
            str(metadata_path),
            "--exit",
            "--yes",
        ]
        coder = main(
            argv,
            input=DummyInput(),
            output=DummyOutput(),
            return_coder=True,
        )

        self.assertEqual(coder.main_model.info["max_input_tokens"], token_limit)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue