diff --git a/aider/coders/base_coder.py b/aider/coders/base_coder.py index b4fb86a48..6372d41d6 100755 --- a/aider/coders/base_coder.py +++ b/aider/coders/base_coder.py @@ -1009,7 +1009,8 @@ class Coder: ) except Exception as err: self.io.tool_error(f"Unexpected error: {err}") - traceback.print_exc() + lines = traceback.format_exception(type(err), err, err.__traceback__) + self.io.tool_error("".join(lines)) return finally: if self.mdstream: diff --git a/aider/models.py b/aider/models.py index cad99d1df..4e5bf74ca 100644 --- a/aider/models.py +++ b/aider/models.py @@ -516,7 +516,11 @@ class Model: def token_count(self, messages): if type(messages) is list: - return litellm.token_counter(model=self.name, messages=messages) + try: + return litellm.token_counter(model=self.name, messages=messages) + except Exception as err: + print(f"Unable to count tokens: {err}") + return 0 if not self.tokenizer: return diff --git a/aider/website/_data/code-in-json.yml b/aider/website/_data/code-in-json.yml new file mode 100644 index 000000000..d983aefa8 --- /dev/null +++ b/aider/website/_data/code-in-json.yml @@ -0,0 +1,924 @@ +- dirname: 2024-08-15-13-17-11--json-no-lint-gpt-4o-2024-08-06-whole + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 4.3 + total_cost: 0.7965 +- dirname: 2024-08-15-13-18-36--json-no-lint-gpt-4o-2024-08-06-func + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 57.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + 
lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 5.7 + total_cost: 0.8417 +- dirname: 2024-08-15-13-21-55--json-no-lint-gpt-4o-2024-05-13-func + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 7.1 + total_cost: 1.2285 +- dirname: 2024-08-15-13-23-33--json-no-lint-claude-3.5-sonnet-whole + test_cases: 133 + model: claude-3.5-sonnet + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 10.5 + total_cost: 1.6714 +- dirname: 2024-08-15-13-24-56--json-no-lint-claude-3.5-sonnet-func + test_cases: 133 + model: claude-3.5-sonnet + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 53.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 9.7 + total_cost: 1.5980 +- dirname: 2024-08-15-13-26-38--json-no-lint-deepseek-coder-whole + test_cases: 133 + model: 
deepseek-coder V2 0724 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 59.4 + percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 2 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 27.9 + total_cost: 0.0438 +- dirname: 2024-08-15-13-29-55--json-no-lint-deepseek-coder-func + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 49.6 + percent_cases_well_formed: 100.0 + error_outputs: 3 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 4 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 20.5 + total_cost: 0.0329 +- dirname: 2024-08-15-13-50-03--json-no-lint-gpt-4o-2024-08-06-whole-2 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 61.7 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 4.2 + total_cost: 0.7946 +- dirname: 2024-08-15-13-51-36--json-no-lint-gpt-4o-2024-08-06-func-2 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 56.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + 
command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.4 + total_cost: 0.8390 +- dirname: 2024-08-15-13-54-53--json-no-lint-gpt-4o-2024-05-13-func-2 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 7.7 + total_cost: 1.2210 +- dirname: 2024-08-15-13-56-21--json-no-lint-claude-3.5-sonnet-whole-2 + test_cases: 133 + model: claude-3.5-sonnet + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.9 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 16.5 + total_cost: 1.6556 +- dirname: 2024-08-15-14-02-15--json-no-lint-claude-3.5-sonnet-func-2 + test_cases: 133 + model: claude-3.5-sonnet + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 51.9 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 14.3 + total_cost: 1.5835 +- dirname: 2024-08-15-14-06-12--json-no-lint-deepseek-coder-whole-2 + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.9 + 
percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 1 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 25.8 + total_cost: 0.0439 +- dirname: 2024-08-15-14-09-22--json-no-lint-deepseek-coder-func-2 + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 53.4 + percent_cases_well_formed: 100.0 + error_outputs: 5 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 6 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 18.8 + total_cost: 0.0333 +- dirname: 2024-08-15-14-11-45--json-no-lint-gpt-4o-2024-08-06-whole-3 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 4.3 + total_cost: 0.7945 +- dirname: 2024-08-15-14-13-11--json-no-lint-gpt-4o-2024-08-06-func-3 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 56.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + 
seconds_per_case: 5.6 + total_cost: 0.8220 +- dirname: 2024-08-15-14-16-34--json-no-lint-gpt-4o-2024-05-13-func-3 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 58.6 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 8.7 + total_cost: 1.2064 +- dirname: 2024-08-15-14-17-51--json-no-lint-claude-3.5-sonnet-whole-3 + test_cases: 133 + model: claude-3.5-sonnet + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 11.0 + total_cost: 1.6555 +- dirname: 2024-08-15-14-19-19--json-no-lint-claude-3.5-sonnet-func-3 + test_cases: 133 + model: claude-3.5-sonnet + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 51.1 + percent_cases_well_formed: 100.0 + error_outputs: 3 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 10.3 + total_cost: 1.5614 +- dirname: 2024-08-15-14-21-06--json-no-lint-deepseek-coder-whole-3 + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 61.7 + percent_cases_well_formed: 100.0 + error_outputs: 3 + num_malformed_responses: 0 + 
num_with_malformed_responses: 0 + user_asks: 2 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 24.4 + total_cost: 0.0439 +- dirname: 2024-08-15-14-24-46--json-no-lint-deepseek-coder-func-3 + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 52.6 + percent_cases_well_formed: 100.0 + error_outputs: 3 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 12 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 19.0 + total_cost: 0.0334 +- dirname: 2024-08-15-14-27-17--json-no-lint-gpt-4o-2024-08-06-whole-4 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 4.3 + total_cost: 0.8015 +- dirname: 2024-08-15-14-28-58--json-no-lint-gpt-4o-2024-08-06-func-4 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.0 + total_cost: 0.8394 +- dirname: 
2024-08-15-14-32-58--json-no-lint-gpt-4o-2024-05-13-func-4 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 59.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 2 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 11.1 + total_cost: 1.2120 +- dirname: 2024-08-15-14-34-39--json-no-lint-claude-3.5-sonnet-whole-4 + test_cases: 133 + model: claude-3.5-sonnet + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 11.3 + total_cost: 1.6635 +- dirname: 2024-08-15-14-36-18--json-no-lint-claude-3.5-sonnet-func-4 + test_cases: 133 + model: claude-3.5-sonnet + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 55.6 + percent_cases_well_formed: 100.0 + error_outputs: 1 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 10.5 + total_cost: 1.5768 +- dirname: 2024-08-15-14-38-35--json-no-lint-deepseek-coder-whole-4 + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 59.4 + percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 2 + lazy_comments: 0 + 
syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 24.5 + total_cost: 0.0438 +- dirname: 2024-08-15-14-41-36--json-no-lint-deepseek-coder-func-4 + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 49.6 + percent_cases_well_formed: 100.0 + error_outputs: 7 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 2 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 18.7 + total_cost: 0.0333 +- dirname: 2024-08-15-14-44-11--json-no-lint-gpt-4o-2024-08-06-whole-5 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 4.6 + total_cost: 0.8023 +- dirname: 2024-08-15-14-45-40--json-no-lint-gpt-4o-2024-08-06-func-5 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 57.1 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 3 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.3 + total_cost: 0.8354 +- dirname: 2024-08-15-14-49-44--json-no-lint-gpt-4o-2024-05-13-func-5 + test_cases: 133 + model: gpt-4o-2024-05-13 
+ edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 59.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 4 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 10.5 + total_cost: 1.2099 +- dirname: 2024-08-15-14-51-18--json-no-lint-claude-3.5-sonnet-whole-5 + test_cases: 133 + model: claude-3.5-sonnet + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 11.4 + total_cost: 1.6685 +- dirname: 2024-08-15-14-52-48--json-no-lint-claude-3.5-sonnet-func-5 + test_cases: 133 + model: claude-3.5-sonnet + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 53.4 + percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model claude-3.5-sonnet + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 10.8 + total_cost: 1.5786 +- dirname: 2024-08-15-14-54-41--json-no-lint-deepseek-coder-whole-5 + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: Markdown + commit_hash: bac04a2 + pass_rate_1: 61.7 + percent_cases_well_formed: 100.0 + error_outputs: 2 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 2 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider 
--model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 24.5 + total_cost: 0.0439 +- dirname: 2024-08-15-14-57-51--json-no-lint-deepseek-coder-func-5 + test_cases: 133 + model: deepseek-coder V2 0724 + edit_format: JSON + commit_hash: bac04a2 + pass_rate_1: 53.4 + percent_cases_well_formed: 100.0 + error_outputs: 5 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 4 + indentation_errors: 1 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model deepseek-coder + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 18.5 + total_cost: 0.0330 +- dirname: 2024-08-15-15-12-55--json-no-lint-strict-gpt-4o-2024-08-06-func-2 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON (strict) + commit_hash: bf2d5fe + pass_rate_1: 57.1 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 5.9 + total_cost: 0.8216 +- dirname: 2024-08-15-15-14-31--json-no-lint-strict-gpt-4o-2024-08-06-func-3 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON (strict) + commit_hash: bf2d5fe + pass_rate_1: 54.1 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 2 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.3 + total_cost: 0.8410 +- dirname: 2024-08-15-15-16-14--json-no-lint-strict-gpt-4o-2024-08-06-func-4 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON (strict) + commit_hash: bf2d5fe + pass_rate_1: 59.4 + 
percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 5.9 + total_cost: 0.8203 +- dirname: 2024-08-15-15-17-50--json-no-lint-strict-gpt-4o-2024-08-06-func-5 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON (strict) + commit_hash: bf2d5fe + pass_rate_1: 57.1 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.1 + total_cost: 0.8415 +- dirname: 2024-08-15-17-36-22--json-no-lint-again-gpt-4o-2024-05-13-whole-1 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 7 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.8 + total_cost: 1.5110 +- dirname: 2024-08-15-17-38-13--json-no-lint-again-gpt-4o-2024-05-13-whole-2 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 60.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-05-13 + date: 
2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 7.0 + total_cost: 1.4954 +- dirname: 2024-08-15-17-40-10--json-no-lint-again-gpt-4o-2024-05-13-whole-3 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 60.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.8 + total_cost: 1.4999 +- dirname: 2024-08-15-17-41-30--json-no-lint-again-gpt-4o-2024-05-13-whole-4 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 58.6 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 7.4 + total_cost: 1.4848 +- dirname: 2024-08-15-17-43-12--json-no-lint-again-gpt-4o-2024-05-13-whole-5 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 59.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 7.6 + total_cost: 1.4948 diff --git a/aider/website/_includes/code-in-json-benchmark.js b/aider/website/_includes/code-in-json-benchmark.js new file mode 100644 index 000000000..0a8f75e74 --- /dev/null +++ 
b/aider/website/_includes/code-in-json-benchmark.js @@ -0,0 +1,170 @@ +
{{ page.date | date: "%B %d, %Y" }}
+{% endif %} + +# LLMs are bad at returning code in JSON + + +Current LLMs have support for returning properly formatted JSON, +making it easier for clients to reliably parse complex responses. +It therefore seems attractive for +AI coding applications to ask LLMs to return code in structured JSON replies. +Unfortunately, +LLMs write worse code when asked to wrap it in JSON, harming their ability +to correctly solve coding tasks. +On a variant of the aider code editing benchmark, +asking for JSON-wrapped code +often harms coding performance. +This holds true across many top coding LLMs, +including OpenAI's latest model gpt-4o-2024-08-06 +which has strong JSON support. + +{% include code-in-json-benchmark.js %} + +> Figure 1: Benchmark scores of models using either plain markdown text or JSON to return code, +> averaged over 5 runs. +> Models produce better code when they return it as plain markdown text, as compared to wrapping it in JSON for a tool function call. + + +## Background + +A lot of people wonder why aider doesn't use LLM tools for code editing. +Instead, aider asks for code edits in plain text, like this: + +```` +greeting.py +``` +<<<<<<< SEARCH +def greeting(): + print("Hello") +======= +def greeting(): + print("Goodbye") +>>>>>>> REPLACE +``` +```` + +People expect that it would be easier and more reliable to use tool calls, +which would return a structured JSON response: + +```json +{ + "filename": "greeting.py", + "search": "def greeting():\n print(\"Hello\")\n", + "replace": "def greeting():\n print(\"Goodbye\")\n" +} +``` + +This has become even more tempting as LLM providers +continue to improve their tooling for reliably generating JSON. +For example, +[OpenAI recently announced](https://openai.com/index/introducing-structured-outputs-in-the-api/) +the ability to +strictly enforce that JSON responses will be syntactically correct +and conform to a specified schema. 
+ + +But producing valid (schema compliant) JSON is not sufficient for working with AI generated code. +The code inside the JSON has to correctly solve the requested task +and be free from syntax errors. +Unfortunately, +LLMs write worse code when they're asked to +wrap it in JSON. + +In some sense this shouldn't be surprising. +Just look at the very simple +JSON example above, with the escaped +quotes `\"` and +newlines `\n` +mixed into the code. +Imagine the additional +complexity +if the code itself contained quoted strings +with their +own escape sequences. + +Would *you* write better code by +typing it out normally +or as a properly escaped +JSON string? + + +## Quantifying the benefits of plain text + +Previous [aider benchmark results](/2023/07/02/benchmarks.html) +showed +the superiority of returning code +as plain text compared to JSON-wrapped function calls. +Those results were obtained +over a year ago, against far less +capable models. +OpenAI's newly announced support for "strict" JSON seemed like a good reason to +investigate whether the newest models are still handicapped by JSON-wrapping code. + +The results presented here were based on +the +[aider "code editing" benchmark](/2023/07/02/benchmarks.html#the-benchmark) +of 133 practice exercises from the Exercism python repository. +Models were +restricted to a single attempt, +without a second try to fix errors as is normal in the aider benchmark. + +The performance of each model was compared across different strategies for returning code: + +- **Markdown** -- the model returned the whole source code file in standard markdown triple-backtick fences. +- **JSON** -- the model used a tool function call to return the whole source code file. This required the LLM to wrap the code in JSON. +- **JSON (strict)** -- the same as the "JSON" strategy, but with `strict=True`. Only gpt-4o-2024-08-06 supports this setting. 
+ +The markdown strategy is the same as +aider's "whole" edit format, where the +LLM returns a source file like this: + +```` +Here is the program you asked for which prints "Hello": + +greeting.py +``` +def greeting(): + print("Hello") +``` +```` + +The JSON and JSON (strict) strategies required the LLM to call the `write_file` function with +two parameters, as shown below. +For maximum simplicity, the LLM didn't have to specify the filename, +since the benchmark operates on one source file at a time. + +```json +{ + "explanation": "Here is the program you asked for which prints \"Hello\"", + "content": "def greeting():\n print(\"Hello\")\n" +} +``` + +These strategies avoid actually *editing* source files, to keep +the task as +simple as possible. +The LLM is able to emit the whole source file intact, +which is much easier +than correctly formulating +instructions to edit +portions of a file. + +This experimental setup is designed to quantify +the effects of JSON-wrapping on the LLM's ability to write code to solve a task. + +## Results + +Four of the strongest code editing models were benchmarked +to assess the impact of JSON-wrapping code: + +- claude-3-5-sonnet-20240620 +- deepseek-coder (V2 0724) +- gpt-4o-2024-05-13 +- gpt-4o-2024-08-06 + +Each combination of model and code wrapping strategy was benchmarked 5 times. + +## Overall coding skill + +As shown in Figure 1, +all of the models did worse on the benchmark when asked to +return JSON-wrapped code in a tool function call. +Most did significantly worse, performing far below +the result obtained with the markdown strategy. + +Some noteworthy observations: + +- OpenAI's gpt-4o-2024-05-13 was the only model where the markdown and JSON results were +close. Using JSON only dropped the score by 0.3 percent, a difference which is +within the margin of error for 5 trials. +- The use of OpenAI's new strict mode offered no improvement +as compared to non-strict JSON. 
+Of course, both JSON results were well below the markdown result. +- The results from Sonnet and DeepSeek Coder suffered the worst harm from JSON wrapping. + +## Syntax errors + +Models tend to make more syntax errors when asked to wrap code in JSON. +Figure 2 shows the number of syntax errors found in the code produced by each +model and code wrapping strategy, +totaling up `SyntaxError` and `IndentationError` errors from all 5 runs. + + +Sonnet's results seem to indicate that the negative effects of JSON-wrapping +go beyond syntactic difficulties. +Sonnet avoided syntax errors regardless of the code wrapping strategy, +but its benchmark scores in Figure 1 were nonetheless lower with JSON. +This implies that JSON-wrapping may distract or challenge models in a way that +reduces their ability to reason about solving coding problems. + +{% include code-in-json-syntax.js %} + +> Figure 2: Number of `SyntaxError` and `IndentationError` errors found in model generated code, +> totaled from 5 runs. +> Models tend to make more syntax and formatting errors when asked to wrap code in JSON. + + +## Conclusions + +While the quantitative results differ from the similar +[July 2023 experiments](/2023/07/02/benchmarks.html), +the conclusion seems unchanged: LLMs are bad at returning code in JSON. + +OpenAI appears to be making progress in allowing LLMs to return code in +structured JSON responses without harming the code quality. +But it still seems premature to consider switching from plain text +to JSON-wrapped code. + + +## Notes on the aider leaderboard + +The results presented here are not directly comparable to results +from the main +[aider LLM leaderboard](https://aider.chat/docs/leaderboards/). +A number of settings were changed to simplify the benchmark +in order to focus on comparing plain text and JSON-wrapped code. 
diff --git a/aider/website/assets/models-over-time.svg b/aider/website/assets/models-over-time.svg index a4fe87061..8fd066630 100644 --- a/aider/website/assets/models-over-time.svg +++ b/aider/website/assets/models-over-time.svg @@ -6,7 +6,7 @@