diff --git a/aider/website/_data/code-in-json.yml b/aider/website/_data/code-in-json.yml index 78efd129f..d983aefa8 100644 --- a/aider/website/_data/code-in-json.yml +++ b/aider/website/_data/code-in-json.yml @@ -40,27 +40,6 @@ versions: 0.50.2-dev seconds_per_case: 5.7 total_cost: 0.8417 -- dirname: 2024-08-15-13-20-11--json-no-lint-gpt-4o-2024-05-13-whole - test_cases: 133 - model: gpt-4o-2024-05-13 - edit_format: Markdown - commit_hash: bac04a2 - pass_rate_1: 56.4 - percent_cases_well_formed: 100.0 - error_outputs: 0 - num_malformed_responses: 0 - num_with_malformed_responses: 0 - user_asks: 0 - lazy_comments: 0 - syntax_errors: 0 - indentation_errors: 0 - exhausted_context_windows: 0 - test_timeouts: 1 - command: aider --model gpt-4o-2024-05-13 - date: 2024-08-15 - versions: 0.50.2-dev - seconds_per_case: 8.0 - total_cost: 1.5034 - dirname: 2024-08-15-13-21-55--json-no-lint-gpt-4o-2024-05-13-func test_cases: 133 model: gpt-4o-2024-05-13 @@ -208,27 +187,6 @@ versions: 0.50.2-dev seconds_per_case: 6.4 total_cost: 0.8390 -- dirname: 2024-08-15-13-53-23--json-no-lint-gpt-4o-2024-05-13-whole-2 - test_cases: 133 - model: gpt-4o-2024-05-13 - edit_format: Markdown - commit_hash: bac04a2 - pass_rate_1: 59.4 - percent_cases_well_formed: 100.0 - error_outputs: 0 - num_malformed_responses: 0 - num_with_malformed_responses: 0 - user_asks: 0 - lazy_comments: 0 - syntax_errors: 0 - indentation_errors: 0 - exhausted_context_windows: 0 - test_timeouts: 0 - command: aider --model gpt-4o-2024-05-13 - date: 2024-08-15 - versions: 0.50.2-dev - seconds_per_case: 7.4 - total_cost: 1.4996 - dirname: 2024-08-15-13-54-53--json-no-lint-gpt-4o-2024-05-13-func-2 test_cases: 133 model: gpt-4o-2024-05-13 @@ -376,27 +334,6 @@ versions: 0.50.2-dev seconds_per_case: 5.6 total_cost: 0.8220 -- dirname: 2024-08-15-14-14-40--json-no-lint-gpt-4o-2024-05-13-whole-3 - test_cases: 133 - model: gpt-4o-2024-05-13 - edit_format: Markdown - commit_hash: bac04a2 - pass_rate_1: 61.7 - 
percent_cases_well_formed: 100.0 - error_outputs: 0 - num_malformed_responses: 0 - num_with_malformed_responses: 0 - user_asks: 0 - lazy_comments: 0 - syntax_errors: 6 - indentation_errors: 0 - exhausted_context_windows: 0 - test_timeouts: 1 - command: aider --model gpt-4o-2024-05-13 - date: 2024-08-15 - versions: 0.50.2-dev - seconds_per_case: 8.8 - total_cost: 1.4993 - dirname: 2024-08-15-14-16-34--json-no-lint-gpt-4o-2024-05-13-func-3 test_cases: 133 model: gpt-4o-2024-05-13 @@ -544,27 +481,6 @@ versions: 0.50.2-dev seconds_per_case: 6.0 total_cost: 0.8394 -- dirname: 2024-08-15-14-30-48--json-no-lint-gpt-4o-2024-05-13-whole-4 - test_cases: 133 - model: gpt-4o-2024-05-13 - edit_format: Markdown - commit_hash: bac04a2 - pass_rate_1: 61.7 - percent_cases_well_formed: 100.0 - error_outputs: 0 - num_malformed_responses: 0 - num_with_malformed_responses: 0 - user_asks: 0 - lazy_comments: 0 - syntax_errors: 6 - indentation_errors: 0 - exhausted_context_windows: 0 - test_timeouts: 0 - command: aider --model gpt-4o-2024-05-13 - date: 2024-08-15 - versions: 0.50.2-dev - seconds_per_case: 12.3 - total_cost: 1.4919 - dirname: 2024-08-15-14-32-58--json-no-lint-gpt-4o-2024-05-13-func-4 test_cases: 133 model: gpt-4o-2024-05-13 @@ -712,27 +628,6 @@ versions: 0.50.2-dev seconds_per_case: 6.3 total_cost: 0.8354 -- dirname: 2024-08-15-14-47-39--json-no-lint-gpt-4o-2024-05-13-whole-5 - test_cases: 133 - model: gpt-4o-2024-05-13 - edit_format: Markdown - commit_hash: bac04a2 - pass_rate_1: 60.2 - percent_cases_well_formed: 100.0 - error_outputs: 0 - num_malformed_responses: 0 - num_with_malformed_responses: 0 - user_asks: 0 - lazy_comments: 0 - syntax_errors: 9 - indentation_errors: 0 - exhausted_context_windows: 0 - test_timeouts: 1 - command: aider --model gpt-4o-2024-05-13 - date: 2024-08-15 - versions: 0.50.2-dev - seconds_per_case: 10.7 - total_cost: 1.4982 - dirname: 2024-08-15-14-49-44--json-no-lint-gpt-4o-2024-05-13-func-5 test_cases: 133 model: gpt-4o-2024-05-13 @@ -922,3 
+817,108 @@ versions: 0.50.2-dev seconds_per_case: 6.1 total_cost: 0.8415 +- dirname: 2024-08-15-17-36-22--json-no-lint-again-gpt-4o-2024-05-13-whole-1 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 60.2 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 7 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.8 + total_cost: 1.5110 +- dirname: 2024-08-15-17-38-13--json-no-lint-again-gpt-4o-2024-05-13-whole-2 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 60.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 7.0 + total_cost: 1.4954 +- dirname: 2024-08-15-17-40-10--json-no-lint-again-gpt-4o-2024-05-13-whole-3 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 60.9 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 0 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.8 + total_cost: 1.4999 +- dirname: 2024-08-15-17-41-30--json-no-lint-again-gpt-4o-2024-05-13-whole-4 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 58.6 + percent_cases_well_formed: 100.0 + 
error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 7.4 + total_cost: 1.4848 +- dirname: 2024-08-15-17-43-12--json-no-lint-again-gpt-4o-2024-05-13-whole-5 + test_cases: 133 + model: gpt-4o-2024-05-13 + edit_format: Markdown + commit_hash: ed94379 + pass_rate_1: 59.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-05-13 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 7.6 + total_cost: 1.4948 diff --git a/aider/website/_includes/code-in-json-syntax.js b/aider/website/_includes/code-in-json-syntax.js index 77d347cda..b315edea9 100644 --- a/aider/website/_includes/code-in-json-syntax.js +++ b/aider/website/_includes/code-in-json-syntax.js @@ -56,7 +56,8 @@ document.addEventListener('DOMContentLoaded', function () { title: { display: true, text: 'Total syntactic errors from 5 runs' - } + }, + max: 35 } }, plugins: { diff --git a/aider/website/_posts/2024-08-14-code-in-json.md b/aider/website/_posts/2024-08-14-code-in-json.md index fe6a63466..6546e1dfa 100644 --- a/aider/website/_posts/2024-08-14-code-in-json.md +++ b/aider/website/_posts/2024-08-14-code-in-json.md @@ -12,155 +12,12 @@ nav_exclude: true # LLMs are bad at returning code in JSON -
- -
- - - - -> Figure 1: Benchmark scores of models using either plain markdown text or JSON to return code. -> Models produce better code when they return it as plain markdown text, as compared to wrapping it in JSON for a tool function call. - ## Abstract Current LLMs have support for returning properly formatted JSON, making it easier for clients to reliably parse complex responses. It therefore seems attractive for -AI coding applications ask LLMs to return code in structure JSON replies. +AI coding applications to ask LLMs to return code in structured JSON replies. Unfortunately, LLMs write worse code when asked to wrap it in JSON, harming their ability to correctly solve coding tasks. @@ -172,6 +29,13 @@ This holds true across many top coding LLMs, including OpenAI's latest model gpt-4o-2024-08-06 which has strong JSON support. +{% include code-in-json-benchmark.js %} + +> Figure 1: Benchmark scores of models using either plain markdown text or JSON to return code, +> averaged over 5 runs. +> Models produce better code when they return it as plain markdown text, as compared to wrapping it in JSON for a tool function call. + + ## Introduction A lot of people wonder why aider doesn't use LLM tools for code editing. @@ -244,9 +108,8 @@ capable models. OpenAI's newly announced support for "strict" JSON seemed like a good reason to investigate whether the newest models are still handicapped by JSON-wrapping code. -The graph above shows benchmark -results from -4 of the strongest code editing models: +Four of the strongest code editing models were benchmarked +to assess the impact of JSON-wrapping code: - claude-3-5-sonnet-20240620 - deepseek-coder (V2 0724) @@ -302,15 +165,16 @@ portions of a file. This experimental setup is designed to highlight the effects of JSON-wrapping on the LLMs ability to write code to solve a task. -The results in the graph are the average of 5 runs for each -model & strategy combination.
## Results +Each of the 4 models was benchmarked 5 times using the different +strategies for returning code. ## Overall coding skill -All of the models did worse on the benchmark when asked to +As shown in Figure 1, +all of the models did worse on the benchmark when asked to return JSON-wrapped code in a tool function call. Most did significantly worse, performing far below the result obtained with the markdown strategy. @@ -319,109 +183,29 @@ Some noteworthy observations: - OpenAI's gpt-4o-2024-05-13 was the only model where the markdown and JSON results were close. Using JSON only dropped the score by 0.3 percent, a difference which is -probably within the margin of error for 5 trials. -- The use of OpenAI's new strict mode seemed to harm the results for gpt-4o-2024-08-06 -as compared to non-strict JSON. +within the margin of error for 5 trials. +- The use of OpenAI's new strict mode offered no improvement +as compared to non-strict JSON. Of course, both JSON results were well below the markdown result. - The results from Sonnet and DeepSeek Coder suffered the worst harm from JSON wrapping. ## Syntax errors -
- -
+Figure 2 shows the number of syntactic errors found in the code produced by each +model and code wrapping strategy. +Models tend to make more syntactic errors when asked to wrap code in JSON. - - -> Figure 2: Number of `SyntaxError` and `IndentationError` errors found in model generated code. +> Figure 2: Number of `SyntaxError` and `IndentationError` errors found in model generated code, +> totaled from 5 runs. > Models tend to make more syntactic errors when asked to wrap code in JSON.