From 04e816ff2e2de14359e69a7e903357e8697523e8 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Thu, 15 Aug 2024 09:49:51 -0700 Subject: [PATCH] copy --- aider/website/_data/code-in-json.yml | 324 +++++++++++------- .../website/_posts/2024-08-14-code-in-json.md | 118 ++++--- 2 files changed, 282 insertions(+), 160 deletions(-) diff --git a/aider/website/_data/code-in-json.yml b/aider/website/_data/code-in-json.yml index 0f2bbcbed..78efd129f 100644 --- a/aider/website/_data/code-in-json.yml +++ b/aider/website/_data/code-in-json.yml @@ -1,7 +1,7 @@ - dirname: 2024-08-15-13-17-11--json-no-lint-gpt-4o-2024-08-06-whole test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: whole + model: gpt-4o-2024-08-06 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -14,15 +14,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 4.3 total_cost: 0.7965 - dirname: 2024-08-15-13-18-36--json-no-lint-gpt-4o-2024-08-06-func test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: func + model: gpt-4o-2024-08-06 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 57.9 percent_cases_well_formed: 100.0 @@ -35,15 +35,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 5.7 total_cost: 0.8417 - dirname: 2024-08-15-13-20-11--json-no-lint-gpt-4o-2024-05-13-whole test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: whole + model: gpt-4o-2024-05-13 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 56.4 percent_cases_well_formed: 100.0 @@ -56,15 +56,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-05-13 + 
command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 8.0 total_cost: 1.5034 - dirname: 2024-08-15-13-21-55--json-no-lint-gpt-4o-2024-05-13-func test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: func + model: gpt-4o-2024-05-13 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -77,15 +77,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 7.1 total_cost: 1.2285 - dirname: 2024-08-15-13-23-33--json-no-lint-claude-3.5-sonnet-whole test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: whole + model: claude-3.5-sonnet + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -98,15 +98,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 10.5 total_cost: 1.6714 - dirname: 2024-08-15-13-24-56--json-no-lint-claude-3.5-sonnet-func test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: func + model: claude-3.5-sonnet + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 53.4 percent_cases_well_formed: 100.0 @@ -119,15 +119,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 9.7 total_cost: 1.5980 - dirname: 2024-08-15-13-26-38--json-no-lint-deepseek-coder-whole test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: whole + model: deepseek-coder V2 0724 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 59.4 
percent_cases_well_formed: 100.0 @@ -140,15 +140,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 27.9 total_cost: 0.0438 - dirname: 2024-08-15-13-29-55--json-no-lint-deepseek-coder-func test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: func + model: deepseek-coder V2 0724 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 49.6 percent_cases_well_formed: 100.0 @@ -161,15 +161,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 20.5 total_cost: 0.0329 - dirname: 2024-08-15-13-50-03--json-no-lint-gpt-4o-2024-08-06-whole-2 test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: whole + model: gpt-4o-2024-08-06 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 61.7 percent_cases_well_formed: 100.0 @@ -182,15 +182,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 4.2 total_cost: 0.7946 - dirname: 2024-08-15-13-51-36--json-no-lint-gpt-4o-2024-08-06-func-2 test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: func + model: gpt-4o-2024-08-06 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 56.4 percent_cases_well_formed: 100.0 @@ -203,15 +203,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 6.4 total_cost: 0.8390 - dirname: 2024-08-15-13-53-23--json-no-lint-gpt-4o-2024-05-13-whole-2 test_cases: 133 - model: 
openai/gpt-4o-2024-05-13 - edit_format: whole + model: gpt-4o-2024-05-13 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 59.4 percent_cases_well_formed: 100.0 @@ -224,15 +224,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 7.4 total_cost: 1.4996 - dirname: 2024-08-15-13-54-53--json-no-lint-gpt-4o-2024-05-13-func-2 test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: func + model: gpt-4o-2024-05-13 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -245,15 +245,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 7.7 total_cost: 1.2210 - dirname: 2024-08-15-13-56-21--json-no-lint-claude-3.5-sonnet-whole-2 test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: whole + model: claude-3.5-sonnet + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.9 percent_cases_well_formed: 100.0 @@ -266,15 +266,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 16.5 total_cost: 1.6556 - dirname: 2024-08-15-14-02-15--json-no-lint-claude-3.5-sonnet-func-2 test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: func + model: claude-3.5-sonnet + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 51.9 percent_cases_well_formed: 100.0 @@ -287,15 +287,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 
versions: 0.50.2-dev seconds_per_case: 14.3 total_cost: 1.5835 - dirname: 2024-08-15-14-06-12--json-no-lint-deepseek-coder-whole-2 test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: whole + model: deepseek-coder V2 0724 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.9 percent_cases_well_formed: 100.0 @@ -308,15 +308,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 25.8 total_cost: 0.0439 - dirname: 2024-08-15-14-09-22--json-no-lint-deepseek-coder-func-2 test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: func + model: deepseek-coder V2 0724 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 53.4 percent_cases_well_formed: 100.0 @@ -329,15 +329,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 18.8 total_cost: 0.0333 - dirname: 2024-08-15-14-11-45--json-no-lint-gpt-4o-2024-08-06-whole-3 test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: whole + model: gpt-4o-2024-08-06 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.9 percent_cases_well_formed: 100.0 @@ -350,15 +350,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 4.3 total_cost: 0.7945 - dirname: 2024-08-15-14-13-11--json-no-lint-gpt-4o-2024-08-06-func-3 test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: func + model: gpt-4o-2024-08-06 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 56.4 percent_cases_well_formed: 100.0 @@ -371,15 +371,15 @@ indentation_errors: 0 
exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 5.6 total_cost: 0.8220 - dirname: 2024-08-15-14-14-40--json-no-lint-gpt-4o-2024-05-13-whole-3 test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: whole + model: gpt-4o-2024-05-13 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 61.7 percent_cases_well_formed: 100.0 @@ -392,15 +392,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 8.8 total_cost: 1.4993 - dirname: 2024-08-15-14-16-34--json-no-lint-gpt-4o-2024-05-13-func-3 test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: func + model: gpt-4o-2024-05-13 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 58.6 percent_cases_well_formed: 100.0 @@ -413,15 +413,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 8.7 total_cost: 1.2064 - dirname: 2024-08-15-14-17-51--json-no-lint-claude-3.5-sonnet-whole-3 test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: whole + model: claude-3.5-sonnet + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -434,15 +434,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 11.0 total_cost: 1.6555 - dirname: 2024-08-15-14-19-19--json-no-lint-claude-3.5-sonnet-func-3 test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: func + model: 
claude-3.5-sonnet + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 51.1 percent_cases_well_formed: 100.0 @@ -455,15 +455,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 10.3 total_cost: 1.5614 - dirname: 2024-08-15-14-21-06--json-no-lint-deepseek-coder-whole-3 test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: whole + model: deepseek-coder V2 0724 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 61.7 percent_cases_well_formed: 100.0 @@ -476,15 +476,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 3 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 24.4 total_cost: 0.0439 - dirname: 2024-08-15-14-24-46--json-no-lint-deepseek-coder-func-3 test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: func + model: deepseek-coder V2 0724 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 52.6 percent_cases_well_formed: 100.0 @@ -497,15 +497,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 19.0 total_cost: 0.0334 - dirname: 2024-08-15-14-27-17--json-no-lint-gpt-4o-2024-08-06-whole-4 test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: whole + model: gpt-4o-2024-08-06 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -518,15 +518,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 4.3 total_cost: 
0.8015 - dirname: 2024-08-15-14-28-58--json-no-lint-gpt-4o-2024-08-06-func-4 test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: func + model: gpt-4o-2024-08-06 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -539,15 +539,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 6.0 total_cost: 0.8394 - dirname: 2024-08-15-14-30-48--json-no-lint-gpt-4o-2024-05-13-whole-4 test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: whole + model: gpt-4o-2024-05-13 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 61.7 percent_cases_well_formed: 100.0 @@ -560,15 +560,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 12.3 total_cost: 1.4919 - dirname: 2024-08-15-14-32-58--json-no-lint-gpt-4o-2024-05-13-func-4 test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: func + model: gpt-4o-2024-05-13 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 59.4 percent_cases_well_formed: 100.0 @@ -581,15 +581,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 11.1 total_cost: 1.2120 - dirname: 2024-08-15-14-34-39--json-no-lint-claude-3.5-sonnet-whole-4 test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: whole + model: claude-3.5-sonnet + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.9 percent_cases_well_formed: 100.0 @@ -602,15 +602,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model 
openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 11.3 total_cost: 1.6635 - dirname: 2024-08-15-14-36-18--json-no-lint-claude-3.5-sonnet-func-4 test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: func + model: claude-3.5-sonnet + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 55.6 percent_cases_well_formed: 100.0 @@ -623,15 +623,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 10.5 total_cost: 1.5768 - dirname: 2024-08-15-14-38-35--json-no-lint-deepseek-coder-whole-4 test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: whole + model: deepseek-coder V2 0724 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 59.4 percent_cases_well_formed: 100.0 @@ -644,15 +644,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 24.5 total_cost: 0.0438 - dirname: 2024-08-15-14-41-36--json-no-lint-deepseek-coder-func-4 test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: func + model: deepseek-coder V2 0724 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 49.6 percent_cases_well_formed: 100.0 @@ -665,15 +665,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 18.7 total_cost: 0.0333 - dirname: 2024-08-15-14-44-11--json-no-lint-gpt-4o-2024-08-06-whole-5 test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: whole + model: gpt-4o-2024-08-06 + edit_format: Markdown 
commit_hash: bac04a2 pass_rate_1: 60.9 percent_cases_well_formed: 100.0 @@ -686,15 +686,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 4.6 total_cost: 0.8023 - dirname: 2024-08-15-14-45-40--json-no-lint-gpt-4o-2024-08-06-func-5 test_cases: 133 - model: openai/gpt-4o-2024-08-06 - edit_format: func + model: gpt-4o-2024-08-06 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 57.1 percent_cases_well_formed: 100.0 @@ -707,15 +707,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 3 - command: aider --model openai/gpt-4o-2024-08-06 + command: aider --model gpt-4o-2024-08-06 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 6.3 total_cost: 0.8354 - dirname: 2024-08-15-14-47-39--json-no-lint-gpt-4o-2024-05-13-whole-5 test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: whole + model: gpt-4o-2024-05-13 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -728,15 +728,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 10.7 total_cost: 1.4982 - dirname: 2024-08-15-14-49-44--json-no-lint-gpt-4o-2024-05-13-func-5 test_cases: 133 - model: openai/gpt-4o-2024-05-13 - edit_format: func + model: gpt-4o-2024-05-13 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 59.4 percent_cases_well_formed: 100.0 @@ -749,15 +749,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openai/gpt-4o-2024-05-13 + command: aider --model gpt-4o-2024-05-13 date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 10.5 total_cost: 1.2099 - dirname: 2024-08-15-14-51-18--json-no-lint-claude-3.5-sonnet-whole-5 test_cases: 133 - 
model: openrouter/anthropic/claude-3.5-sonnet - edit_format: whole + model: claude-3.5-sonnet + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 60.2 percent_cases_well_formed: 100.0 @@ -770,15 +770,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 11.4 total_cost: 1.6685 - dirname: 2024-08-15-14-52-48--json-no-lint-claude-3.5-sonnet-func-5 test_cases: 133 - model: openrouter/anthropic/claude-3.5-sonnet - edit_format: func + model: claude-3.5-sonnet + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 53.4 percent_cases_well_formed: 100.0 @@ -791,15 +791,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 1 - command: aider --model openrouter/anthropic/claude-3.5-sonnet + command: aider --model claude-3.5-sonnet date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 10.8 total_cost: 1.5786 - dirname: 2024-08-15-14-54-41--json-no-lint-deepseek-coder-whole-5 test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: whole + model: deepseek-coder V2 0724 + edit_format: Markdown commit_hash: bac04a2 pass_rate_1: 61.7 percent_cases_well_formed: 100.0 @@ -812,15 +812,15 @@ indentation_errors: 0 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/deepseek/deepseek-coder + command: aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 24.5 total_cost: 0.0439 - dirname: 2024-08-15-14-57-51--json-no-lint-deepseek-coder-func-5 test_cases: 133 - model: openrouter/deepseek/deepseek-coder - edit_format: func + model: deepseek-coder V2 0724 + edit_format: JSON commit_hash: bac04a2 pass_rate_1: 53.4 percent_cases_well_formed: 100.0 @@ -833,8 +833,92 @@ indentation_errors: 1 exhausted_context_windows: 0 test_timeouts: 0 - command: aider --model openrouter/deepseek/deepseek-coder + command: 
aider --model deepseek-coder date: 2024-08-15 versions: 0.50.2-dev seconds_per_case: 18.5 total_cost: 0.0330 +- dirname: 2024-08-15-15-12-55--json-no-lint-strict-gpt-4o-2024-08-06-func-2 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON (strict) + commit_hash: bf2d5fe + pass_rate_1: 57.1 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 5.9 + total_cost: 0.8216 +- dirname: 2024-08-15-15-14-31--json-no-lint-strict-gpt-4o-2024-08-06-func-3 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON (strict) + commit_hash: bf2d5fe + pass_rate_1: 54.1 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 2 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.3 + total_cost: 0.8410 +- dirname: 2024-08-15-15-16-14--json-no-lint-strict-gpt-4o-2024-08-06-func-4 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON (strict) + commit_hash: bf2d5fe + pass_rate_1: 59.4 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 5.9 + total_cost: 0.8203 +- dirname: 2024-08-15-15-17-50--json-no-lint-strict-gpt-4o-2024-08-06-func-5 + test_cases: 133 + model: gpt-4o-2024-08-06 + edit_format: JSON (strict) + commit_hash: bf2d5fe + 
pass_rate_1: 57.1 + percent_cases_well_formed: 100.0 + error_outputs: 0 + num_malformed_responses: 0 + num_with_malformed_responses: 0 + user_asks: 0 + lazy_comments: 0 + syntax_errors: 1 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 1 + command: aider --model gpt-4o-2024-08-06 + date: 2024-08-15 + versions: 0.50.2-dev + seconds_per_case: 6.1 + total_cost: 0.8415 diff --git a/aider/website/_posts/2024-08-14-code-in-json.md b/aider/website/_posts/2024-08-14-code-in-json.md index 23b58aa33..9f3345971 100644 --- a/aider/website/_posts/2024-08-14-code-in-json.md +++ b/aider/website/_posts/2024-08-14-code-in-json.md @@ -1,6 +1,6 @@ --- -title: LLMs are bad at returning code in json -excerpt: LLMs write worse code if you ask them to return the code wrapped in json (via a tool or function call). +title: LLMs are bad at returning code in JSON +excerpt: LLMs write worse code if you ask them to return the code wrapped in JSON (via a tool or function call). highlight_image: /assets/code-in-json.jpg draft: true nav_exclude: true @@ -9,7 +9,7 @@ nav_exclude: true

{{ page.date | date: "%B %d, %Y" }}

{% endif %} -# LLMs are bad at returning code in json +# LLMs are bad at returning code in JSON @@ -67,7 +67,7 @@ document.addEventListener('DOMContentLoaded', function () { beginAtZero: true, title: { display: true, - text: 'Pass Rate (%)' + text: 'Pass Rate (%, average of 5 runs)' }, max: 70 } @@ -75,7 +75,7 @@ document.addEventListener('DOMContentLoaded', function () { plugins: { title: { display: true, - text: 'Pass rate by model and code return strategy', + text: 'Pass rate by model and code wrapping strategy', font: { size: 16 } @@ -116,20 +116,22 @@ document.addEventListener('DOMContentLoaded', function () { ## Abstract -The newest LLMs have support for returning properly formatted json responses, +The newest LLMs have support for returning properly formatted JSON responses, making it easy for client applications to parse complex responses. This makes it tempting for AI coding applications to use tool function calls or other structured reply formats to receive code from LLMs. Unfortunately, -LLMs write worse code when asked to wrap it in json, harming their ability +LLMs write worse code when asked to wrap it in JSON, harming their ability to correctly solve coding tasks. -Returning code as plain (markdown) text results in lower scores -on a variant of the aider code editing benchmark, often significantly harming coding -performance. +On a variant of the aider code editing benchmark, +JSON-wrapping code +often significantly harms coding +performance +compared to returning code as plain (markdown) text. This holds true across many top coding LLMs, -and even OpenAI's newest gpt-4o-2024-08-06 with "strict" json support -suffers from this code-in-json handicap. +and even OpenAI's newest gpt-4o-2024-08-06 with "strict" JSON support +suffers from this code-in-JSON handicap. 
## Introduction @@ -152,8 +154,7 @@ def greeting(): ```` People expect that it would be easier and more reliable to use tool calls, -and parse a nicely formatted json -response: +which would return a structured JSON response: ``` { @@ -165,32 +166,33 @@ response: This has become even more tempting as LLM providers continue to improve their tooling for reliably generating -valid json. +valid JSON. For example, OpenAI recently announced the ability to -[strictly enforce that json responses will be syntactically correct +[strictly enforce that JSON responses will be syntactically correct and conform to a specified schema](https://openai.com/index/introducing-structured-outputs-in-the-api/). -But producing valid (schema compliant) json is not sufficient for this use case. -The json also has to contain valid, high quality code. -And unfortunately, +But producing valid (schema compliant) JSON is not sufficient for this use case. +The JSON also has to contain valid, high quality code. +Unfortunately, LLMs write worse code when they're asked to -wrap it in json. +wrap it in JSON. In some sense this shouldn't be surprising. Just look at the very simple -json example above, with the escaped +JSON example above, with the escaped quotes `\"` and newlines `\n` mixed into the code. -Imagine if the code itself contained json or other quoted strings, +Imagine the additional +complexity +if the code itself contained JSON or other quoted strings, with their own escape sequences. -If you tried to write a program, -would you do a better job +Would *you* write better code by typing it out normally or as a properly escaped -json string? +JSON string? ## Quantifying the benefits of plain text @@ -198,31 +200,33 @@ json string? Previous [aider benchmark results](/2023/07/02/benchmarks.html) showed the superiority of returning code -as plain text coding compared to json-wrapped function calls. +as plain text compared to JSON-wrapped function calls. 
Those results were obtained over a year ago, against far less capable models. -OpenAI's newly announced support for "strict" json seemed like a good reason to -investigate whether the newest models are still handicapped by json-wrapping code. +OpenAI's newly announced support for "strict" JSON seemed like a good reason to +investigate whether the newest models are still handicapped by JSON-wrapping code. The graph above shows benchmark results from -3 of the strongest code editing models: +4 of the strongest code editing models: -- gpt-4o-2024-08-06 - claude-3-5-sonnet-20240620 - deepseek-coder (V2 0724) +- gpt-4o-2024-05-13 +- gpt-4o-2024-08-06 Each model was given one try to solve [133 practice exercises from the Exercism python repository](/2023/07/02/benchmarks.html#the-benchmark). This is the standard aider "code editing" benchmark, but restricted to a single attempt without a second try to "fix" any errors. -Each model was assessed by the benchmark using two -different strategies for returning code: +The benchmark assessed the models' coding ability +using different strategies for returning code: - **Markdown** -- the model returned the whole source code file in standard markdown triple-backtick fences. -- **Tool call** -- the model used a tool function call to return the whole source code file. This requires the LLM to wrap the code in json. +- **JSON** -- the model used a tool function call to return the whole source code file. This requires the LLM to wrap the code in JSON. +- **JSON (strict)** -- the same as the "JSON" strategy, but with `strict=True`. Only gpt-4o-2024-08-06 supports this setting. The markdown strategy is the same as aider's "whole" edit format, where the @@ -238,10 +242,10 @@ def greeting(): ``` ```` -The tool strategy requires the LLM to call the `write_file` function with +The JSON and JSON (strict) strategies required the LLM to call the `write_file` function with two parameters, as shown below. 
-For maximum simplicity, the LLM didn't even have to specify the filename, -since the benchmark operates only on a single source file. +For maximum simplicity, the LLM didn't have to specify the filename, +since the benchmark operates on one source file at a time. ``` { @@ -250,7 +254,7 @@ since the benchmark operates only on a single source file. } ``` -Both of these formats avoid actually *editing* source files, to keep +These strategies avoid actually *editing* source files, to keep the task as simple as possible. The LLM is able to emit the whole source file intact, @@ -260,9 +264,43 @@ instructions to edit portions of a file. This experimental setup is designed to highlight -the effects of json-wrapping on the LLMs ability to write code to solve a task. +the effects of JSON-wrapping on the LLMs' ability to write code to solve a task. +The results in the graph are the average of 5 runs for each +model & strategy combination. ## Results -All 3 models did significantly worse on the benchmark when asked to -return json-wrapped code in a tool function call. +All of the models did worse on the benchmark when asked to +return JSON-wrapped code in a tool function call. +Most did significantly worse, performing far below +the result obtained with the markdown strategy. + +Some noteworthy observations: + +- OpenAI's gpt-4o-2024-05-13 was the only model where the markdown and JSON results were +close. Using JSON only dropped the score by 0.3 percentage points, a difference which is +probably within the margin of error for 5 trials. +- The use of OpenAI's new strict mode seemed to harm the results for gpt-4o-2024-08-06 +as compared to non-strict JSON. +Of course, both JSON results were well below the markdown result. +- The results from Sonnet and DeepSeek Coder suffered the worst harm from JSON wrapping. 
+ +## Conclusions + +While the quantitative results differ from the similar +[July 2023 experiments](/2023/07/02/benchmarks.html), +the conclusion seems unchanged: LLMs are bad at returning code in JSON. + +OpenAI appears to be making progress in allowing LLMs to return code in +structured JSON responses without harming the code quality. +But it seems premature to consider switching from plain text +to JSON-wrapped code. + + +## Notes on the aider leaderboard + +The results presented here are not directly comparable to results +from the main +[aider LLM leaderboard](https://aider.chat/docs/leaderboards/). +A number of settings were changed to simplify the benchmark +in order to focus on comparing plain text and JSON-wrapped code.