Paul Gauthier 2024-08-15 11:13:20 -07:00
parent 353b631091
commit 679e1b8990
3 changed files with 136 additions and 351 deletions

View file

@@ -40,27 +40,6 @@
versions: 0.50.2-dev
seconds_per_case: 5.7
total_cost: 0.8417
- dirname: 2024-08-15-13-20-11--json-no-lint-gpt-4o-2024-05-13-whole
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 56.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 8.0
total_cost: 1.5034
- dirname: 2024-08-15-13-21-55--json-no-lint-gpt-4o-2024-05-13-func
test_cases: 133
model: gpt-4o-2024-05-13
@@ -208,27 +187,6 @@
versions: 0.50.2-dev
seconds_per_case: 6.4
total_cost: 0.8390
- dirname: 2024-08-15-13-53-23--json-no-lint-gpt-4o-2024-05-13-whole-2
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 59.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 7.4
total_cost: 1.4996
- dirname: 2024-08-15-13-54-53--json-no-lint-gpt-4o-2024-05-13-func-2
test_cases: 133
model: gpt-4o-2024-05-13
@@ -376,27 +334,6 @@
versions: 0.50.2-dev
seconds_per_case: 5.6
total_cost: 0.8220
- dirname: 2024-08-15-14-14-40--json-no-lint-gpt-4o-2024-05-13-whole-3
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 61.7
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 6
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 8.8
total_cost: 1.4993
- dirname: 2024-08-15-14-16-34--json-no-lint-gpt-4o-2024-05-13-func-3
test_cases: 133
model: gpt-4o-2024-05-13
@@ -544,27 +481,6 @@
versions: 0.50.2-dev
seconds_per_case: 6.0
total_cost: 0.8394
- dirname: 2024-08-15-14-30-48--json-no-lint-gpt-4o-2024-05-13-whole-4
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 61.7
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 6
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 12.3
total_cost: 1.4919
- dirname: 2024-08-15-14-32-58--json-no-lint-gpt-4o-2024-05-13-func-4
test_cases: 133
model: gpt-4o-2024-05-13
@@ -712,27 +628,6 @@
versions: 0.50.2-dev
seconds_per_case: 6.3
total_cost: 0.8354
- dirname: 2024-08-15-14-47-39--json-no-lint-gpt-4o-2024-05-13-whole-5
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 60.2
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 10.7
total_cost: 1.4982
- dirname: 2024-08-15-14-49-44--json-no-lint-gpt-4o-2024-05-13-func-5
test_cases: 133
model: gpt-4o-2024-05-13
@@ -922,3 +817,108 @@
versions: 0.50.2-dev
seconds_per_case: 6.1
total_cost: 0.8415
- dirname: 2024-08-15-17-36-22--json-no-lint-again-gpt-4o-2024-05-13-whole-1
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 60.2
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 7
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 6.8
total_cost: 1.5110
- dirname: 2024-08-15-17-38-13--json-no-lint-again-gpt-4o-2024-05-13-whole-2
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 60.9
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 7.0
total_cost: 1.4954
- dirname: 2024-08-15-17-40-10--json-no-lint-again-gpt-4o-2024-05-13-whole-3
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 60.9
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 6.8
total_cost: 1.4999
- dirname: 2024-08-15-17-41-30--json-no-lint-again-gpt-4o-2024-05-13-whole-4
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 58.6
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 7.4
total_cost: 1.4848
- dirname: 2024-08-15-17-43-12--json-no-lint-again-gpt-4o-2024-05-13-whole-5
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 59.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 7.6
total_cost: 1.4948

View file

@@ -56,7 +56,8 @@ document.addEventListener('DOMContentLoaded', function () {
title: {
display: true,
text: 'Total syntactic errors from 5 runs'
}
},
max: 35
}
},
plugins: {

View file

@@ -12,155 +12,12 @@ nav_exclude: true
# LLMs are bad at returning code in JSON
<div id="chartContainer" style="position: relative; height: 0; padding-bottom: 50%; margin-bottom: 20px;">
<canvas id="passRateChart" style="position: absolute; width: 100%; height: 100%;"></canvas>
</div>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
document.addEventListener('DOMContentLoaded', function () {
var ctx = document.getElementById('passRateChart').getContext('2d');
var yamlData = {{ site.data.code-in-json | jsonify }};
var models = [...new Set(yamlData.map(item => item.model))].sort();
var editFormats = [...new Set(yamlData.map(item => item.edit_format))];
var datasets = editFormats.map(format => ({
label: format,
data: models.map(model => {
var items = yamlData.filter(d => d.model === model && d.edit_format === format);
if (items.length === 0) return null;
var average = items.reduce((sum, item) => sum + item.pass_rate_1, 0) / items.length;
return parseFloat(average.toFixed(1));
}),
backgroundColor: function(context) {
const format = context.dataset.label;
if (format === 'Markdown') {
return 'rgba(54, 162, 235, 0.8)';
} else if (format.startsWith('JSON')) {
const ctx = context.chart.ctx;
const gradient = ctx.createPattern(createStripedCanvas(format === 'JSON (strict)'), 'repeat');
return gradient;
} else {
return 'rgba(75, 192, 192, 0.8)';
}
},
}));
var data = {
labels: models,
datasets: datasets
};
var config = {
type: 'bar',
data: data,
options: {
responsive: true,
maintainAspectRatio: false,
scales: {
x: {
title: {
display: true,
text: 'Model'
}
},
y: {
beginAtZero: true,
title: {
display: true,
text: 'Pass Rate (%, average of 5 runs)'
},
max: 70
}
},
plugins: {
title: {
display: true,
text: 'Pass rate by model and code wrapping strategy',
font: {
size: 16
}
},
legend: {
position: 'top',
}
}
}
};
// Adjust chart height based on screen width
function adjustChartHeight() {
var container = document.getElementById('chartContainer');
if (window.innerWidth < 600) {
container.style.paddingBottom = '75%'; // Increase height on small screens
} else {
container.style.paddingBottom = '50%'; // Default height
}
}
// Call the function initially and on window resize
adjustChartHeight();
window.addEventListener('resize', adjustChartHeight);
function createStripedCanvas(isStrict) {
const patternCanvas = document.createElement('canvas');
const patternContext = patternCanvas.getContext('2d');
const size = 10;
patternCanvas.width = size;
patternCanvas.height = size;
patternContext.fillStyle = 'rgba(255, 99, 132, 0.8)';
patternContext.fillRect(0, 0, size, size);
if (isStrict) {
patternContext.strokeStyle = 'rgba(255, 255, 255, 0.8)';
patternContext.lineWidth = 0.75;
patternContext.beginPath();
patternContext.moveTo(0, 0);
patternContext.lineTo(size, size);
patternContext.stroke();
}
return patternCanvas;
}
new Chart(ctx, config);
});
function createStripedCanvas(isStrict) {
const patternCanvas = document.createElement('canvas');
const patternContext = patternCanvas.getContext('2d');
const size = 10;
patternCanvas.width = size;
patternCanvas.height = size;
patternContext.fillStyle = 'rgba(255, 99, 132, 0.8)';
patternContext.fillRect(0, 0, size, size);
if (isStrict) {
patternContext.strokeStyle = 'rgba(255, 255, 255, 0.8)';
patternContext.lineWidth = 0.75;
patternContext.beginPath();
patternContext.moveTo(0, 0);
patternContext.lineTo(size, size);
patternContext.stroke();
}
return patternCanvas;
}
</script>
> Figure 1: Benchmark scores of models using either plain markdown text or JSON to return code.
> Models produce better code when they return it as plain markdown text, as compared to wrapping it in JSON for a tool function call.
## Abstract
Current LLMs have support for returning properly formatted JSON,
making it easier for clients to reliably parse complex responses.
It therefore seems attractive for
AI coding applications ask LLMs to return code in structure JSON replies.
AI coding applications to ask LLMs to return code in structured JSON replies.
Unfortunately,
LLMs write worse code when asked to wrap it in JSON, harming their ability
to correctly solve coding tasks.
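To make the contrast concrete, here is a minimal sketch of the two reply styles. It is illustrative only: the `write_file` tool name and the file path are invented, not the benchmark's actual prompt or tool definition.

```js
// Illustrative sketch: the same small function returned two ways.
const code = 'def greet(name):\n    return f"Hello, {name}!"\n';

// 1. Plain markdown text: the model writes the code directly in a fenced block,
//    so newlines, quotes and indentation appear as-is.
//    (Tildes are used for the inner fence only to avoid nesting backticks here.)
const markdownReply = "Here is the updated file:\n\n~~~python\n" + code + "~~~\n";

// 2. JSON tool function call: the same code must be escaped into a single
//    JSON string value ("\n" for newlines, "\"" for quotes).
const jsonToolCall = JSON.stringify({
  name: "write_file",                             // hypothetical tool name
  arguments: { path: "greet.py", content: code },
});

console.log(markdownReply);
console.log(jsonToolCall);
```

Every newline and quote in the JSON form has to be escaped correctly, which is the extra burden that seems to harm the benchmark results discussed below.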
@@ -172,6 +29,13 @@ This holds true across many top coding LLMs,
including OpenAI's latest model gpt-4o-2024-08-06
which has strong JSON support.
{% include code-in-json-benchmark.js %}
> Figure 1: Benchmark scores of models using either plain markdown text or JSON to return code,
> averaged over 5 runs.
> Models produce better code when they return it as plain markdown text, as compared to wrapping it in JSON for a tool function call.
## Introduction
A lot of people wonder why aider doesn't use LLM tools for code editing.
@@ -244,9 +108,8 @@ capable models.
OpenAI's newly announced support for "strict" JSON seemed like a good reason to
investigate whether the newest models are still handicapped by JSON-wrapping code.
The graph above shows benchmark
results from
4 of the strongest code editing models:
Four of the strongest code editing models were benchmarked
to assess the impact of JSON-wrapping code:
- claude-3-5-sonnet-20240620
- deepseek-coder (V2 0724)
@@ -302,15 +165,16 @@ portions of a file.
This experimental setup is designed to highlight
the effects of JSON-wrapping on the LLMs' ability to write code to solve a task.
The results in the graph are the average of 5 runs for each
model & strategy combination.
## Results
Each of the 4 models was benchmarked 5 times using the different
strategies for returning code.
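The aggregation itself is simple; the sketch below mirrors the logic in the included benchmark chart script and shows how one bar in Figure 1 is computed from the YAML records in this commit.

```js
// Average pass_rate_1 over the runs recorded for one model & edit_format pair
// (mirrors the aggregation in code-in-json-benchmark.js).
function averagePassRate(records, model, editFormat) {
  const runs = records.filter(r => r.model === model && r.edit_format === editFormat);
  if (runs.length === 0) return null;
  const sum = runs.reduce((acc, r) => acc + r.pass_rate_1, 0);
  return parseFloat((sum / runs.length).toFixed(1));
}

// Example: the five gpt-4o-2024-05-13 "whole" (Markdown) runs added in this commit
// average to (60.2 + 60.9 + 60.9 + 58.6 + 59.4) / 5 = 60.0
```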
## Overall coding skill
All of the models did worse on the benchmark when asked to
As shown in Figure 1,
all of the models did worse on the benchmark when asked to
return JSON-wrapped code in a tool function call.
Most did significantly worse, performing far below
the result obtained with the markdown strategy.
@@ -319,109 +183,29 @@ Some noteworthy observations:
- OpenAI's gpt-4o-2024-05-13 was the only model where the markdown and JSON results were
close. Using JSON only dropped the score by 0.3 percent, a difference which is
probably within the margin of error for 5 trials.
- The use of OpenAI's new strict mode seemed to harm the results for gpt-4o-2024-08-06
as compared to non-strict JSON.
within the margin of error for 5 trials.
- The use of OpenAI's new strict mode offered no improvement
as compared to non-strict JSON (see the sketch of a strict tool definition after this list).
Of course, both JSON results were well below the markdown result.
- The results from Sonnet and DeepSeek Coder suffered the worst harm from JSON wrapping.
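As a concrete reference for the strict-mode bullet above, this is roughly what a strict tool definition for returning code looks like, assuming an OpenAI-style `tools` parameter; the tool name and schema fields are illustrative, not the benchmark's exact definition.

```js
// Illustrative sketch of a "strict" tool definition (OpenAI structured-outputs style).
// Strict mode constrains the reply to match this schema; it does not make the
// escaped code inside the "content" string any easier for the model to write.
const tools = [
  {
    type: "function",
    function: {
      name: "write_file",                 // hypothetical tool name
      description: "Return the complete, updated source file.",
      strict: true,                       // opt in to strict schema adherence
      parameters: {
        type: "object",
        properties: {
          explanation: { type: "string" },
          content: { type: "string" },    // the whole file, escaped as one JSON string
        },
        required: ["explanation", "content"],
        additionalProperties: false,      // strict mode requires this
      },
    },
  },
];
```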
## Syntax errors
<div id="syntaxErrorsContainer" style="position: relative; height: 0; padding-bottom: 50%; margin-bottom: 20px;">
<canvas id="syntaxErrorsChart" style="position: absolute; width: 100%; height: 100%;"></canvas>
</div>
Figure 2 shows the number of syntactic errors found in the code produced by each
model and code wrapping strategy.
Models tend to make more syntactic errors when asked to wrap code in JSON.
<script>
document.addEventListener('DOMContentLoaded', function () {
var ctx = document.getElementById('syntaxErrorsChart').getContext('2d');
var yamlData = {{ site.data.code-in-json | jsonify }};
var models = [...new Set(yamlData.map(item => item.model))].sort();
var editFormats = [...new Set(yamlData.map(item => item.edit_format))];
var datasets = editFormats.map(format => ({
label: format,
data: models.map(model => {
var items = yamlData.filter(d => d.model === model && d.edit_format === format);
if (items.length === 0) return null;
var totalErrors = items.reduce((sum, item) => sum + item.syntax_errors + item.indentation_errors, 0);
return totalErrors;
}),
backgroundColor: function(context) {
const format = context.dataset.label;
if (format === 'Markdown') {
return 'rgba(54, 162, 235, 0.8)';
} else if (format.startsWith('JSON')) {
const ctx = context.chart.ctx;
const gradient = ctx.createPattern(createStripedCanvas(format === 'JSON (strict)'), 'repeat');
return gradient;
} else {
return 'rgba(75, 192, 192, 0.8)';
}
},
}));
Sonnet avoided syntactic errors regardless of the code wrapping strategy,
but its benchmark scores in Figure 1 were lower with JSON.
This seems to indicate that JSON-wrapping
does more than simply raise the syntactic difficulty in coding.
It may distract or challenge the model in a way that
reduces its ability to reason about coding problems.
var data = {
labels: models,
datasets: datasets
};
{% include code-in-json-syntax.js %}
var config = {
type: 'bar',
data: data,
options: {
responsive: true,
maintainAspectRatio: false,
scales: {
x: {
title: {
display: true,
text: 'Model'
}
},
y: {
beginAtZero: true,
title: {
display: true,
text: 'Total Syntax + Indentation Errors'
}
}
},
plugins: {
title: {
display: true,
text: 'Syntax and Indentation Errors by Model and Code Wrapping Strategy',
font: {
size: 16
}
},
legend: {
position: 'top',
}
}
}
};
// Adjust chart height based on screen width
function adjustChartHeight() {
var container = document.getElementById('syntaxErrorsContainer');
if (window.innerWidth < 600) {
container.style.paddingBottom = '75%'; // Increase height on small screens
} else {
container.style.paddingBottom = '50%'; // Default height
}
}
// Call the function initially and on window resize
adjustChartHeight();
window.addEventListener('resize', adjustChartHeight);
new Chart(ctx, config);
});
</script>
> Figure 2: Number of `SyntaxError` and `IndentationError` errors found in model generated code.
> Figure 2: Number of `SyntaxError` and `IndentationError` errors found in model generated code,
> totaled from 5 runs.
> Models tend to make more syntactic errors when asked to wrap code in JSON.
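For completeness, the Figure 2 totals come from the same data file, summed rather than averaged; this sketch mirrors the aggregation in the included code-in-json-syntax.js chart script.

```js
// Total SyntaxError + IndentationError counts over the runs for one
// model & edit_format pair (mirrors code-in-json-syntax.js).
function totalSyntaxErrors(records, model, editFormat) {
  return records
    .filter(r => r.model === model && r.edit_format === editFormat)
    .reduce((sum, r) => sum + r.syntax_errors + r.indentation_errors, 0);
}

// Example: the five Markdown runs added in this commit contribute
// 7 + 0 + 0 + 0 + 0 = 7 syntax errors and no indentation errors.
```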