Paul Gauthier 2024-08-15 11:13:20 -07:00
parent 353b631091
commit 679e1b8990
3 changed files with 136 additions and 351 deletions

View file

@@ -40,27 +40,6 @@
versions: 0.50.2-dev
seconds_per_case: 5.7
total_cost: 0.8417
- dirname: 2024-08-15-13-20-11--json-no-lint-gpt-4o-2024-05-13-whole
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 56.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 8.0
total_cost: 1.5034
- dirname: 2024-08-15-13-21-55--json-no-lint-gpt-4o-2024-05-13-func
test_cases: 133
model: gpt-4o-2024-05-13
@@ -208,27 +187,6 @@
versions: 0.50.2-dev
seconds_per_case: 6.4
total_cost: 0.8390
- dirname: 2024-08-15-13-53-23--json-no-lint-gpt-4o-2024-05-13-whole-2
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 59.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 7.4
total_cost: 1.4996
- dirname: 2024-08-15-13-54-53--json-no-lint-gpt-4o-2024-05-13-func-2
test_cases: 133
model: gpt-4o-2024-05-13
@@ -376,27 +334,6 @@
versions: 0.50.2-dev
seconds_per_case: 5.6
total_cost: 0.8220
- dirname: 2024-08-15-14-14-40--json-no-lint-gpt-4o-2024-05-13-whole-3
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 61.7
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 6
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 8.8
total_cost: 1.4993
- dirname: 2024-08-15-14-16-34--json-no-lint-gpt-4o-2024-05-13-func-3
test_cases: 133
model: gpt-4o-2024-05-13
@@ -544,27 +481,6 @@
versions: 0.50.2-dev
seconds_per_case: 6.0
total_cost: 0.8394
- dirname: 2024-08-15-14-30-48--json-no-lint-gpt-4o-2024-05-13-whole-4
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 61.7
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 6
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 12.3
total_cost: 1.4919
- dirname: 2024-08-15-14-32-58--json-no-lint-gpt-4o-2024-05-13-func-4
test_cases: 133
model: gpt-4o-2024-05-13
@@ -712,27 +628,6 @@
versions: 0.50.2-dev
seconds_per_case: 6.3
total_cost: 0.8354
- dirname: 2024-08-15-14-47-39--json-no-lint-gpt-4o-2024-05-13-whole-5
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: bac04a2
pass_rate_1: 60.2
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 9
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 10.7
total_cost: 1.4982
- dirname: 2024-08-15-14-49-44--json-no-lint-gpt-4o-2024-05-13-func-5
test_cases: 133
model: gpt-4o-2024-05-13
@@ -922,3 +817,108 @@
versions: 0.50.2-dev
seconds_per_case: 6.1
total_cost: 0.8415
- dirname: 2024-08-15-17-36-22--json-no-lint-again-gpt-4o-2024-05-13-whole-1
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 60.2
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 7
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 6.8
total_cost: 1.5110
- dirname: 2024-08-15-17-38-13--json-no-lint-again-gpt-4o-2024-05-13-whole-2
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 60.9
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 7.0
total_cost: 1.4954
- dirname: 2024-08-15-17-40-10--json-no-lint-again-gpt-4o-2024-05-13-whole-3
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 60.9
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 6.8
total_cost: 1.4999
- dirname: 2024-08-15-17-41-30--json-no-lint-again-gpt-4o-2024-05-13-whole-4
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 58.6
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 7.4
total_cost: 1.4848
- dirname: 2024-08-15-17-43-12--json-no-lint-again-gpt-4o-2024-05-13-whole-5
test_cases: 133
model: gpt-4o-2024-05-13
edit_format: Markdown
commit_hash: ed94379
pass_rate_1: 59.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model gpt-4o-2024-05-13
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 7.6
total_cost: 1.4948

View file

@@ -56,7 +56,8 @@ document.addEventListener('DOMContentLoaded', function () {
title: {
display: true,
text: 'Total syntactic errors from 5 runs'
}
},
max: 35
}
},
plugins: {

View file

@@ -12,155 +12,12 @@ nav_exclude: true
# LLMs are bad at returning code in JSON
<div id="chartContainer" style="position: relative; height: 0; padding-bottom: 50%; margin-bottom: 20px;">
<canvas id="passRateChart" style="position: absolute; width: 100%; height: 100%;"></canvas>
</div>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
document.addEventListener('DOMContentLoaded', function () {
var ctx = document.getElementById('passRateChart').getContext('2d');
var yamlData = {{ site.data.code-in-json | jsonify }};
var models = [...new Set(yamlData.map(item => item.model))].sort();
var editFormats = [...new Set(yamlData.map(item => item.edit_format))];
var datasets = editFormats.map(format => ({
label: format,
data: models.map(model => {
var items = yamlData.filter(d => d.model === model && d.edit_format === format);
if (items.length === 0) return null;
var average = items.reduce((sum, item) => sum + item.pass_rate_1, 0) / items.length;
return parseFloat(average.toFixed(1));
}),
backgroundColor: function(context) {
const format = context.dataset.label;
if (format === 'Markdown') {
return 'rgba(54, 162, 235, 0.8)';
} else if (format.startsWith('JSON')) {
const ctx = context.chart.ctx;
const gradient = ctx.createPattern(createStripedCanvas(format === 'JSON (strict)'), 'repeat');
return gradient;
} else {
return 'rgba(75, 192, 192, 0.8)';
}
},
}));
var data = {
labels: models,
datasets: datasets
};
var config = {
type: 'bar',
data: data,
options: {
responsive: true,
maintainAspectRatio: false,
scales: {
x: {
title: {
display: true,
text: 'Model'
}
},
y: {
beginAtZero: true,
title: {
display: true,
text: 'Pass Rate (%, average of 5 runs)'
},
max: 70
}
},
plugins: {
title: {
display: true,
text: 'Pass rate by model and code wrapping strategy',
font: {
size: 16
}
},
legend: {
position: 'top',
}
}
}
};
// Adjust chart height based on screen width
function adjustChartHeight() {
var container = document.getElementById('chartContainer');
if (window.innerWidth < 600) {
container.style.paddingBottom = '75%'; // Increase height on small screens
} else {
container.style.paddingBottom = '50%'; // Default height
}
}
// Call the function initially and on window resize
adjustChartHeight();
window.addEventListener('resize', adjustChartHeight);
function createStripedCanvas(isStrict) {
const patternCanvas = document.createElement('canvas');
const patternContext = patternCanvas.getContext('2d');
const size = 10;
patternCanvas.width = size;
patternCanvas.height = size;
patternContext.fillStyle = 'rgba(255, 99, 132, 0.8)';
patternContext.fillRect(0, 0, size, size);
if (isStrict) {
patternContext.strokeStyle = 'rgba(255, 255, 255, 0.8)';
patternContext.lineWidth = 0.75;
patternContext.beginPath();
patternContext.moveTo(0, 0);
patternContext.lineTo(size, size);
patternContext.stroke();
}
return patternCanvas;
}
new Chart(ctx, config);
});
function createStripedCanvas(isStrict) {
const patternCanvas = document.createElement('canvas');
const patternContext = patternCanvas.getContext('2d');
const size = 10;
patternCanvas.width = size;
patternCanvas.height = size;
patternContext.fillStyle = 'rgba(255, 99, 132, 0.8)';
patternContext.fillRect(0, 0, size, size);
if (isStrict) {
patternContext.strokeStyle = 'rgba(255, 255, 255, 0.8)';
patternContext.lineWidth = 0.75;
patternContext.beginPath();
patternContext.moveTo(0, 0);
patternContext.lineTo(size, size);
patternContext.stroke();
}
return patternCanvas;
}
</script>
> Figure 1: Benchmark scores of models using either plain markdown text or JSON to return code.
> Models produce better code when they return it as plain markdown text, as compared to wrapping it in JSON for a tool function call.
## Abstract
Current LLMs have support for returning properly formatted JSON,
making it easier for clients to reliably parse complex responses.
It therefore seems attractive for
AI coding applications ask LLMs to return code in structure JSON replies.
AI coding applications to ask LLMs to return code in structured JSON replies.
Unfortunately,
LLMs write worse code when asked to wrap it in JSON, harming their ability
to correctly solve coding tasks.
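To make the contrast concrete, here is a minimal sketch of the two reply styles. It is illustrative only: the `write_file` tool name and the file path are invented, not the benchmark's actual prompt or tool definition.

```js
// Illustrative sketch: the same small function returned two ways.
const code = 'def greet(name):\n    return f"Hello, {name}!"\n';

// 1. Plain markdown text: the model writes the code directly in a fenced block,
//    so newlines, quotes and indentation appear as-is.
//    (Tildes are used for the inner fence only to avoid nesting backticks here.)
const markdownReply = "Here is the updated file:\n\n~~~python\n" + code + "~~~\n";

// 2. JSON tool function call: the same code must be escaped into a single
//    JSON string value ("\n" for newlines, "\"" for quotes).
const jsonToolCall = JSON.stringify({
  name: "write_file",                             // hypothetical tool name
  arguments: { path: "greet.py", content: code },
});

console.log(markdownReply);
console.log(jsonToolCall);
```

Every newline and quote in the JSON form has to be escaped correctly, which is the extra burden that seems to harm the benchmark results discussed below.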
@@ -172,6 +29,13 @@ This holds true across many top coding LLMs,
including OpenAI's latest model gpt-4o-2024-08-06
which has strong JSON support.
{% include code-in-json-benchmark.js %}
> Figure 1: Benchmark scores of models using either plain markdown text or JSON to return code,
> averaged over 5 runs.
> Models produce better code when they return it as plain markdown text, as compared to wrapping it in JSON for a tool function call.
## Introduction
A lot of people wonder why aider doesn't use LLM tools for code editing.
@@ -244,9 +108,8 @@ capable models.
OpenAI's newly announced support for "strict" JSON seemed like a good reason to
investigate whether the newest models are still handicapped by JSON-wrapping code.
The graph above shows benchmark
results from
4 of the strongest code editing models:
Four of the strongest code editing models were benchmarked
to assess the impact of JSON-wrapping code:
- claude-3-5-sonnet-20240620
- deepseek-coder (V2 0724)
@@ -302,15 +165,16 @@ portions of a file.
This experimental setup is designed to highlight
the effects of JSON-wrapping on the LLMs' ability to write code to solve a task.
The results in the graph are the average of 5 runs for each
model & strategy combination.
## Results
Each of the 4 models was benchmarked 5 times using the different
strategies for returning code.
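The aggregation itself is simple; the sketch below mirrors the logic in the included benchmark chart script and shows how one bar in Figure 1 is computed from the YAML records in this commit.

```js
// Average pass_rate_1 over the runs recorded for one model & edit_format pair
// (mirrors the aggregation in code-in-json-benchmark.js).
function averagePassRate(records, model, editFormat) {
  const runs = records.filter(r => r.model === model && r.edit_format === editFormat);
  if (runs.length === 0) return null;
  const sum = runs.reduce((acc, r) => acc + r.pass_rate_1, 0);
  return parseFloat((sum / runs.length).toFixed(1));
}

// Example: the five gpt-4o-2024-05-13 "whole" (Markdown) runs added in this commit
// average to (60.2 + 60.9 + 60.9 + 58.6 + 59.4) / 5 = 60.0
```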
## Overall coding skill
All of the models did worse on the benchmark when asked to
As shown in Figure 1,
all of the models did worse on the benchmark when asked to
return JSON-wrapped code in a tool function call.
Most did significantly worse, performing far below
the result obtained with the markdown strategy.
@@ -319,109 +183,29 @@ Some noteworthy observations:
- OpenAI's gpt-4o-2024-05-13 was the only model where the markdown and JSON results were
close. Using JSON only dropped the score by 0.3 percent, a difference which is
probably within the margin of error for 5 trials.
- The use of OpenAI's new strict mode seemed to harm the results for gpt-4o-2024-08-06
as compared to non-strict JSON.
within the margin of error for 5 trials.
- The use of OpenAI's new strict mode offered no improvement
as compared to non-strict JSON (see the sketch of a strict tool definition after this list).
Of course, both JSON results were well below the markdown result.
- The results from Sonnet and DeepSeek Coder suffered the worst harm from JSON wrapping.
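As a concrete reference for the strict-mode bullet above, this is roughly what a strict tool definition for returning code looks like, assuming an OpenAI-style `tools` parameter; the tool name and schema fields are illustrative, not the benchmark's exact definition.

```js
// Illustrative sketch of a "strict" tool definition (OpenAI structured-outputs style).
// Strict mode constrains the reply to match this schema; it does not make the
// escaped code inside the "content" string any easier for the model to write.
const tools = [
  {
    type: "function",
    function: {
      name: "write_file",                 // hypothetical tool name
      description: "Return the complete, updated source file.",
      strict: true,                       // opt in to strict schema adherence
      parameters: {
        type: "object",
        properties: {
          explanation: { type: "string" },
          content: { type: "string" },    // the whole file, escaped as one JSON string
        },
        required: ["explanation", "content"],
        additionalProperties: false,      // strict mode requires this
      },
    },
  },
];
```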
## Syntax errors
<div id="syntaxErrorsContainer" style="position: relative; height: 0; padding-bottom: 50%; margin-bottom: 20px;">
<canvas id="syntaxErrorsChart" style="position: absolute; width: 100%; height: 100%;"></canvas>
</div>
Figure 2 shows the number of syntactic errors found in the code produced by each
model and code wrapping strategy.
Models tend to make more syntactic errors when asked to wrap code in JSON.
<script>
document.addEventListener('DOMContentLoaded', function () {
var ctx = document.getElementById('syntaxErrorsChart').getContext('2d');
var yamlData = {{ site.data.code-in-json | jsonify }};
var models = [...new Set(yamlData.map(item => item.model))].sort();
var editFormats = [...new Set(yamlData.map(item => item.edit_format))];
var datasets = editFormats.map(format => ({
label: format,
data: models.map(model => {
var items = yamlData.filter(d => d.model === model && d.edit_format === format);
if (items.length === 0) return null;
var totalErrors = items.reduce((sum, item) => sum + item.syntax_errors + item.indentation_errors, 0);
return totalErrors;
}),
backgroundColor: function(context) {
const format = context.dataset.label;
if (format === 'Markdown') {
return 'rgba(54, 162, 235, 0.8)';
} else if (format.startsWith('JSON')) {
const ctx = context.chart.ctx;
const gradient = ctx.createPattern(createStripedCanvas(format === 'JSON (strict)'), 'repeat');
return gradient;
} else {
return 'rgba(75, 192, 192, 0.8)';
}
},
}));
Sonnet avoided syntactic errors regardless of the code wrapping strategy,
but its benchmark scores in Figure 1 were lower with JSON.
This seems to indicate that JSON-wrapping
does more than simply raise the syntactic difficulty in coding.
It may distract or challenge the model in a way that
reduces its ability to reason about coding problems.
var data = {
labels: models,
datasets: datasets
};
{% include code-in-json-syntax.js %}
var config = {
type: 'bar',
data: data,
options: {
responsive: true,
maintainAspectRatio: false,
scales: {
x: {
title: {
display: true,
text: 'Model'
}
},
y: {
beginAtZero: true,
title: {
display: true,
text: 'Total Syntax + Indentation Errors'
}
}
},
plugins: {
title: {
display: true,
text: 'Syntax and Indentation Errors by Model and Code Wrapping Strategy',
font: {
size: 16
}
},
legend: {
position: 'top',
}
}
}
};
// Adjust chart height based on screen width
function adjustChartHeight() {
var container = document.getElementById('syntaxErrorsContainer');
if (window.innerWidth < 600) {
container.style.paddingBottom = '75%'; // Increase height on small screens
} else {
container.style.paddingBottom = '50%'; // Default height
}
}
// Call the function initially and on window resize
adjustChartHeight();
window.addEventListener('resize', adjustChartHeight);
new Chart(ctx, config);
});
</script>
> Figure 2: Number of `SyntaxError` and `IndentationError` errors found in model generated code.
> Figure 2: Number of `SyntaxError` and `IndentationError` errors found in model generated code,
> totaled from 5 runs.
> Models tend to make more syntactic errors when asked to wrap code in JSON.
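For completeness, the Figure 2 totals come from the same data file, summed rather than averaged; this sketch mirrors the aggregation in the included code-in-json-syntax.js chart script.

```js
// Total SyntaxError + IndentationError counts over the runs for one
// model & edit_format pair (mirrors code-in-json-syntax.js).
function totalSyntaxErrors(records, model, editFormat) {
  return records
    .filter(r => r.model === model && r.edit_format === editFormat)
    .reduce((sum, r) => sum + r.syntax_errors + r.indentation_errors, 0);
}

// Example: the five Markdown runs added in this commit contribute
// 7 + 0 + 0 + 0 + 0 = 7 syntax errors and no indentation errors.
```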