Merge branch 'main' into json-coders

This commit is contained in:
Paul Gauthier 2024-08-16 11:31:55 -07:00
commit d3e37c9e36
10 changed files with 425 additions and 355 deletions

View file

@ -1,6 +1,11 @@
# Release history # Release history
### main branch
- Improved editing performance on Jupyter Notebook `.ipynb` files.
- Work around litellm tokenizer bug for images.
### Aider v0.50.1 ### Aider v0.50.1
- Bugfix for provider API exceptions. - Bugfix for provider API exceptions.

View file

@ -1,5 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
import base64
import hashlib import hashlib
import json import json
import locale import locale
@ -652,9 +653,11 @@ class Coder:
image_messages = [] image_messages = []
for fname, content in self.get_abs_fnames_content(): for fname, content in self.get_abs_fnames_content():
if is_image_file(fname): if is_image_file(fname):
with open(fname, "rb") as image_file:
encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
mime_type, _ = mimetypes.guess_type(fname) mime_type, _ = mimetypes.guess_type(fname)
if mime_type and mime_type.startswith("image/"): if mime_type and mime_type.startswith("image/"):
image_url = f"data:{mime_type};base64,{content}" image_url = f"data:{mime_type};base64,{encoded_string}"
rel_fname = self.get_rel_fname(fname) rel_fname = self.get_rel_fname(fname)
image_messages += [ image_messages += [
{"type": "text", "text": f"Image file: {rel_fname}"}, {"type": "text", "text": f"Image file: {rel_fname}"},

View file

@ -16,6 +16,11 @@ cog.out(text)
# Release history # Release history
### main branch
- Improved editing performance on Jupyter Notebook `.ipynb` files.
- Work around litellm tokenizer bug for images.
### Aider v0.50.1 ### Aider v0.50.1
- Bugfix for provider API exceptions. - Bugfix for provider API exceptions.

View file

@ -82,27 +82,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 10.5 seconds_per_case: 10.5
total_cost: 1.6714 total_cost: 1.6714
- dirname: 2024-08-15-13-24-56--json-no-lint-claude-3.5-sonnet-func
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 53.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 9.7
total_cost: 1.5980
- dirname: 2024-08-15-13-26-38--json-no-lint-deepseek-coder-whole - dirname: 2024-08-15-13-26-38--json-no-lint-deepseek-coder-whole
test_cases: 133 test_cases: 133
model: deepseek-coder V2 0724 model: deepseek-coder V2 0724
@ -124,27 +103,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 27.9 seconds_per_case: 27.9
total_cost: 0.0438 total_cost: 0.0438
- dirname: 2024-08-15-13-29-55--json-no-lint-deepseek-coder-func
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 49.6
percent_cases_well_formed: 100.0
error_outputs: 3
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 4
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 20.5
total_cost: 0.0329
- dirname: 2024-08-15-13-50-03--json-no-lint-gpt-4o-2024-08-06-whole-2 - dirname: 2024-08-15-13-50-03--json-no-lint-gpt-4o-2024-08-06-whole-2
test_cases: 133 test_cases: 133
model: gpt-4o-2024-08-06 model: gpt-4o-2024-08-06
@ -229,27 +187,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 16.5 seconds_per_case: 16.5
total_cost: 1.6556 total_cost: 1.6556
- dirname: 2024-08-15-14-02-15--json-no-lint-claude-3.5-sonnet-func-2
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 51.9
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 14.3
total_cost: 1.5835
- dirname: 2024-08-15-14-06-12--json-no-lint-deepseek-coder-whole-2 - dirname: 2024-08-15-14-06-12--json-no-lint-deepseek-coder-whole-2
test_cases: 133 test_cases: 133
model: deepseek-coder V2 0724 model: deepseek-coder V2 0724
@ -271,27 +208,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 25.8 seconds_per_case: 25.8
total_cost: 0.0439 total_cost: 0.0439
- dirname: 2024-08-15-14-09-22--json-no-lint-deepseek-coder-func-2
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 53.4
percent_cases_well_formed: 100.0
error_outputs: 5
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 6
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 18.8
total_cost: 0.0333
- dirname: 2024-08-15-14-11-45--json-no-lint-gpt-4o-2024-08-06-whole-3 - dirname: 2024-08-15-14-11-45--json-no-lint-gpt-4o-2024-08-06-whole-3
test_cases: 133 test_cases: 133
model: gpt-4o-2024-08-06 model: gpt-4o-2024-08-06
@ -376,27 +292,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 11.0 seconds_per_case: 11.0
total_cost: 1.6555 total_cost: 1.6555
- dirname: 2024-08-15-14-19-19--json-no-lint-claude-3.5-sonnet-func-3
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 51.1
percent_cases_well_formed: 100.0
error_outputs: 3
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 10.3
total_cost: 1.5614
- dirname: 2024-08-15-14-21-06--json-no-lint-deepseek-coder-whole-3 - dirname: 2024-08-15-14-21-06--json-no-lint-deepseek-coder-whole-3
test_cases: 133 test_cases: 133
model: deepseek-coder V2 0724 model: deepseek-coder V2 0724
@ -418,27 +313,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 24.4 seconds_per_case: 24.4
total_cost: 0.0439 total_cost: 0.0439
- dirname: 2024-08-15-14-24-46--json-no-lint-deepseek-coder-func-3
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 52.6
percent_cases_well_formed: 100.0
error_outputs: 3
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 12
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 19.0
total_cost: 0.0334
- dirname: 2024-08-15-14-27-17--json-no-lint-gpt-4o-2024-08-06-whole-4 - dirname: 2024-08-15-14-27-17--json-no-lint-gpt-4o-2024-08-06-whole-4
test_cases: 133 test_cases: 133
model: gpt-4o-2024-08-06 model: gpt-4o-2024-08-06
@ -523,27 +397,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 11.3 seconds_per_case: 11.3
total_cost: 1.6635 total_cost: 1.6635
- dirname: 2024-08-15-14-36-18--json-no-lint-claude-3.5-sonnet-func-4
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 55.6
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 10.5
total_cost: 1.5768
- dirname: 2024-08-15-14-38-35--json-no-lint-deepseek-coder-whole-4 - dirname: 2024-08-15-14-38-35--json-no-lint-deepseek-coder-whole-4
test_cases: 133 test_cases: 133
model: deepseek-coder V2 0724 model: deepseek-coder V2 0724
@ -565,27 +418,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 24.5 seconds_per_case: 24.5
total_cost: 0.0438 total_cost: 0.0438
- dirname: 2024-08-15-14-41-36--json-no-lint-deepseek-coder-func-4
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 49.6
percent_cases_well_formed: 100.0
error_outputs: 7
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 2
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 18.7
total_cost: 0.0333
- dirname: 2024-08-15-14-44-11--json-no-lint-gpt-4o-2024-08-06-whole-5 - dirname: 2024-08-15-14-44-11--json-no-lint-gpt-4o-2024-08-06-whole-5
test_cases: 133 test_cases: 133
model: gpt-4o-2024-08-06 model: gpt-4o-2024-08-06
@ -670,27 +502,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 11.4 seconds_per_case: 11.4
total_cost: 1.6685 total_cost: 1.6685
- dirname: 2024-08-15-14-52-48--json-no-lint-claude-3.5-sonnet-func-5
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 53.4
percent_cases_well_formed: 100.0
error_outputs: 2
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 10.8
total_cost: 1.5786
- dirname: 2024-08-15-14-54-41--json-no-lint-deepseek-coder-whole-5 - dirname: 2024-08-15-14-54-41--json-no-lint-deepseek-coder-whole-5
test_cases: 133 test_cases: 133
model: deepseek-coder V2 0724 model: deepseek-coder V2 0724
@ -712,27 +523,6 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 24.5 seconds_per_case: 24.5
total_cost: 0.0439 total_cost: 0.0439
- dirname: 2024-08-15-14-57-51--json-no-lint-deepseek-coder-func-5
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: bac04a2
pass_rate_1: 53.4
percent_cases_well_formed: 100.0
error_outputs: 5
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 4
indentation_errors: 1
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 18.5
total_cost: 0.0330
- dirname: 2024-08-15-15-12-55--json-no-lint-strict-gpt-4o-2024-08-06-func-2 - dirname: 2024-08-15-15-12-55--json-no-lint-strict-gpt-4o-2024-08-06-func-2
test_cases: 133 test_cases: 133
model: gpt-4o-2024-08-06 model: gpt-4o-2024-08-06
@ -922,3 +712,216 @@
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 7.6 seconds_per_case: 7.6
total_cost: 1.4948 total_cost: 1.4948
- dirname: 2024-08-15-19-35-32--json-no-lint-again-deepseek-coder-func-1
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: 3a2ac02-dirty
pass_rate_1: 50.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 2
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 17.8
total_cost: 0.0330
- dirname: 2024-08-15-19-37-50--json-no-lint-again-deepseek-coder-func-2
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: 1a98c28
pass_rate_1: 49.6
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 5
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 18.3
total_cost: 0.0336
- dirname: 2024-08-15-19-40-20--json-no-lint-again-deepseek-coder-func-3
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: 1a98c28
pass_rate_1: 48.9
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 5
indentation_errors: 1
exhausted_context_windows: 1
test_timeouts: 2
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 18.4
total_cost: 0.0337
- dirname: 2024-08-15-19-44-07--json-no-lint-again-deepseek-coder-func-4
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: 1a98c28
pass_rate_1: 53.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 2
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 17.6
total_cost: 0.0330
- dirname: 2024-08-15-19-46-48--json-no-lint-again-deepseek-coder-func-5
test_cases: 133
model: deepseek-coder V2 0724
edit_format: JSON
commit_hash: 1a98c28-dirty
pass_rate_1: 53.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 11
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
command: aider --model deepseek-coder
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 18.0
total_cost: 0.0332
- dirname: 2024-08-15-20-07-59--json-no-lint-again-claude-3.5-sonnet-func-1
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: 1a98c28
pass_rate_1: 54.1
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 9.5
total_cost: 1.5789
- dirname: 2024-08-15-20-09-39--json-no-lint-again-claude-3.5-sonnet-func-2
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: 1a98c28
pass_rate_1: 55.6
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 9.2
total_cost: 1.5916
- dirname: 2024-08-15-20-11-39--json-no-lint-again-claude-3.5-sonnet-func-3
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: 1a98c28
pass_rate_1: 53.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 10.3
total_cost: 1.5896
- dirname: 2024-08-15-20-13-44--json-no-lint-again-claude-3.5-sonnet-func-4
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: 1a98c28
pass_rate_1: 55.6
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 9.2
total_cost: 1.6000
- dirname: 2024-08-15-20-15-51--json-no-lint-again-claude-3.5-sonnet-func-5
test_cases: 133
model: claude-3.5-sonnet
edit_format: JSON
commit_hash: 1a98c28
pass_rate_1: 51.9
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
command: aider --model claude-3.5-sonnet
date: 2024-08-15
versions: 0.50.2-dev
seconds_per_case: 8.9
total_cost: 1.5936

View file

@ -1,11 +1,21 @@
<div id="chartContainer" style="position: relative; height: 0; padding-bottom: 50%; margin-bottom: 20px;"> <style>
<canvas id="passRateChart" style="position: absolute; width: 100%; height: 100%;"></canvas> .chart-container {
position: relative;
width: 100%;
max-width: 800px;
margin: 0 auto;
}
</style>
<div class="chart-container">
<canvas id="passRateChart"></canvas>
</div> </div>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script> <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script> <script>
document.addEventListener('DOMContentLoaded', function () { document.addEventListener('DOMContentLoaded', function () {
var ctx = document.getElementById('passRateChart').getContext('2d'); var ctx = document.getElementById('passRateChart').getContext('2d');
var chartContainer = document.querySelector('.chart-container');
var yamlData = {{ site.data.code-in-json | jsonify }}; var yamlData = {{ site.data.code-in-json | jsonify }};
@ -39,12 +49,19 @@ document.addEventListener('DOMContentLoaded', function () {
datasets: datasets datasets: datasets
}; };
function getAspectRatio() {
var width = chartContainer.offsetWidth;
// Gradually change aspect ratio from 2 (landscape) to 1 (square)
return Math.max(1, Math.min(2, width / 300));
}
var config = { var config = {
type: 'bar', type: 'bar',
data: data, data: data,
options: { options: {
responsive: true, responsive: true,
maintainAspectRatio: false, maintainAspectRatio: true,
aspectRatio: getAspectRatio(),
scales: { scales: {
x: { x: {
title: { title: {
@ -64,7 +81,7 @@ document.addEventListener('DOMContentLoaded', function () {
plugins: { plugins: {
title: { title: {
display: true, display: true,
text: 'Pass rate by model and code wrapping strategy', text: 'Coding skill by model and code wrapping strategy',
font: { font: {
size: 16 size: 16
} }
@ -91,6 +108,8 @@ document.addEventListener('DOMContentLoaded', function () {
plugins: [{ plugins: [{
afterDraw: function(chart) { afterDraw: function(chart) {
var ctx = chart.ctx; var ctx = chart.ctx;
var isWideScreen = window.innerWidth > 768; // Assuming 768px as the breakpoint for wide screens
if (isWideScreen) {
chart.data.datasets.forEach(function(dataset, i) { chart.data.datasets.forEach(function(dataset, i) {
var meta = chart.getDatasetMeta(i); var meta = chart.getDatasetMeta(i);
meta.data.forEach(function(bar, index) { meta.data.forEach(function(bar, index) {
@ -99,51 +118,27 @@ document.addEventListener('DOMContentLoaded', function () {
ctx.fillStyle = '#000000'; ctx.fillStyle = '#000000';
ctx.textAlign = 'center'; ctx.textAlign = 'center';
ctx.textBaseline = 'bottom'; ctx.textBaseline = 'bottom';
ctx.fillText(data.toFixed(1) + '%', bar.x, bar.y - 5); var displayText = data.toFixed(1) + '%';
ctx.fillText(displayText, bar.x, bar.y - 5);
} }
}); });
}); });
} }
}
}] }]
}; };
// Adjust chart height based on screen width var chart = new Chart(ctx, config);
function adjustChartHeight() {
var container = document.getElementById('chartContainer'); function resizeChart() {
if (window.innerWidth < 600) { chart.options.aspectRatio = getAspectRatio();
container.style.paddingBottom = '75%'; // Increase height on small screens chart.resize();
} else {
container.style.paddingBottom = '50%'; // Default height
}
} }
// Call the function initially and on window resize window.addEventListener('resize', resizeChart);
adjustChartHeight();
window.addEventListener('resize', adjustChartHeight);
function createStripedCanvas(isStrict) { // Initial resize to set correct size
const patternCanvas = document.createElement('canvas'); resizeChart();
const patternContext = patternCanvas.getContext('2d');
const size = 10;
patternCanvas.width = size;
patternCanvas.height = size;
patternContext.fillStyle = 'rgba(255, 99, 132, 0.8)';
patternContext.fillRect(0, 0, size, size);
if (isStrict) {
patternContext.strokeStyle = 'rgba(255, 255, 255, 0.8)';
patternContext.lineWidth = 0.75;
patternContext.beginPath();
patternContext.moveTo(0, 0);
patternContext.lineTo(size, size);
patternContext.stroke();
}
return patternCanvas;
}
new Chart(ctx, config);
}); });
function createStripedCanvas(isStrict) { function createStripedCanvas(isStrict) {

View file

@ -1,10 +1,21 @@
<div id="syntaxErrorsContainer" style="position: relative; height: 0; padding-bottom: 50%; margin-bottom: 20px;"> <style>
<canvas id="syntaxErrorsChart" style="position: absolute; width: 100%; height: 100%;"></canvas> .chart-container {
position: relative;
width: 100%;
max-width: 800px;
margin: 0 auto;
}
</style>
<div class="chart-container">
<canvas id="syntaxErrorsChart"></canvas>
</div> </div>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script> <script>
document.addEventListener('DOMContentLoaded', function () { document.addEventListener('DOMContentLoaded', function () {
var ctx = document.getElementById('syntaxErrorsChart').getContext('2d'); var ctx = document.getElementById('syntaxErrorsChart').getContext('2d');
var chartContainer = document.querySelector('.chart-container');
var yamlData = {{ site.data.code-in-json | jsonify }}; var yamlData = {{ site.data.code-in-json | jsonify }};
@ -38,12 +49,19 @@ document.addEventListener('DOMContentLoaded', function () {
datasets: datasets datasets: datasets
}; };
function getAspectRatio() {
var width = chartContainer.offsetWidth;
// Gradually change aspect ratio from 2 (landscape) to 1 (square)
return Math.max(1, Math.min(2, width / 300));
}
var config = { var config = {
type: 'bar', type: 'bar',
data: data, data: data,
options: { options: {
responsive: true, responsive: true,
maintainAspectRatio: false, maintainAspectRatio: true,
aspectRatio: getAspectRatio(),
scales: { scales: {
x: { x: {
title: { title: {
@ -106,20 +124,16 @@ document.addEventListener('DOMContentLoaded', function () {
}] }]
}; };
// Adjust chart height based on screen width var chart = new Chart(ctx, config);
function adjustChartHeight() {
var container = document.getElementById('syntaxErrorsContainer'); function resizeChart() {
if (window.innerWidth < 600) { chart.options.aspectRatio = getAspectRatio();
container.style.paddingBottom = '75%'; // Increase height on small screens chart.resize();
} else {
container.style.paddingBottom = '50%'; // Default height
}
} }
// Call the function initially and on window resize window.addEventListener('resize', resizeChart);
adjustChartHeight();
window.addEventListener('resize', adjustChartHeight);
new Chart(ctx, config); // Initial resize to set correct size
resizeChart();
}); });
</script> </script>

View file

@ -1,8 +1,7 @@
--- ---
title: LLMs are bad at returning code in JSON title: LLMs are bad at returning code in JSON
excerpt: LLMs write worse code if you ask them to return the code wrapped in JSON (via a tool or function call). excerpt: LLMs write worse code if you ask them to return the code wrapped in JSON via a tool function call.
highlight_image: /assets/code-in-json.jpg highlight_image: /assets/code-in-json.jpg
draft: true
nav_exclude: true nav_exclude: true
--- ---
{% if page.date %} {% if page.date %}
@ -12,35 +11,24 @@ nav_exclude: true
# LLMs are bad at returning code in JSON # LLMs are bad at returning code in JSON
Current LLMs have support for returning properly formatted JSON, LLMs produce lower quality code if theyre asked to return it as part of a structured JSON response. This seems to be true for many top models, including those with specialized support for JSON. Benchmarks show that models struggle with syntactic issues related to quoting and escaping.
making it easier for clients to reliably parse complex responses. The benchmark results also imply a decreased capacity for solving coding problems due to the burden of JSON formatting.
It therefore seems attractive for
AI coding applications ask LLMs to return code in structured JSON replies.
Unfortunately,
LLMs write worse code when asked to wrap it in JSON, harming their ability
to correctly solve coding tasks.
On a variant of the aider code editing benchmark,
asking for JSON-wrapped code
often harms coding performance.
This holds true across many top coding LLMs,
including OpenAI's latest model gpt-4o-2024-08-06
which has strong JSON support.
{% include code-in-json-benchmark.js %} {% include code-in-json-benchmark.js %}
> Figure 1: Benchmark scores of models using either plain markdown text or JSON to return code, > Figure 1: Aider coding benchmark scores of models using either plain markdown text or JSON to return code.
> averaged over 5 runs. > Pass rate (%) averaged over 5 runs.
> Models produce better code when they return it as plain markdown text, as compared to wrapping it in JSON for a tool function call. > Models produce better code when they return it as markdown text,
> as compared to returning code in a structured JSON response.
## Background ## Background
A lot of people wonder why aider doesn't use LLM tools for code editing. People often ask why aider uses a plain text format for LLMs to specify code edits (below),
Instead, aider asks for code edits in plain text, like this: rather than relying on LLM tools and structured JSON responses.
```` ```python
greeting.py greeting.py
```
<<<<<<< SEARCH <<<<<<< SEARCH
def greeting(): def greeting():
print("Hello") print("Hello")
@ -49,10 +37,9 @@ def greeting():
print("Goodbye") print("Goodbye")
>>>>>>> REPLACE >>>>>>> REPLACE
``` ```
````
People expect that it would be easier and more reliable to use tool calls, People expect that it would be easier and more reliable to use tool calls,
which would return a structured JSON response: which would involve a structured JSON response more like this:
```json ```json
{ {
@ -62,7 +49,7 @@ which would return a structured JSON response:
} }
``` ```
This has become even more tempting as LLM providers This question becomes increasingly relevant as LLM providers
continue to improve their tooling for reliably generating JSON. continue to improve their tooling for reliably generating JSON.
For example, For example,
[OpenAI recently announced](https://openai.com/index/introducing-structured-outputs-in-the-api/) [OpenAI recently announced](https://openai.com/index/introducing-structured-outputs-in-the-api/)
@ -70,10 +57,9 @@ the ability to
strictly enforce that JSON responses will be syntactically correct strictly enforce that JSON responses will be syntactically correct
and conform to a specified schema. and conform to a specified schema.
But just producing valid JSON is not sufficient for AI code generation --
But producing valid (schema compliant) JSON is not sufficient for working with AI generated code. the code inside the JSON matters too.
The code inside the JSON has to correctly solve the requested task It has to be high quality code that solves the assigned coding task without errors or bugs.
and be free from syntax errors.
Unfortunately, Unfortunately,
LLMs write worse code when they're asked to LLMs write worse code when they're asked to
wrap it in JSON. wrap it in JSON.
@ -92,7 +78,7 @@ own escape sequences.
Would *you* write better code by Would *you* write better code by
typing it out normally typing it out normally
or as a properly escaped or typing it as a properly escaped
JSON string? JSON string?
@ -103,28 +89,30 @@ showed
the superiority of returning code the superiority of returning code
as plain text compared to JSON-wrapped function calls. as plain text compared to JSON-wrapped function calls.
Those results were obtained Those results were obtained
over a year ago, against far less over a year ago, against models far less capable than those available today.
capable models. OpenAI's newly announced support for "strict" JSON
OpenAI's newly announced support for "strict" JSON seemed like a good reason to suggests the possibility that modern models might be able
investigate whether the newest models are still handicapped by JSON-wrapping code. to return quality code inside a structured JSON response.
The results presented here were based on The results presented here are based on
the the
[aider "code editing" benchmark](/2023/07/02/benchmarks.html#the-benchmark) [aider "code editing" benchmark](/2023/07/02/benchmarks.html#the-benchmark)
of 133 practice exercises from the Exercism python repository. of 133 practice exercises from the Exercism python repository.
Models were The benchmark was simplified somewhat to focus on the differences between
restricted to a single attempt, plain text and JSON responses.
without a second try to fix errors as is normal in the aider benchmark. In particular, models were
restricted to a single attempt to solve each task
without a second try to fix errors.
The performance of each model was compared across different strategies for returning code: The performance of each model was compared across different strategies for returning code:
- **Markdown** -- the model returned the whole source code file in standard markdown triple-backtick fences. - **Markdown** -- the model returned the whole source code file in standard markdown triple-backtick fences.
- **JSON** -- the model used a tool function call to return the whole source code file. This required the LLM to wrap the code in JSON. - **JSON** -- the model used a tool function call to return the whole source code file in a structured JSON response.
- **JSON (strict)** -- the same as the "JSON" strategy, but with `strict=True`. Only gpt-4o-2024-08-06 supports this setting. - **JSON (strict)** -- the same as the "JSON" strategy, but with `strict=True`. Only gpt-4o-2024-08-06 supported this setting.
The markdown strategy is the same as The markdown strategy was the same as
aider's "whole" edit format, where the aider's "whole" edit format, where the
LLM returns a source file like this: LLM returns an entire updated copy of the source file like this:
```` ````
Here is the program you asked for which prints "Hello": Here is the program you asked for which prints "Hello":
@ -136,9 +124,10 @@ def greeting():
``` ```
```` ````
The JSON and JSON (strict) strategies required the LLM to call the `write_file` function with Both JSON strategies required the LLM to call the `write_file` function with
two parameters, as shown below. an explanation/plan and
For maximum simplicity, the LLM didn't have to specify the filename, the entire updated copy of the source file.
The LLM didn't have to specify the filename,
since the benchmark operates on one source file at a time. since the benchmark operates on one source file at a time.
```json ```json
@ -148,16 +137,7 @@ since the benchmark operates on one source file at a time.
} }
``` ```
These strategies avoid actually *editing* source files, to keep This experimental setup was designed to quantify
the task as
simple as possible.
The LLM is able to emit the whole source file intact,
which is much easier
than correctly formulating
instructions to edit
portions of a file.
This experimental setup is designed to quantify
the effects of JSON-wrapping on the LLMs ability to write code to solve a task. the effects of JSON-wrapping on the LLMs ability to write code to solve a task.
## Results ## Results
@ -172,38 +152,60 @@ to assess the impact of JSON-wrapping code:
Each combination of model and code wrapping strategy was benchmarked 5 times. Each combination of model and code wrapping strategy was benchmarked 5 times.
## Overall coding skill ### Overall coding skill
As shown in Figure 1, As shown in Figure 1,
all of the models did worse on the benchmark when asked to all of the models did worse on the benchmark when asked to
return JSON-wrapped code in a tool function call. return code in a structured JSON response.
Most did significantly worse, performing far below Most did significantly worse, performing well below
the result obtained with the markdown strategy. their result with the markdown strategy.
Some noteworthy observations: Some noteworthy observations:
- OpenAI's gpt-4o-2024-05-13 was the only model where the markdown and JSON results were - OpenAI's gpt-4o-2024-05-13 was the only model where the markdown and JSON results were
close. Using JSON only dropped the score by 0.3 percent, a difference which is close. Using JSON only dropped the score by 0.4 percent, a difference which is
within the margin of error for 5 trials. within the margin of error for 5 trials.
- The use of OpenAI's new strict mode offered no improvement - The use of OpenAI's new strict mode offered no improvement
as compared to non-strict JSON. as compared to non-strict JSON.
Of course, both JSON results were well below the markdown result. Both JSON results were well below the markdown result.
- The results from Sonnet and DeepSeek Coder suffered the worst harm from JSON wrapping. - The results from Sonnet and DeepSeek Coder suffered the worst harm from JSON wrapping.
## Syntax errors ### Syntax errors
Models tend to make more syntax errors when asked to wrap code in JSON. Models tend to make more syntax errors when asked to wrap code in JSON.
Figure 2 shows the number of syntax errors found in the code produced by each Figure 2 shows the number of syntax errors found in the code produced by each
model and code wrapping strategy, model and code wrapping strategy.
totaling up `SyntaxError` and `IndentationError` errors from all 5 runs. It totals up the `SyntaxError` and `IndentationError` errors from all 5 runs,
for each model and strategy combination.
Below is an example of a `SyntaxError` created by gpt-4o-2024-05-13 using the
JSON code wrapping strategy.
It appears that the model got confused about escaping and quoting while trying
to format the JSON response.
```python
Traceback (most recent call last):
...
File "bottle-song/bottle_song.py", line 9
lyrics.append(f'There'll be {i - 1} green bottles hanging on the wall.')
^
SyntaxError: unterminated string literal (detected at line 9)
```
The problematic line of code contains a single-quoted string which also
contains a single-quote character.
It should have been output as the following chunk of JSON, with
a doubled backslash in `There\\'ll`.
That is needed to JSON-escape the `\` so that it survives
JSON-decoding to
produce `There\'ll` in the resulting code.
That would correctly escape the single-quote inside the single-quoted string.
```
...lyrics.append(f'There\\'ll be {i - 1} green bottles hanging on the wall.')\n...
```
Sonnet's results seem to indicate that the negative effects of JSON-wrapping
go beyond syntactic difficulties.
Sonnet avoided syntax errors regardless of the code wrapping strategy,
but its benchmark scores in Figure 1 were nonetheless lower with JSON.
This implies that JSON-wrapping may distract or challenge models in a way that
reduces their ability to reason about solving coding problems.
{% include code-in-json-syntax.js %} {% include code-in-json-syntax.js %}
@ -211,23 +213,36 @@ reduces their ability to reason about solving coding problems.
> totaled from 5 runs. > totaled from 5 runs.
> Models tend to make more syntax and formatting errors when asked to wrap code in JSON. > Models tend to make more syntax and formatting errors when asked to wrap code in JSON.
### Beyond syntax errors
Sonnet's results seem to indicate that the negative effects of JSON-wrapping
go beyond just syntactic difficulties.
Sonnet avoided syntax errors regardless of the code wrapping strategy,
but its benchmark scores in Figure 1 were nonetheless lower with JSON.
This implies that JSON-wrapping may distract or challenge models in a way that
reduces their ability to reason about solving coding problems.
## Conclusions ## Conclusions
While the quantitative results differ from the similar While the specific results differ from the similar
[July 2023 experiments](/2023/07/02/benchmarks.html), [July 2023 experiments](/2023/07/02/benchmarks.html),
the conclusion seems unchanged: LLMs are bad at returning code in JSON. the conclusion remains unchanged: LLMs are bad at returning code in
structured JSON responses.
OpenAI appears to be making progress in allowing LLMs to return code in OpenAI appears to be making progress in allowing LLMs to
structured JSON responses without harming the code quality. return JSON-wrapped code
But it still seems premature to consider switching from plain text without harming the code quality.
to JSON-wrapped code. But it seems premature to consider switching from plain text
to JSON-wrapped code at this time.
---------
## Notes on the aider leaderboard #### Notes on the aider leaderboard
The results presented here are not directly comparable to results *The results presented here are not directly comparable to results
from the main from the main
[aider LLM leaderboard](https://aider.chat/docs/leaderboards/). [aider LLM leaderboard](https://aider.chat/docs/leaderboards/).
A number of settings were changed to simplify the benchmark A number of settings were changed to simplify the benchmark
in order to focus on comparing plain text and JSON-wrapped code. in order to focus on comparing plain text and JSON-wrapped code.*

Binary file not shown.

After

Width:  |  Height:  |  Size: 158 KiB

View file

@ -27,7 +27,7 @@ The json file should be a dictionary with an entry for each model, as follows:
``` ```
{ {
"deepseek-chat": { "deepseek/deepseek-chat": {
"max_tokens": 4096, "max_tokens": 4096,
"max_input_tokens": 32000, "max_input_tokens": 32000,
"max_output_tokens": 4096, "max_output_tokens": 4096,
@ -42,6 +42,11 @@ The json file should be a dictionary with an entry for each model, as follows:
See See
[litellm's model_prices_and_context_window.json file](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) for more examples. [litellm's model_prices_and_context_window.json file](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) for more examples.
{: .tip }
Use a fully qualified model name with a `provider/` at the front
in the `.aider.model.metadata.json` file.
For example, use `deepseek/deepseek-chat`, not just `deepseek-chat`.
## Model settings ## Model settings
Aider has a number of settings that control how it works with Aider has a number of settings that control how it works with

View file

@ -1,3 +1,4 @@
import json
import os import os
import subprocess import subprocess
import tempfile import tempfile
@ -489,3 +490,27 @@ class TestMain(TestCase):
self.assertIn(real_external_file_path, coder.abs_read_only_fnames) self.assertIn(real_external_file_path, coder.abs_read_only_fnames)
finally: finally:
os.unlink(external_file_path) os.unlink(external_file_path)
def test_model_metadata_file(self):
    """Verify that --model-metadata-file overrides model info (max_input_tokens)."""
    with GitTemporaryDirectory():
        # Metadata keys must be fully qualified model names: provider/model
        # (e.g. "deepseek/deepseek-chat", not just "deepseek-chat").
        md_path = Path(".aider.model.metadata.json")
        md_path.write_text(json.dumps({"deepseek/deepseek-chat": {"max_input_tokens": 1234}}))

        argv = [
            "--model",
            "deepseek/deepseek-chat",
            "--model-metadata-file",
            str(md_path),
            "--exit",
            "--yes",
        ]
        coder = main(argv, input=DummyInput(), output=DummyOutput(), return_coder=True)

        # The override from the metadata file must land in the model's info dict.
        self.assertEqual(coder.main_model.info["max_input_tokens"], 1234)