This commit is contained in:
Paul Gauthier 2024-08-14 16:50:14 -07:00
parent 7310f0928f
commit b3ed2c8a48
2 changed files with 62 additions and 61 deletions

View file

@ -1,3 +1,25 @@
- dirname: 2024-08-14-18-26-18--json-gpt-4o-2024-08-06-whole
test_cases: 133
model: gpt-4o-2024-08-06
edit_format: Markdown
commit_hash: 94a2601-dirty
pass_rate_1: 62.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model gpt-4o-2024-08-06
date: 2024-08-14
versions: 0.50.2-dev
seconds_per_case: 6.8
total_cost: 1.2717
- dirname: 2024-08-14-18-38-25--json-gpt-4o-2024-08-06-non-strict-func - dirname: 2024-08-14-18-38-25--json-gpt-4o-2024-08-06-non-strict-func
test_cases: 133 test_cases: 133
model: gpt-4o-2024-08-06 model: gpt-4o-2024-08-06
@ -42,53 +64,9 @@
seconds_per_case: 12.7 seconds_per_case: 12.7
total_cost: 1.3652 total_cost: 1.3652
- dirname: 2024-08-14-18-26-18--json-gpt-4o-2024-08-06-whole
test_cases: 133
model: gpt-4o-2024-08-06
edit_format: Markdown
commit_hash: 94a2601-dirty
pass_rate_1: 62.4
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 0
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
command: aider --model gpt-4o-2024-08-06
date: 2024-08-14
versions: 0.50.2-dev
seconds_per_case: 6.8
total_cost: 1.2717
- dirname: 2024-08-14-20-19-23--json-sonnet-non-strict-func
test_cases: 133
model: openrouter/anthropic/claude-3.5-sonnet
edit_format: Tool call
commit_hash: e2f14a2
pass_rate_1: 52.6
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 1
lazy_comments: 0
syntax_errors: 1
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model openrouter/anthropic/claude-3.5-sonnet
date: 2024-08-14
versions: 0.50.2-dev
seconds_per_case: 18.9
total_cost: 2.6341
- dirname: 2024-08-14-20-15-19--json-sonnet-whole - dirname: 2024-08-14-20-15-19--json-sonnet-whole
test_cases: 133 test_cases: 133
model: openrouter/anthropic/claude-3.5-sonnet model: claude-3.5-sonnet
edit_format: Markdown edit_format: Markdown
commit_hash: e2f14a2 commit_hash: e2f14a2
pass_rate_1: 58.6 pass_rate_1: 58.6
@ -102,37 +80,37 @@
indentation_errors: 0 indentation_errors: 0
exhausted_context_windows: 0 exhausted_context_windows: 0
test_timeouts: 0 test_timeouts: 0
command: aider --model openrouter/anthropic/claude-3.5-sonnet command: aider --model claude-3.5-sonnet
date: 2024-08-14 date: 2024-08-14
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 19.7 seconds_per_case: 19.7
total_cost: 2.5335 total_cost: 2.5335
- dirname: 2024-08-14-21-20-46--json-deepseek-non-strict-func - dirname: 2024-08-14-20-19-23--json-sonnet-non-strict-func
test_cases: 133 test_cases: 133
model: openrouter/deepseek/deepseek-coder model: claude-3.5-sonnet
edit_format: Tool call edit_format: Tool call
commit_hash: e2f14a2 commit_hash: e2f14a2
pass_rate_1: 54.1 pass_rate_1: 52.6
percent_cases_well_formed: 100.0 percent_cases_well_formed: 100.0
error_outputs: 9 error_outputs: 1
num_malformed_responses: 0 num_malformed_responses: 0
num_with_malformed_responses: 0 num_with_malformed_responses: 0
user_asks: 5 user_asks: 1
lazy_comments: 0 lazy_comments: 0
syntax_errors: 2 syntax_errors: 1
indentation_errors: 0 indentation_errors: 0
exhausted_context_windows: 0 exhausted_context_windows: 0
test_timeouts: 0 test_timeouts: 0
command: aider --model openrouter/deepseek/deepseek-coder command: aider --model claude-3.5-sonnet
date: 2024-08-14 date: 2024-08-14
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 17.4 seconds_per_case: 18.9
total_cost: 0.0332 total_cost: 2.6341
- dirname: 2024-08-14-21-23-27--json-deepseek-whole - dirname: 2024-08-14-21-23-27--json-deepseek-whole
test_cases: 133 test_cases: 133
model: openrouter/deepseek/deepseek-coder model: deepseek-coder
edit_format: Markdown edit_format: Markdown
commit_hash: e2f14a2 commit_hash: e2f14a2
pass_rate_1: 61.7 pass_rate_1: 61.7
@ -146,9 +124,31 @@
indentation_errors: 0 indentation_errors: 0
exhausted_context_windows: 0 exhausted_context_windows: 0
test_timeouts: 1 test_timeouts: 1
command: aider --model openrouter/deepseek/deepseek-coder command: aider --model deepseek-coder
date: 2024-08-14 date: 2024-08-14
versions: 0.50.2-dev versions: 0.50.2-dev
seconds_per_case: 23.0 seconds_per_case: 23.0
total_cost: 0.0439 total_cost: 0.0439
- dirname: 2024-08-14-21-20-46--json-deepseek-non-strict-func
test_cases: 133
model: deepseek-coder
edit_format: Tool call
commit_hash: e2f14a2
pass_rate_1: 54.1
percent_cases_well_formed: 100.0
error_outputs: 9
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 5
lazy_comments: 0
syntax_errors: 2
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
command: aider --model deepseek-coder
date: 2024-08-14
versions: 0.50.2-dev
seconds_per_case: 17.4
total_cost: 0.0332

View file

@ -9,6 +9,9 @@ nav_exclude: true
<p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p> <p class="post-date">{{ page.date | date: "%B %d, %Y" }}</p>
{% endif %} {% endif %}
# LLMs are bad at returning code in json
<canvas id="passRateChart" width="800" height="400" style="margin-bottom: 20px"></canvas> <canvas id="passRateChart" width="800" height="400" style="margin-bottom: 20px"></canvas>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script> <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
@ -55,13 +58,13 @@ document.addEventListener('DOMContentLoaded', function () {
display: true, display: true,
text: 'Pass Rate (%)' text: 'Pass Rate (%)'
}, },
max: 100 max: 70
} }
}, },
plugins: { plugins: {
title: { title: {
display: true, display: true,
text: 'Pass Rate by Model and Edit Format', text: 'Pass rate by model and code return strategy',
font: { font: {
size: 16 size: 16
} }
@ -77,8 +80,6 @@ document.addEventListener('DOMContentLoaded', function () {
}); });
</script> </script>
# LLMs are bad at returning code in json
A lot of people wonder why aider doesn't have LLMs use tools or function calls to A lot of people wonder why aider doesn't have LLMs use tools or function calls to
specify code edits. specify code edits.