aider/aider/website/_data/polyglot_leaderboard.yml

1226 lines
No EOL
29 KiB
YAML

- dirname: 2025-02-25-20-23-07--gemini-pro
test_cases: 225
model: Gemini 2.0 Pro exp-02-05
edit_format: whole
commit_hash: 2fccd47
pass_rate_1: 20.4
pass_rate_2: 35.6
pass_num_1: 46
pass_num_2: 80
percent_cases_well_formed: 100.0
error_outputs: 430
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 13
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 5
total_tests: 225
command: aider --model gemini/gemini-2.0-pro-exp-02-05
date: 2025-02-25
versions: 0.75.2.dev
seconds_per_case: 34.8
total_cost: 0.0000
- dirname: 2024-12-21-18-41-18--polyglot-gpt-4o-mini
test_cases: 225
model: gpt-4o-mini-2024-07-18
edit_format: whole
commit_hash: a755079-dirty
pass_rate_1: 0.9
pass_rate_2: 3.6
pass_num_1: 2
pass_num_2: 8
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 36
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model gpt-4o-mini-2024-07-18
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 17.3
total_cost: 0.3236
- dirname: 2025-01-17-19-44-33--sonnet-baseline-jan-17
test_cases: 225
model: claude-3-5-sonnet-20241022
edit_format: diff
commit_hash: 6451d59
pass_rate_1: 22.2
pass_rate_2: 51.6
pass_num_1: 50
pass_num_2: 116
percent_cases_well_formed: 99.6
error_outputs: 2
num_malformed_responses: 1
num_with_malformed_responses: 1
user_asks: 11
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 8
total_tests: 225
command: aider --model claude-3-5-sonnet-20241022
date: 2025-01-17
versions: 0.71.2.dev
seconds_per_case: 21.4
total_cost: 14.4063
- dirname: 2024-12-30-20-57-12--gpt-4o-2024-11-20-ex-as-sys
test_cases: 225
model: gpt-4o-2024-11-20
edit_format: diff
commit_hash: 09ee197-dirty
pass_rate_1: 4.9
pass_rate_2: 18.2
pass_num_1: 11
pass_num_2: 41
percent_cases_well_formed: 95.1
error_outputs: 12
num_malformed_responses: 12
num_with_malformed_responses: 11
user_asks: 53
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 12
total_tests: 225
command: aider --model gpt-4o-2024-11-20
date: 2024-12-30
versions: 0.70.1.dev
seconds_per_case: 12.1
total_cost: 6.7351
- dirname: 2024-12-30-20-44-54--gpt4o-ex-as-sys-clean-prompt
test_cases: 225
model: gpt-4o-2024-08-06
edit_format: diff
commit_hash: 09ee197-dirty
pass_rate_1: 4.9
pass_rate_2: 23.1
pass_num_1: 11
pass_num_2: 52
percent_cases_well_formed: 94.2
error_outputs: 21
num_malformed_responses: 21
num_with_malformed_responses: 13
user_asks: 65
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model gpt-4o-2024-08-06
date: 2024-12-30
versions: 0.70.1.dev
seconds_per_case: 16.0
total_cost: 7.0286
- dirname: 2024-12-21-19-23-03--polyglot-o1-hard-diff
test_cases: 224
model: o1-2024-12-17 (high)
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 23.7
pass_rate_2: 61.7
pass_num_1: 53
pass_num_2: 139
percent_cases_well_formed: 91.5
error_outputs: 25
num_malformed_responses: 24
num_with_malformed_responses: 19
user_asks: 16
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model openrouter/openai/o1
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 133.2
total_cost: 186.4958
- dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff
test_cases: 225
model: DeepSeek Chat V2.5
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 5.3
pass_rate_2: 17.8
pass_num_1: 12
pass_num_2: 40
percent_cases_well_formed: 92.9
error_outputs: 42
num_malformed_responses: 37
num_with_malformed_responses: 16
user_asks: 23
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 5
test_timeouts: 5
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 184.0
total_cost: 0.5101
- dirname: 2024-12-21-21-46-27--polyglot-haiku-diff
test_cases: 225
model: claude-3-5-haiku-20241022
edit_format: diff
commit_hash: a755079-dirty
pass_rate_1: 7.1
pass_rate_2: 28.0
pass_num_1: 16
pass_num_2: 63
percent_cases_well_formed: 91.1
error_outputs: 31
num_malformed_responses: 30
num_with_malformed_responses: 20
user_asks: 13
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 9
total_tests: 225
command: aider --model claude-3-5-haiku-20241022
date: 2024-12-21
versions: 0.69.2.dev
seconds_per_case: 31.8
total_cost: 6.0583
- dirname: 2024-12-22-13-22-32--polyglot-qwen-diff
test_cases: 225
model: Qwen2.5-Coder-32B-Instruct
edit_format: diff
commit_hash: 6d7e8be-dirty
pass_rate_1: 4.4
pass_rate_2: 8.0
pass_num_1: 10
pass_num_2: 18
percent_cases_well_formed: 71.6
error_outputs: 158
num_malformed_responses: 148
num_with_malformed_responses: 64
user_asks: 132
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 2
total_tests: 225
command: "aider --model openai/Qwen/Qwen2.5-Coder-32B-Instruct # via hyperbolic"
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 84.4
total_cost: 0.0000
- dirname: 2024-12-22-21-26-35--polyglot-o1mini-whole
test_cases: 225
model: o1-mini-2024-09-12
edit_format: whole
commit_hash: 37df899
pass_rate_1: 5.8
pass_rate_2: 32.9
pass_num_1: 13
pass_num_2: 74
percent_cases_well_formed: 96.9
error_outputs: 8
num_malformed_responses: 8
num_with_malformed_responses: 7
user_asks: 27
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model o1-mini
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 34.7
total_cost: 18.5770
- dirname: 2024-12-22-18-43-25--gemini-exp-1206-polyglot-whole-2
test_cases: 225
model: gemini-exp-1206
edit_format: whole
commit_hash: b1bc2f8
pass_rate_1: 19.6
pass_rate_2: 38.2
pass_num_1: 44
pass_num_2: 86
percent_cases_well_formed: 98.2
error_outputs: 8
num_malformed_responses: 8
num_with_malformed_responses: 4
user_asks: 32
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 9
total_tests: 225
command: aider --model gemini/gemini-exp-1206
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 45.5
total_cost: 0.0000
- dirname: 2024-12-22-20-08-13--gemini-2.0-flash-exp-polyglot-whole
test_cases: 225
model: gemini-2.0-flash-exp
edit_format: whole
commit_hash: b1bc2f8
pass_rate_1: 11.6
pass_rate_2: 22.2
pass_num_1: 26
pass_num_2: 50
percent_cases_well_formed: 100.0
error_outputs: 1
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 9
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 8
total_tests: 225
command: aider --model gemini/gemini-2.0-flash-exp
date: 2024-12-22
versions: 0.69.2.dev
seconds_per_case: 12.2
total_cost: 0.0000
- dirname: 2024-12-23-01-11-56--yi-test
test_cases: 225
model: yi-lightning
edit_format: whole
commit_hash: 2b1625e
pass_rate_1: 5.8
pass_rate_2: 12.9
pass_num_1: 13
pass_num_2: 29
percent_cases_well_formed: 92.9
error_outputs: 87
num_malformed_responses: 72
num_with_malformed_responses: 16
user_asks: 107
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 6
total_tests: 225
command: aider --model openai/yi-lightning
date: 2024-12-23
versions: 0.69.2.dev
seconds_per_case: 146.7
total_cost: 0.0000
- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2
test_cases: 225
model: DeepSeek Chat V3 (prev)
edit_format: diff
commit_hash: 0a23c4a-dirty
pass_rate_1: 22.7
pass_rate_2: 48.4
pass_num_1: 51
pass_num_2: 109
percent_cases_well_formed: 98.7
error_outputs: 7
num_malformed_responses: 7
num_with_malformed_responses: 3
user_asks: 19
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 8
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2024-12-25
versions: 0.69.2.dev
seconds_per_case: 34.8
total_cost: 0.3369
- dirname: 2024-12-26-00-55-20--Qwen2.5-Coder-32B-Instruct
test_cases: 225
model: Qwen2.5-Coder-32B-Instruct
edit_format: whole
commit_hash: b51768b0
pass_rate_1: 4.9
pass_rate_2: 16.4
pass_num_1: 11
pass_num_2: 37
percent_cases_well_formed: 99.6
error_outputs: 1
num_malformed_responses: 1
num_with_malformed_responses: 1
user_asks: 33
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 6
total_tests: 225
command: aider --model openai/Qwen2.5-Coder-32B-Instruct
date: 2024-12-26
versions: 0.69.2.dev
seconds_per_case: 42.0
total_cost: 0.0000
- dirname: 2025-01-13-18-17-25--codestral-whole2
test_cases: 225
model: Codestral 25.01
edit_format: whole
commit_hash: 0cba898-dirty
pass_rate_1: 4.0
pass_rate_2: 11.1
pass_num_1: 9
pass_num_2: 25
percent_cases_well_formed: 100.0
error_outputs: 0
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 47
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 4
total_tests: 225
command: aider --model mistral/codestral-latest
date: 2025-01-13
versions: 0.71.2.dev
seconds_per_case: 9.3
total_cost: 1.9834
- dirname: 2025-01-20-19-11-38--ds-turns-upd-cur-msgs-fix-with-summarizer
test_cases: 225
model: DeepSeek R1
edit_format: diff
commit_hash: 5650697-dirty
pass_rate_1: 26.7
pass_rate_2: 56.9
pass_num_1: 60
pass_num_2: 128
percent_cases_well_formed: 96.9
error_outputs: 8
num_malformed_responses: 7
num_with_malformed_responses: 7
user_asks: 15
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 5
total_tests: 225
command: aider --model deepseek/deepseek-reasoner
date: 2025-01-20
versions: 0.71.2.dev
seconds_per_case: 113.7
total_cost: 5.4193
- dirname: 2025-01-23-19-14-48--r1-architect-sonnet
test_cases: 225
model: DeepSeek R1 + claude-3-5-sonnet-20241022
edit_format: architect
commit_hash: 05a77c7
editor_model: claude-3-5-sonnet-20241022
editor_edit_format: editor-diff
pass_rate_1: 27.1
pass_rate_2: 64.0
pass_num_1: 61
pass_num_2: 144
percent_cases_well_formed: 100.0
error_outputs: 2
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 392
lazy_comments: 6
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 5
total_tests: 225
command: aider --architect --model r1 --editor-model sonnet
date: 2025-01-23
versions: 0.72.3.dev
seconds_per_case: 251.6
total_cost: 13.2933
- dirname: 2025-01-28-16-00-03--qwen-max-2025-01-25-polyglot-diff
test_cases: 225
model: qwen-max-2025-01-25
edit_format: diff
commit_hash: ae7d459
pass_rate_1: 9.3
pass_rate_2: 21.8
pass_num_1: 21
pass_num_2: 49
percent_cases_well_formed: 90.2
error_outputs: 46
num_malformed_responses: 44
num_with_malformed_responses: 22
user_asks: 23
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 9
total_tests: 225
command: OPENAI_API_BASE=https://dashscope-intl.aliyuncs.com/compatible-mode/v1 aider --model openai/qwen-max-2025-01-25
date: 2025-01-28
versions: 0.72.4.dev
seconds_per_case: 39.5
- dirname: 2025-01-31-20-27-46--o3-mini-diff2
test_cases: 225
model: o3-mini (medium)
edit_format: diff
commit_hash: 2fb517b-dirty
pass_rate_1: 19.1
pass_rate_2: 53.8
pass_num_1: 43
pass_num_2: 121
percent_cases_well_formed: 95.1
error_outputs: 28
num_malformed_responses: 28
num_with_malformed_responses: 11
user_asks: 17
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model o3-mini
date: 2025-01-31
versions: 0.72.4.dev
seconds_per_case: 47.2
total_cost: 8.8599
- dirname: 2025-01-31-20-42-47--o3-mini-diff-high
test_cases: 224
model: o3-mini (high)
edit_format: diff
commit_hash: b0d58d1-dirty
pass_rate_1: 21.0
pass_rate_2: 60.4
pass_num_1: 47
pass_num_2: 136
percent_cases_well_formed: 93.3
error_outputs: 26
num_malformed_responses: 24
num_with_malformed_responses: 15
user_asks: 19
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 7
total_tests: 225
command: aider --model o3-mini --reasoning-effort high
date: 2025-01-31
versions: 0.72.4.dev
seconds_per_case: 124.6
total_cost: 18.1584
- dirname: 2025-01-21-22-51-49--gemini-2.0-flash-thinking-exp-01-21-polyglot-diff
test_cases: 225
model: gemini-2.0-flash-thinking-exp-01-21
edit_format: diff
commit_hash: 843720a
pass_rate_1: 5.8
pass_rate_2: 18.2
pass_num_1: 13
pass_num_2: 41
percent_cases_well_formed: 77.8
error_outputs: 182
num_malformed_responses: 180
num_with_malformed_responses: 50
user_asks: 26
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 2
test_timeouts: 7
total_tests: 225
command: aider --model gemini/gemini-2.0-flash-thinking-exp-01-21
date: 2025-01-21
versions: 0.72.2.dev
seconds_per_case: 24.2
total_cost: 0.0000
- dirname: 2025-02-15-19-51-22--chatgpt4o-feb15-diff
test_cases: 223
model: chatgpt-4o-latest (2025-02-15)
edit_format: diff
commit_hash: 108ce18-dirty
pass_rate_1: 9.0
pass_rate_2: 27.1
pass_num_1: 20
pass_num_2: 61
percent_cases_well_formed: 93.3
error_outputs: 66
num_malformed_responses: 21
num_with_malformed_responses: 15
user_asks: 57
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model chatgpt-4o-latest
date: 2025-02-15
versions: 0.74.3.dev
seconds_per_case: 12.4
total_cost: 14.3703
- dirname: 2025-02-24-19-54-07--sonnet37-diff
test_cases: 225
model: claude-3-7-sonnet-20250219 (no thinking)
edit_format: diff
commit_hash: 75e9ee6
pass_rate_1: 24.4
pass_rate_2: 60.4
pass_num_1: 55
pass_num_2: 136
percent_cases_well_formed: 93.3
error_outputs: 16
num_malformed_responses: 16
num_with_malformed_responses: 15
user_asks: 12
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
total_tests: 225
command: aider --model sonnet
date: 2025-02-24
versions: 0.74.4.dev
seconds_per_case: 28.3
total_cost: 17.7191
- dirname: 2025-02-24-21-47-23--sonnet37-diff-think-32k-64k
test_cases: 225
model: claude-3-7-sonnet-20250219 (32k thinking tokens)
edit_format: diff
commit_hash: 60d11a6, 93edbda
pass_rate_1: 29.3
pass_rate_2: 64.9
pass_num_1: 66
pass_num_2: 146
percent_cases_well_formed: 97.8
error_outputs: 66
num_malformed_responses: 5
num_with_malformed_responses: 5
user_asks: 5
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 1
total_tests: 225
command: "aider --model anthropic/claude-3-7-sonnet-20250219 --thinking-tokens 32k"
date: 2025-02-24
versions: 0.75.1.dev
seconds_per_case: 105.2
total_cost: 36.8343
- dirname: 2025-02-27-20-26-15--gpt45-diff3
test_cases: 224
model: gpt-4.5-preview
edit_format: diff
commit_hash: b462e55-dirty
pass_rate_1: 22.3
pass_rate_2: 44.9
pass_num_1: 50
pass_num_2: 101
percent_cases_well_formed: 97.3
error_outputs: 10
num_malformed_responses: 8
num_with_malformed_responses: 6
user_asks: 15
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 2
total_tests: 225
command: aider --model openai/gpt-4.5-preview
date: 2025-02-27
versions: 0.75.2.dev
seconds_per_case: 113.5
total_cost: 183.1802
- dirname: 2025-03-06-17-40-24--qwq32b-diff-temp-topp-ex-sys-remind-user-for-real
test_cases: 225
model: QwQ-32B
edit_format: diff
commit_hash: 51d118f-dirty
pass_rate_1: 8.0
pass_rate_2: 20.9
pass_num_1: 18
pass_num_2: 47
percent_cases_well_formed: 67.6
error_outputs: 145
num_malformed_responses: 143
num_with_malformed_responses: 73
user_asks: 17
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 4
total_tests: 225
command: aider --model fireworks_ai/accounts/fireworks/models/qwq-32b
date: 2025-03-06
versions: 0.75.3.dev
seconds_per_case: 228.6
total_cost: 0.0000
- dirname: 2025-03-07-15-11-27--qwq32b-arch-temp-topp-again
test_cases: 225
model: QwQ-32B + Qwen 2.5 Coder Instruct
edit_format: architect
commit_hash: 52162a5
editor_model: fireworks_ai/accounts/fireworks/models/qwen2p5-coder-32b-instruct
editor_edit_format: editor-diff
pass_rate_1: 9.8
pass_rate_2: 26.2
pass_num_1: 22
pass_num_2: 59
percent_cases_well_formed: 100.0
error_outputs: 122
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 489
lazy_comments: 8
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 2
total_tests: 225
command: aider --model fireworks_ai/accounts/fireworks/models/qwq-32b --architect
date: 2025-03-07
versions: 0.75.3.dev
seconds_per_case: 137.4
total_cost: 0
- dirname: 2025-03-14-23-40-00--cmda-quality-whole2
test_cases: 225
model: command-a-03-2025-quality
edit_format: whole
commit_hash: a1aa63f
pass_rate_1: 2.2
pass_rate_2: 12.0
pass_num_1: 5
pass_num_2: 27
percent_cases_well_formed: 99.6
error_outputs: 2
num_malformed_responses: 1
num_with_malformed_responses: 1
user_asks: 215
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 4
total_tests: 225
command: OPENAI_API_BASE=https://api.cohere.ai/compatibility/v1 aider --model openai/command-a-03-2025-quality
date: 2025-03-14
versions: 0.77.1.dev
seconds_per_case: 85.1
total_cost: 0.0000
- dirname: 2025-03-15-01-21-24--gemma3-27b-or
test_cases: 225
model: gemma-3-27b-it
edit_format: whole
commit_hash: fd21f51-dirty
pass_rate_1: 1.8
pass_rate_2: 4.9
pass_num_1: 4
pass_num_2: 11
percent_cases_well_formed: 100.0
error_outputs: 3
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 181
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 3
total_tests: 225
command: aider --model openrouter/google/gemma-3-27b-it
date: 2025-03-15
versions: 0.77.1.dev
seconds_per_case: 79.7
total_cost: 0.0000
- dirname: 2025-03-24-15-41-33--deepseek-v3-0324-polyglot-diff
test_cases: 225
model: DeepSeek V3 (0324)
edit_format: diff
commit_hash: 502b863
pass_rate_1: 28.0
pass_rate_2: 55.1
pass_num_1: 63
pass_num_2: 124
percent_cases_well_formed: 99.6
error_outputs: 32
num_malformed_responses: 1
num_with_malformed_responses: 1
user_asks: 96
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 2
test_timeouts: 4
total_tests: 225
command: aider --model deepseek/deepseek-chat
date: 2025-03-24
versions: 0.78.1.dev
seconds_per_case: 290.0
total_cost: 1.1164
- dirname: 2025-04-12-04-55-50--gemini-25-pro-diff-fenced
test_cases: 225
model: Gemini 2.5 Pro Preview 03-25
edit_format: diff-fenced
commit_hash: 0282574
pass_rate_1: 40.9
pass_rate_2: 72.9
pass_num_1: 92
pass_num_2: 164
percent_cases_well_formed: 92.4
error_outputs: 21
num_malformed_responses: 21
num_with_malformed_responses: 17
user_asks: 69
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model gemini/gemini-2.5-pro-preview-03-25
date: 2025-04-12
versions: 0.81.3.dev
seconds_per_case: 45.3
total_cost: 6.3174
- dirname: 2025-03-29-05-24-55--chatgpt4o-mar28-diff
test_cases: 225
model: chatgpt-4o-latest (2025-03-29)
edit_format: diff
commit_hash: 0decbad
pass_rate_1: 16.4
pass_rate_2: 45.3
pass_num_1: 37
pass_num_2: 102
percent_cases_well_formed: 64.4
error_outputs: 85
num_malformed_responses: 85
num_with_malformed_responses: 80
user_asks: 174
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 4
total_tests: 225
command: aider --model chatgpt-4o-latest
date: 2025-03-29
versions: 0.79.3.dev
seconds_per_case: 10.3
total_cost: 19.7416
- dirname: 2025-04-04-02-57-25--qalpha-diff-exsys
test_cases: 225
model: Quasar Alpha
edit_format: diff
commit_hash: 8a34a6c-dirty
pass_rate_1: 21.8
pass_rate_2: 54.7
pass_num_1: 49
pass_num_2: 123
percent_cases_well_formed: 98.2
error_outputs: 4
num_malformed_responses: 4
num_with_malformed_responses: 4
user_asks: 187
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 4
total_tests: 225
command: aider --model openrouter/openrouter/quasar-alpha
date: 2025-04-04
versions: 0.80.5.dev
seconds_per_case: 14.8
total_cost: 0.0000
- dirname: 2025-04-06-08-39-52--llama-4-maverick-17b-128e-instruct-polyglot-whole
test_cases: 225
model: Llama 4 Maverick
edit_format: whole
commit_hash: 9445a31
pass_rate_1: 4.4
pass_rate_2: 15.6
pass_num_1: 10
pass_num_2: 35
percent_cases_well_formed: 99.1
error_outputs: 12
num_malformed_responses: 2
num_with_malformed_responses: 2
user_asks: 248
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 4
total_tests: 225
command: aider --model nvidia_nim/meta/llama-4-maverick-17b-128e-instruct
date: 2025-04-06
versions: 0.81.2.dev
seconds_per_case: 20.5
total_cost: 0.0000
- dirname: 2025-04-10-04-21-31--grok3-diff-exuser
test_cases: 225
model: Grok 3 Beta
edit_format: diff
commit_hash: 2dd40fc-dirty
pass_rate_1: 22.2
pass_rate_2: 53.3
pass_num_1: 50
pass_num_2: 120
percent_cases_well_formed: 99.6
error_outputs: 1
num_malformed_responses: 1
num_with_malformed_responses: 1
user_asks: 68
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model openrouter/x-ai/grok-3-beta
date: 2025-04-10
versions: 0.81.2.dev
seconds_per_case: 15.3
total_cost: 11.0338
- dirname: 2025-04-10-18-47-24--grok3-mini-whole-exuser
test_cases: 225
model: Grok 3 Mini Beta (low)
edit_format: whole
commit_hash: 14ffe77-dirty
pass_rate_1: 11.1
pass_rate_2: 34.7
pass_num_1: 25
pass_num_2: 78
percent_cases_well_formed: 100.0
error_outputs: 3
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 73
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 5
total_tests: 225
command: aider --model openrouter/x-ai/grok-3-mini-beta
date: 2025-04-10
versions: 0.81.2.dev
seconds_per_case: 35.1
total_cost: 0.7856
- dirname: 2025-04-10-23-59-02--xai-grok3-mini-whole-high
test_cases: 225
model: Grok 3 Mini Beta (high)
edit_format: whole
commit_hash: 8ee33da-dirty
pass_rate_1: 17.3
pass_rate_2: 49.3
pass_num_1: 39
pass_num_2: 111
percent_cases_well_formed: 99.6
error_outputs: 1
num_malformed_responses: 1
num_with_malformed_responses: 1
user_asks: 64
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
total_tests: 225
command: aider --model xai/grok-3-mini-beta --reasoning-effort high
date: 2025-04-10
versions: 0.81.3.dev
seconds_per_case: 79.1
total_cost: 0.7346
- dirname: 2025-04-10-19-02-44--oalpha-diff-exsys
test_cases: 225
model: Optimus Alpha
edit_format: diff
commit_hash: 532bc45-dirty
pass_rate_1: 21.3
pass_rate_2: 52.9
pass_num_1: 48
pass_num_2: 119
percent_cases_well_formed: 97.3
error_outputs: 7
num_malformed_responses: 6
num_with_malformed_responses: 6
user_asks: 182
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 3
total_tests: 225
command: aider --model openrouter/openrouter/optimus-alpha
date: 2025-04-10
versions: 0.81.2.dev
seconds_per_case: 18.4
total_cost: 0.0000
- dirname: 2025-04-14-21-05-54--gpt41-diff-exuser
test_cases: 225
model: gpt-4.1
edit_format: diff
commit_hash: 7a87db5-dirty
pass_rate_1: 20.0
pass_rate_2: 52.4
pass_num_1: 45
pass_num_2: 118
percent_cases_well_formed: 98.2
error_outputs: 6
num_malformed_responses: 5
num_with_malformed_responses: 4
user_asks: 171
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 5
total_tests: 225
command: aider --model gpt-4.1
date: 2025-04-14
versions: 0.81.4.dev
seconds_per_case: 20.5
total_cost: 9.8556
- dirname: 2025-04-14-21-27-53--gpt41mini-diff
test_cases: 225
model: gpt-4.1-mini
edit_format: diff
commit_hash: ffb743e-dirty
pass_rate_1: 11.1
pass_rate_2: 32.4
pass_num_1: 25
pass_num_2: 73
percent_cases_well_formed: 92.4
error_outputs: 64
num_malformed_responses: 62
num_with_malformed_responses: 17
user_asks: 159
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 2
test_timeouts: 2
total_tests: 225
command: aider --model gpt-4.1-mini
date: 2025-04-14
versions: 0.81.4.dev
seconds_per_case: 19.5
total_cost: 1.9918
- dirname: 2025-04-14-22-46-01--gpt41nano-diff
test_cases: 225
model: gpt-4.1-nano
edit_format: whole
commit_hash: 71d1591-dirty
pass_rate_1: 3.1
pass_rate_2: 8.9
pass_num_1: 7
pass_num_2: 20
percent_cases_well_formed: 94.2
error_outputs: 20
num_malformed_responses: 20
num_with_malformed_responses: 13
user_asks: 316
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 8
total_tests: 225
command: aider --model gpt-4.1-nano
date: 2025-04-14
versions: 0.81.4.dev
seconds_per_case: 12.0
total_cost: 0.4281
- dirname: 2025-04-16-21-20-55--o3-high-diff-temp0-exsys
test_cases: 225
model: o3 (high)
edit_format: diff
commit_hash: 24805ff-dirty
pass_rate_1: 36.9
pass_rate_2: 79.6
pass_num_1: 83
pass_num_2: 179
percent_cases_well_formed: 95.1
error_outputs: 11
num_malformed_responses: 11
num_with_malformed_responses: 11
user_asks: 110
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 2
total_tests: 225
command: aider --model o3
date: 2025-04-16
versions: 0.82.1.dev
seconds_per_case: 113.8
total_cost: 111.0325
- dirname: 2025-04-16-22-01-58--o4-mini-high-diff-exsys
test_cases: 225
model: o4-mini (high)
edit_format: diff
commit_hash: b66901f-dirty
pass_rate_1: 19.6
pass_rate_2: 72.0
pass_num_1: 44
pass_num_2: 162
percent_cases_well_formed: 90.7
error_outputs: 26
num_malformed_responses: 24
num_with_malformed_responses: 21
user_asks: 66
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 1
test_timeouts: 2
total_tests: 225
command: aider --model o4-mini
date: 2025-04-16
versions: 0.82.1.dev
seconds_per_case: 176.5
total_cost: 19.6399
- dirname: 2025-04-17-01-20-35--o3-mini-high-diff-arch
test_cases: 225
model: o3 (high) + gpt-4.1
edit_format: architect
commit_hash: 80909e1-dirty
editor_model: gpt-4.1
editor_edit_format: editor-diff
pass_rate_1: 36.0
pass_rate_2: 82.7
pass_num_1: 81
pass_num_2: 186
percent_cases_well_formed: 100.0
error_outputs: 9
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 166
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 0
total_tests: 225
command: aider --model o3 --architect
date: 2025-04-17
versions: 0.82.2.dev
seconds_per_case: 110.0
total_cost: 69.2921
- dirname: 2025-04-19-14-43-04--o4-mini-patch
test_cases: 225
model: openhands-lm-32b-v0.1
edit_format: whole
commit_hash: c08336f
pass_rate_1: 4.0
pass_rate_2: 10.2
pass_num_1: 9
pass_num_2: 23
percent_cases_well_formed: 95.1
error_outputs: 55
num_malformed_responses: 41
num_with_malformed_responses: 11
user_asks: 166
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 0
test_timeouts: 11
total_tests: 225
command: aider --model openrouter/all-hands/openhands-lm-32b-v0.1
date: 2025-04-19
versions: 0.82.2.dev
seconds_per_case: 195.6
total_cost: 0.0000
- dirname: 2025-04-20-19-54-31--flash25-diff-no-think
test_cases: 225
model: gemini-2.5-flash-preview-04-17 (default)
edit_format: diff
commit_hash: 7fcce5d-dirty
pass_rate_1: 21.8
pass_rate_2: 47.1
pass_num_1: 49
pass_num_2: 106
percent_cases_well_formed: 85.3
error_outputs: 60
num_malformed_responses: 55
num_with_malformed_responses: 33
user_asks: 82
lazy_comments: 1
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 5
test_timeouts: 4
total_tests: 225
command: aider --model gemini/gemini-2.5-flash-preview-04-17
date: 2025-04-20
versions: 0.82.3.dev
seconds_per_case: 50.1
total_cost: 1.8451