feat: Add new benchmark test case for qwen-2.5-72b-instruct-diff model

2025-06-02 18:54:59 +00:00 · 2024-09-20 13:27:58 -07:00 · 2024-09-20 13:27:58 -07:00 · 2753ac6b62
commit 2753ac6b62
parent 5139594fa0
2 changed files with 16 additions and 16 deletions
--- a/aider/website/_data/edit_leaderboard.yml
+++ b/aider/website/_data/edit_leaderboard.yml
@@ -1201,26 +1201,26 @@
  versions: 0.56.0
  seconds_per_case: 9.3
  total_cost: 0.0000
-
-- dirname: 2024-09-20-20-07-27--qwen-2.5-72b-instruct
+  
+- dirname: 2024-09-20-20-20-19--qwen-2.5-72b-instruct-diff
  test_cases: 133
  model: qwen-2.5-72b-instruct (bf16)
-  edit_format: whole
-  commit_hash: d7051ce
-  pass_rate_1: 52.6
-  pass_rate_2: 64.7
-  percent_cases_well_formed: 100.0
-  error_outputs: 0
-  num_malformed_responses: 0
-  num_with_malformed_responses: 0
-  user_asks: 2
+  edit_format: diff
+  commit_hash: 5139594
+  pass_rate_1: 53.4
+  pass_rate_2: 65.4
+  percent_cases_well_formed: 96.2
+  error_outputs: 9
+  num_malformed_responses: 9
+  num_with_malformed_responses: 5
+  user_asks: 3
  lazy_comments: 0
-  syntax_errors: 0
-  indentation_errors: 0
+  syntax_errors: 2
+  indentation_errors: 1
  exhausted_context_windows: 0
-  test_timeouts: 0
+  test_timeouts: 3
  command: aider --model openrouter/qwen/qwen-2.5-72b-instruct
  date: 2024-09-20
  versions: 0.56.1.dev
-  seconds_per_case: 53.9
+  seconds_per_case: 39.8
  total_cost: 0.0000
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -549,7 +549,7 @@ def run_test_real(
        chat_history_file=history_fname,
    )

-    main_model = models.Model(model_name)
+    main_model = models.Model(model_name, weak_model=model_name)
    edit_format = edit_format or main_model.edit_format

    dump(main_model)