Feat: Add --reasoning-effort switch to benchmark script

2025-05-29 00:35:00 +00:00 · 2025-04-17 20:01:26 -07:00 · 2025-04-17 20:01:26 -07:00 · 8e689d35af
commit 8e689d35af
parent 50fd544070
1 changed file with 17 additions and 0 deletions
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -206,6 +206,9 @@ def main(
    read_model_settings: str = typer.Option(
        None, "--read-model-settings", help="Load aider model settings from YAML file"
    ),
+    reasoning_effort: Optional[float] = typer.Option(
+        None, "--reasoning-effort", help="Set reasoning effort for models that support it"
+    ),
    exercises_dir: str = typer.Option(
        EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files"
    ),
@@ -362,6 +365,7 @@ def main(
                editor_edit_format,
                num_ctx,
                sleep,
+                reasoning_effort,
            )

            all_results.append(results)
@@ -384,6 +388,7 @@ def main(
                replay,
                editor_model,
                editor_edit_format,
+                reasoning_effort,
            )
        all_results = run_test_threaded.gather(tqdm=True)

@@ -481,6 +486,7 @@ def summarize_results(dirname, stats_languages=None):
    res.indentation_errors = 0
    res.lazy_comments = 0

+    res.reasoning_effort = None
    variants = defaultdict(set)

    for results in all_results:
@@ -509,7 +515,10 @@ def summarize_results(dirname, stats_languages=None):
        res.syntax_errors += results.get("syntax_errors", 0)
        res.indentation_errors += results.get("indentation_errors", 0)

+        res.reasoning_effort = results.get("reasoning_effort")
+
        for key in "model edit_format commit_hash editor_model editor_edit_format".split():
+
            val = results.get(key)
            if val:
                variants[key].add(val)
@@ -552,6 +561,9 @@ def summarize_results(dirname, stats_languages=None):
        setattr(res, key, val)
        console.print(f"  {key}: {val}", style=style)

+    if res.reasoning_effort is not None:
+        print(f"  reasoning_effort: {res.reasoning_effort}")
+
    for i in range(tries):
        print(f"  pass_rate_{i + 1}: {percents[i]:.1f}")
    for i in range(tries):
@@ -663,6 +675,7 @@ def run_test_real(
    editor_edit_format,
    num_ctx=None,
    sleep=0,
+    reasoning_effort=None,
    read_model_settings=None,
 ):
    if not os.path.isdir(testdir):
@@ -769,6 +782,9 @@ def run_test_real(
        editor_edit_format=editor_edit_format,
    )

+    if reasoning_effort is not None:
+        main_model.set_reasoning_effort(reasoning_effort)
+
    dump(main_model.max_chat_history_tokens)

    if num_ctx:
@@ -919,6 +935,7 @@ def run_test_real(
        syntax_errors=syntax_errors,
        indentation_errors=indentation_errors,
        lazy_comments=lazy_comments,  # Add the count of pattern matches to the results
+        reasoning_effort=reasoning_effort,
        chat_hashes=list(
            zip(
                coder.chat_completion_call_hashes,