Feat: Add --reasoning-effort switch to benchmark script

This commit is contained in:
Paul Gauthier (aider) 2025-04-17 20:01:26 -07:00
parent 50fd544070
commit 8e689d35af

View file

@ -206,6 +206,9 @@ def main(
read_model_settings: str = typer.Option(
None, "--read-model-settings", help="Load aider model settings from YAML file"
),
reasoning_effort: Optional[float] = typer.Option(
None, "--reasoning-effort", help="Set reasoning effort for models that support it"
),
exercises_dir: str = typer.Option(
EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files"
),
@ -362,6 +365,7 @@ def main(
editor_edit_format,
num_ctx,
sleep,
reasoning_effort,
)
all_results.append(results)
@ -384,6 +388,7 @@ def main(
replay,
editor_model,
editor_edit_format,
reasoning_effort,
)
all_results = run_test_threaded.gather(tqdm=True)
@ -481,6 +486,7 @@ def summarize_results(dirname, stats_languages=None):
res.indentation_errors = 0
res.lazy_comments = 0
res.reasoning_effort = None
variants = defaultdict(set)
for results in all_results:
@ -509,7 +515,10 @@ def summarize_results(dirname, stats_languages=None):
res.syntax_errors += results.get("syntax_errors", 0)
res.indentation_errors += results.get("indentation_errors", 0)
res.reasoning_effort = results.get("reasoning_effort")
for key in "model edit_format commit_hash editor_model editor_edit_format".split():
val = results.get(key)
if val:
variants[key].add(val)
@ -552,6 +561,9 @@ def summarize_results(dirname, stats_languages=None):
setattr(res, key, val)
console.print(f" {key}: {val}", style=style)
if res.reasoning_effort is not None:
print(f" reasoning_effort: {res.reasoning_effort}")
for i in range(tries):
print(f" pass_rate_{i + 1}: {percents[i]:.1f}")
for i in range(tries):
@ -663,6 +675,7 @@ def run_test_real(
editor_edit_format,
num_ctx=None,
sleep=0,
reasoning_effort=None,
read_model_settings=None,
):
if not os.path.isdir(testdir):
@ -769,6 +782,9 @@ def run_test_real(
editor_edit_format=editor_edit_format,
)
if reasoning_effort is not None:
main_model.set_reasoning_effort(reasoning_effort)
dump(main_model.max_chat_history_tokens)
if num_ctx:
@ -919,6 +935,7 @@ def run_test_real(
syntax_errors=syntax_errors,
indentation_errors=indentation_errors,
lazy_comments=lazy_comments, # Add the count of pattern matches to the results
reasoning_effort=reasoning_effort,
chat_hashes=list(
zip(
coder.chat_completion_call_hashes,