mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-29 16:54:59 +00:00
feat: Add --thinking-tokens option to benchmark script
This commit is contained in:
parent
20a29e5cd1
commit
1a4d3927e7
1 changed file with 16 additions and 3 deletions
|
@ -209,6 +209,9 @@ def main(
|
||||||
reasoning_effort: Optional[str] = typer.Option(
|
reasoning_effort: Optional[str] = typer.Option(
|
||||||
None, "--reasoning-effort", help="Set reasoning effort for models that support it"
|
None, "--reasoning-effort", help="Set reasoning effort for models that support it"
|
||||||
),
|
),
|
||||||
|
thinking_tokens: Optional[int] = typer.Option(
|
||||||
|
None, "--thinking-tokens", help="Set thinking tokens for models that support it"
|
||||||
|
),
|
||||||
exercises_dir: str = typer.Option(
|
exercises_dir: str = typer.Option(
|
||||||
EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files"
|
EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files"
|
||||||
),
|
),
|
||||||
|
@ -366,6 +369,7 @@ def main(
|
||||||
num_ctx,
|
num_ctx,
|
||||||
sleep,
|
sleep,
|
||||||
reasoning_effort,
|
reasoning_effort,
|
||||||
|
thinking_tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
all_results.append(results)
|
all_results.append(results)
|
||||||
|
@ -391,6 +395,7 @@ def main(
|
||||||
num_ctx,
|
num_ctx,
|
||||||
sleep,
|
sleep,
|
||||||
reasoning_effort,
|
reasoning_effort,
|
||||||
|
thinking_tokens,
|
||||||
)
|
)
|
||||||
all_results = run_test_threaded.gather(tqdm=True)
|
all_results = run_test_threaded.gather(tqdm=True)
|
||||||
|
|
||||||
|
@ -489,6 +494,7 @@ def summarize_results(dirname, stats_languages=None):
|
||||||
res.lazy_comments = 0
|
res.lazy_comments = 0
|
||||||
|
|
||||||
res.reasoning_effort = None
|
res.reasoning_effort = None
|
||||||
|
res.thinking_tokens = None
|
||||||
variants = defaultdict(set)
|
variants = defaultdict(set)
|
||||||
|
|
||||||
for results in all_results:
|
for results in all_results:
|
||||||
|
@ -518,6 +524,7 @@ def summarize_results(dirname, stats_languages=None):
|
||||||
res.indentation_errors += results.get("indentation_errors", 0)
|
res.indentation_errors += results.get("indentation_errors", 0)
|
||||||
|
|
||||||
res.reasoning_effort = results.get("reasoning_effort")
|
res.reasoning_effort = results.get("reasoning_effort")
|
||||||
|
res.thinking_tokens = results.get("thinking_tokens")
|
||||||
|
|
||||||
for key in "model edit_format commit_hash editor_model editor_edit_format".split():
|
for key in "model edit_format commit_hash editor_model editor_edit_format".split():
|
||||||
val = results.get(key)
|
val = results.get(key)
|
||||||
|
@ -564,6 +571,8 @@ def summarize_results(dirname, stats_languages=None):
|
||||||
|
|
||||||
if res.reasoning_effort is not None:
|
if res.reasoning_effort is not None:
|
||||||
print(f" reasoning_effort: {res.reasoning_effort}")
|
print(f" reasoning_effort: {res.reasoning_effort}")
|
||||||
|
if res.thinking_tokens is not None:
|
||||||
|
print(f" thinking_tokens: {res.thinking_tokens}")
|
||||||
|
|
||||||
for i in range(tries):
|
for i in range(tries):
|
||||||
print(f" pass_rate_{i + 1}: {percents[i]:.1f}")
|
print(f" pass_rate_{i + 1}: {percents[i]:.1f}")
|
||||||
|
@ -650,15 +659,14 @@ def get_replayed_content(replay_dname, test_dname):
|
||||||
def run_test(original_dname, testdir, *args, **kwargs):
|
def run_test(original_dname, testdir, *args, **kwargs):
|
||||||
try:
|
try:
|
||||||
return run_test_real(original_dname, testdir, *args, **kwargs)
|
return run_test_real(original_dname, testdir, *args, **kwargs)
|
||||||
except Exception as err:
|
except Exception:
|
||||||
print("=" * 40)
|
print("=" * 40)
|
||||||
print("Test failed")
|
print("Test failed")
|
||||||
print(err)
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
testdir = Path(testdir)
|
testdir = Path(testdir)
|
||||||
results_fname = testdir / ".aider.results.json"
|
results_fname = testdir / ".aider.results.json"
|
||||||
results_fname.write_text(json.dumps(dict(exception=str(err))))
|
results_fname.write_text(json.dumps(dict(exception=traceback.format_exc())))
|
||||||
|
|
||||||
|
|
||||||
def run_test_real(
|
def run_test_real(
|
||||||
|
@ -677,6 +685,7 @@ def run_test_real(
|
||||||
num_ctx=None,
|
num_ctx=None,
|
||||||
sleep=0,
|
sleep=0,
|
||||||
reasoning_effort: Optional[str] = None,
|
reasoning_effort: Optional[str] = None,
|
||||||
|
thinking_tokens: Optional[int] = None,
|
||||||
read_model_settings=None,
|
read_model_settings=None,
|
||||||
):
|
):
|
||||||
if not os.path.isdir(testdir):
|
if not os.path.isdir(testdir):
|
||||||
|
@ -787,6 +796,9 @@ def run_test_real(
|
||||||
if reasoning_effort is not None:
|
if reasoning_effort is not None:
|
||||||
main_model.set_reasoning_effort(reasoning_effort)
|
main_model.set_reasoning_effort(reasoning_effort)
|
||||||
|
|
||||||
|
if thinking_tokens is not None:
|
||||||
|
main_model.set_thinking_tokens(thinking_tokens)
|
||||||
|
|
||||||
dump(main_model.max_chat_history_tokens)
|
dump(main_model.max_chat_history_tokens)
|
||||||
|
|
||||||
if num_ctx:
|
if num_ctx:
|
||||||
|
@ -938,6 +950,7 @@ def run_test_real(
|
||||||
indentation_errors=indentation_errors,
|
indentation_errors=indentation_errors,
|
||||||
lazy_comments=lazy_comments, # Add the count of pattern matches to the results
|
lazy_comments=lazy_comments, # Add the count of pattern matches to the results
|
||||||
reasoning_effort=reasoning_effort,
|
reasoning_effort=reasoning_effort,
|
||||||
|
thinking_tokens=thinking_tokens,
|
||||||
chat_hashes=list(
|
chat_hashes=list(
|
||||||
zip(
|
zip(
|
||||||
coder.chat_completion_call_hashes,
|
coder.chat_completion_call_hashes,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue