Merge 3a93da8f8d into 3caab85931
commit e7b2514c07
2 changed files with 195 additions and 7 deletions
@@ -83,6 +83,21 @@ You can run `./benchmark/benchmark.py --help` for a list of all the arguments, b
 - `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
 - `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`).
 - `--read-model-settings=<filename.yml>` specify model settings, see here: https://aider.chat/docs/config/adv-model-settings.html#model-settings
+- `--resume` resume a previously paused benchmark run from its checkpoint
+- `--edit-format architect` run in architect mode, which uses two models: one to propose changes and another to implement them
+- `--editor-model` specify the model to use for implementing changes in architect mode
+- `--reasoning-effort` set reasoning effort for models that support it (e.g., "high", "medium", "low")
+
+### Pausing and Resuming Benchmarks
+
+Benchmarks can take a long time to run. You can pause a running benchmark by pressing `Ctrl+C` once. The benchmark will complete the current test and then save a checkpoint before exiting. To resume the benchmark later, use the `--resume` flag:
+
+```
+# Resume a previously paused benchmark
+./benchmark/benchmark.py YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run --resume --model gpt-3.5-turbo --edit-format whole --threads 10
+```
+
+When you resume a benchmark, it will pick up where it left off, using the list of pending tests from the checkpoint file. This allows you to run benchmarks over multiple sessions.
 
 ### Benchmark report
 
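For concreteness, the checkpoint mentioned above is a small JSON file named `.benchmark_checkpoint.json` inside the run's directory, holding a timestamp plus the lists of completed and pending tests (see the `save_checkpoint()`/`load_checkpoint()` helpers added further down in this diff). Below is a minimal sketch of the round trip using those same field names; the run directory and test names here are purely illustrative and not taken from the commit:

```python
import datetime
import json
from pathlib import Path

# Hypothetical run directory; real runs use a dated name like the one shown above.
run_dir = Path("tmp.example-benchmark-run")
run_dir.mkdir(exist_ok=True)
checkpoint_file = run_dir / ".benchmark_checkpoint.json"

# Shape written by save_checkpoint(): a timestamp plus two lists of test names.
checkpoint = {
    "timestamp": datetime.datetime.now().isoformat(),
    "completed_tests": ["python/exercises/practice/anagram"],  # illustrative
    "pending_tests": ["python/exercises/practice/bowling"],    # illustrative
}
checkpoint_file.write_text(json.dumps(checkpoint, indent=4, sort_keys=True, default=str))

# load_checkpoint() reads the same file back, falling back to empty lists if it is missing.
data = json.loads(checkpoint_file.read_text())
print(f"{len(data['completed_tests'])} completed, {len(data['pending_tests'])} pending")
```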
@@ -137,6 +152,24 @@ should be enough to reliably reproduce any benchmark run.
 
 You can see examples of the benchmark report yaml in the
 [aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
 
+### Running benchmarks in architect mode
+
+Architect mode uses two models: a main model that proposes changes and an editor model that implements them. This can be particularly useful for models that are good at reasoning but struggle with precise code edits.
+
+Here's an example of running a benchmark in architect mode:
+
+```
+./benchmark/benchmark.py grook-mini-architect-deepseek-editor --model openrouter/x-ai/grok-3-mini-beta --editor-model openrouter/deepseek/deepseek-chat-v3-0324 --edit-format architect --threads 15 --exercises-dir polyglot-benchmark --reasoning-effort high
+```
+
+In this example:
+
+- The main model is Grok-3-mini-beta (via OpenRouter)
+- The editor model is DeepSeek Chat v3 (via OpenRouter)
+- The edit format is set to "architect"
+- Reasoning effort is set to "high"
+- 15 threads are used for parallel processing
+
+When running in architect mode, the benchmark report will include additional information about the editor model used.
+
 ## Limitations, notes
 
@@ -5,6 +5,7 @@ import os
 import random
 import re
 import shutil
+import signal
 import subprocess
 import sys
 import time
@@ -13,7 +14,7 @@ from collections import defaultdict
 from json.decoder import JSONDecodeError
 from pathlib import Path
 from types import SimpleNamespace
-from typing import List, Optional
+from typing import Dict, List, Optional, Set
 
 import git
 import importlib_resources
@@ -40,6 +41,61 @@ app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
 load_dotenv(override=True)
 
 
+# Global variable to track if we should pause the benchmark
+PAUSE_REQUESTED = False
+
+
+def save_checkpoint(dirname: Path, completed_tests: Set[str], pending_tests: List[str]) -> None:
+    """Save the current benchmark progress to a checkpoint file."""
+    checkpoint_file = dirname / ".benchmark_checkpoint.json"
+    checkpoint_data = {
+        "timestamp": datetime.datetime.now().isoformat(),
+        "completed_tests": list(completed_tests),
+        "pending_tests": pending_tests,
+    }
+
+    with open(checkpoint_file, "w") as f:
+        json.dump(checkpoint_data, indent=4, sort_keys=True, default=str, fp=f)
+
+    print(f"\nCheckpoint saved to {checkpoint_file}")
+    print(f"Completed: {len(completed_tests)} tests")
+    print(f"Pending: {len(pending_tests)} tests")
+
+
+def load_checkpoint(dirname: Path) -> Dict:
+    """Load the benchmark progress from a checkpoint file."""
+    checkpoint_file = dirname / ".benchmark_checkpoint.json"
+
+    if not checkpoint_file.exists():
+        return {"completed_tests": [], "pending_tests": []}
+
+    try:
+        with open(checkpoint_file) as f:
+            checkpoint_data = json.load(f)
+
+        print(f"\nLoaded checkpoint from {checkpoint_file}")
+        print(f"Checkpoint timestamp: {checkpoint_data.get('timestamp', 'unknown')}")
+        print(f"Completed: {len(checkpoint_data.get('completed_tests', []))} tests")
+        print(f"Pending: {len(checkpoint_data.get('pending_tests', []))} tests")
+
+        return checkpoint_data
+    except (json.JSONDecodeError, KeyError) as e:
+        print(f"Error loading checkpoint file: {e}")
+        return {"completed_tests": [], "pending_tests": []}
+
+
+def signal_handler(sig, frame):
+    """Handle Ctrl+C by setting the pause flag instead of terminating immediately."""
+    global PAUSE_REQUESTED
+    if PAUSE_REQUESTED:
+        print("\nForce quitting...")
+        sys.exit(1)
+    else:
+        print("\nPause requested. Will stop after current test completes...")
+        print("(Press Ctrl+C again to force quit)")
+        PAUSE_REQUESTED = True
+
+
 def find_latest_benchmark_dir():
     benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()]
     if not benchmark_dirs:
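The pause mechanism added above is cooperative: the first `Ctrl+C` only flips `PAUSE_REQUESTED`, the per-test loop in `main()` checks the flag between tests, and a second `Ctrl+C` exits immediately. Here is a minimal standalone sketch of that pattern, independent of `benchmark.py` (the loop body is a stand-in for running one test):

```python
import signal
import sys
import time

PAUSE_REQUESTED = False


def handler(sig, frame):
    global PAUSE_REQUESTED
    if PAUSE_REQUESTED:
        # Second Ctrl+C: stop waiting politely and exit right away.
        sys.exit(1)
    print("Pause requested; finishing the current item (Ctrl+C again to force quit)")
    PAUSE_REQUESTED = True


signal.signal(signal.SIGINT, handler)

for item in range(10):  # stands in for the per-test loop
    if PAUSE_REQUESTED:
        print("Would save a checkpoint here and exit")
        break
    time.sleep(1)  # stands in for running one test
    print(f"finished item {item}")
```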
@@ -184,6 +240,7 @@ def main(
         False, "--clean", "-c", help="Discard the existing testdir and make a clean copy"
     ),
     cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"),
+    resume: bool = typer.Option(False, "--resume", help="Resume from checkpoint in the matching testdir"),
     make_new: bool = typer.Option(False, "--new", "-n", help="Make a new dated testdir"),
     no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"),
     no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"),
@@ -225,6 +282,9 @@ def main(
         latest_dir = find_latest_benchmark_dir()
         dirnames = [str(latest_dir)]
 
+    # Register signal handler for graceful pausing
+    signal.signal(signal.SIGINT, signal_handler)
+
     if dirnames is None:
         dirnames = []
 
@@ -235,7 +295,7 @@ def main(
     updated_dirnames = []
     for dirname in dirnames:
         dirname = Path(dirname)
-        dirname = resolve_dirname(dirname, stats_only or cont, make_new)
+        dirname = resolve_dirname(dirname, stats_only or cont or resume, make_new)
         if not dirname:
             return 1
         updated_dirnames.append(dirname)
@@ -340,8 +400,28 @@ def main(
         keywords = keywords.split(",")
         test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn]
 
-    random.shuffle(test_dnames)
-    if num_tests > 0:
+    # Load checkpoint if resuming
+    checkpoint_data = {}
+    completed_tests = set()
+
+    if resume:
+        checkpoint_data = load_checkpoint(dirname)
+        completed_tests = set(checkpoint_data.get("completed_tests", []))
+
+        # If we have pending tests from a previous run, use those instead of shuffling
+        pending_tests = checkpoint_data.get("pending_tests", [])
+        if pending_tests:
+            print(f"Resuming with {len(pending_tests)} pending tests from checkpoint")
+            test_dnames = pending_tests
+        else:
+            # Filter out already completed tests
+            test_dnames = [dn for dn in test_dnames if dn not in completed_tests]
+            random.shuffle(test_dnames)
+    else:
+        # Normal operation - shuffle and limit tests
+        random.shuffle(test_dnames)
+
+    if num_tests > 0 and not resume:
         test_dnames = test_dnames[:num_tests]
 
     # Don't give up when benchmarking
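To make the selection logic above concrete: with `--resume`, a non-empty pending list from the checkpoint is run as-is (in checkpoint order, no shuffle) and the `--num-tests` cap is skipped; without `--resume`, the list is shuffled and capped as before. A toy walk-through with made-up test names, not taken from the commit:

```python
import random

test_dnames = ["t1", "t2", "t3", "t4"]
checkpoint = {"completed_tests": ["t1"], "pending_tests": ["t3", "t4"]}
resume = True
num_tests = 2

if resume:
    completed = set(checkpoint["completed_tests"])
    pending = checkpoint["pending_tests"]
    if pending:
        test_dnames = pending  # run exactly what was left, in checkpoint order
    else:
        test_dnames = [t for t in test_dnames if t not in completed]
        random.shuffle(test_dnames)
else:
    random.shuffle(test_dnames)

if num_tests > 0 and not resume:
    test_dnames = test_dnames[:num_tests]

print(test_dnames)  # ['t3', 't4'] -- the --num-tests cap does not apply on resume
```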
@@ -352,7 +432,15 @@ def main(
 
     if threads == 1:
         all_results = []
+        remaining_tests = test_dnames.copy()
+
         for test_path in test_dnames:
+            # Check if we should pause
+            if PAUSE_REQUESTED:
+                print("\nPausing benchmark as requested...")
+                save_checkpoint(dirname, completed_tests, remaining_tests)
+                return 0
+
             results = run_test(
                 original_dname,
                 dirname / test_path,
@@ -373,10 +461,21 @@ def main(
             )
 
             all_results.append(results)
+
+            # Update completed and remaining tests
+            if results:
+                completed_tests.add(test_path)
+                remaining_tests.remove(test_path)
+
+            # Save checkpoint after each test
+            save_checkpoint(dirname, completed_tests, remaining_tests)
+
             summarize_results(dirname)
             if sleep:
                 time.sleep(sleep)
     else:
+        # For threaded execution, we can't easily pause in the middle
+        # So we'll just run all tests and save a checkpoint at the end
         run_test_threaded = lox.thread(threads)(run_test)
         for test_path in test_dnames:
             run_test_threaded.scatter(
@@ -399,11 +498,29 @@ def main(
             )
         all_results = run_test_threaded.gather(tqdm=True)
 
+        # Update completed tests based on results
+        for test_path, result in zip(test_dnames, all_results):
+            if result:
+                completed_tests.add(test_path)
+
     print()
     print()
     print()
     summarize_results(dirname)
+
+    # Save final checkpoint
+    remaining_tests = [t for t in test_dnames if t not in completed_tests]
+    save_checkpoint(dirname, completed_tests, remaining_tests)
+
+    if PAUSE_REQUESTED:
+        print("\nBenchmark paused. To resume, run:")
+        print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}")
+    elif not remaining_tests:
+        print("\nAll tests completed successfully!")
+    else:
+        print(f"\n{len(remaining_tests)} tests were not completed. To resume, run:")
+        print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}")
+
     return 0
 
@@ -480,6 +597,11 @@ def summarize_results(dirname, stats_languages=None):
 
     passed_tests = [0] * tries
 
+    # Initialize language-specific tracking
+    languages = set()
+    language_tests = defaultdict(int)
+    language_passed = defaultdict(lambda: [0] * tries)
+
     res.completed_tests = 0
     res.duration = 0
     res.cost = 0
@@ -510,6 +632,15 @@ def summarize_results(dirname, stats_languages=None):
             for i in range(len(tests_outcomes) - 1, tries):
                 passed_tests[i] += 1
 
+        # Track language-specific results
+        language = results.get("language")
+        if language:
+            languages.add(language)
+            language_tests[language] += 1
+            if passed:
+                for i in range(len(tests_outcomes) - 1, tries):
+                    language_passed[language][i] += 1
+
         res.cost += results.get("cost", 0)
         res.duration += results.get("duration", 0)
         res.test_timeouts += results.get("test_timeouts", 0)
@@ -587,6 +718,21 @@ def summarize_results(dirname, stats_languages=None):
         pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests
         print(f"  percent_cases_well_formed: {pct_well_formed * 100:.1f}")
 
+    # Display language-specific pass rates
+    if languages:
+        # Process language-specific pass rates without breaking YAML format
+        for language in sorted(languages):
+            if language_tests[language] > 0:
+                # Only print pass rate 2 for each language
+                if tries >= 2:  # Make sure we have at least 2 tries
+                    i = 1  # Index for pass_rate_2 (0-based index)
+                    lang_pass_rate = 100 * language_passed[language][i] / language_tests[language]
+                    print(f"  {language}_pass_rate_2: {lang_pass_rate:.1f}")
+                    # Still store all the data in the result object for potential use in graphs
+                    setattr(res, f"{language}_pass_rate_2", f"{lang_pass_rate:.1f}")
+                    setattr(res, f"{language}_pass_num_2", language_passed[language][i])
+                    setattr(res, f"{language}_tests", language_tests[language])
+
     show("error_outputs")
     show("num_malformed_responses")
     show("num_with_malformed_responses")
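The per-language lines above are indented like the other report keys so the output stays valid YAML. A toy example of what they compute, with illustrative numbers:

```python
tries = 2
language_tests = {"python": 8, "rust": 4}             # tests seen per language
language_passed = {"python": [3, 5], "rust": [1, 3]}  # index 1 corresponds to pass_rate_2

for language in sorted(language_tests):
    lang_pass_rate = 100 * language_passed[language][1] / language_tests[language]
    print(f"  {language}_pass_rate_2: {lang_pass_rate:.1f}")

# Output:
#   python_pass_rate_2: 62.5
#   rust_pass_rate_2: 75.0
```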
@@ -658,9 +804,10 @@ def get_replayed_content(replay_dname, test_dname):
     res = replay_fname.read_text()
     return res
 
-    res = res.splitlines(keepends=True)
-    res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
-    return "".join(res)
+    # Note: The code below is unreachable but kept for reference
+    # res = res.splitlines(keepends=True)
+    # res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
+    # return "".join(res)
 
 
 def run_test(original_dname, testdir, *args, **kwargs):
@@ -939,11 +1086,19 @@ def run_test_real(
         if verbose:
             print(f"Failed to clean up Node.js node_modules directory: {e}")
 
+    # Get language from the testdir path
+    language = None
+    for part in testdir.parts:
+        if part in ["python", "javascript", "java", "cpp", "go", "rust"]:
+            language = part
+            break
+
     results = dict(
         testdir=str(testdir),
         testcase=testdir.name,
         model=main_model.name,
         edit_format=edit_format,
+        language=language,  # Add language information
         tests_outcomes=test_outcomes,
         cost=coder.total_cost,
         duration=dur,
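The language detection above simply scans the components of the test directory path for a known language name. A small sketch with a hypothetical directory layout:

```python
from pathlib import Path

# Hypothetical polyglot-benchmark style path; only the "go" component matters here.
testdir = Path("tmp.benchmarks/example-run/go/exercises/practice/ledger")

language = None
for part in testdir.parts:
    if part in ["python", "javascript", "java", "cpp", "go", "rust"]:
        language = part
        break

print(language)  # go
```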