AJ (@techfren) 2025-05-13 15:03:58 -07:00 committed by GitHub
commit e7b2514c07
2 changed files with 195 additions and 7 deletions

benchmark/README.md

@@ -83,6 +83,21 @@ You can run `./benchmark/benchmark.py --help` for a list of all the arguments, b
- `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
- `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`).
- `--read-model-settings=<filename.yml>` specifies model settings; see https://aider.chat/docs/config/adv-model-settings.html#model-settings
- `--resume` resumes a previously paused benchmark run from its checkpoint
- `--edit-format architect` runs in architect mode, which uses two models: one to propose changes and another to implement them
- `--editor-model` specifies the model to use for implementing changes in architect mode
- `--reasoning-effort` sets the reasoning effort for models that support it (e.g., "high", "medium", "low")
### Pausing and Resuming Benchmarks
Benchmarks can take a long time to run. You can pause a running benchmark by pressing `Ctrl+C` once: the benchmark finishes the current test, saves a checkpoint, and exits (pressing `Ctrl+C` a second time force-quits). With more than one thread, the checkpoint is only saved after all scheduled tests have finished. To resume the benchmark later, use the `--resume` flag:
```
# Resume a previously paused benchmark
./benchmark/benchmark.py YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run --resume --model gpt-3.5-turbo --edit-format whole --threads 10
```
When you resume a benchmark, it will pick up where it left off, using the list of pending tests from the checkpoint file. This allows you to run benchmarks over multiple sessions.
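The checkpoint itself is a small JSON file named `.benchmark_checkpoint.json` inside the run's directory. Here is a minimal sketch of inspecting it from Python (the run directory below is a placeholder; the field names match `save_checkpoint` in `benchmark/benchmark.py`):
```
# Sketch only: inspect a paused run's checkpoint (the directory name is a placeholder).
import json
from pathlib import Path

run_dir = Path("tmp.benchmarks/YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run")
checkpoint = json.loads((run_dir / ".benchmark_checkpoint.json").read_text())

print(checkpoint["timestamp"])             # when the checkpoint was last saved
print(len(checkpoint["completed_tests"]))  # tests already finished
print(len(checkpoint["pending_tests"]))    # tests that --resume will still run
```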
### Benchmark report
@@ -137,6 +152,24 @@ should be enough to reliably reproduce any benchmark run.
You can see examples of the benchmark report yaml in the
[aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
### Running benchmarks in architect mode
Architect mode uses two models: a main model that proposes changes and an editor model that implements them. This can be particularly useful for models that are good at reasoning but struggle with precise code edits.
Here's an example of running a benchmark in architect mode:
```
./benchmark/benchmark.py grook-mini-architect-deepseek-editor --model openrouter/x-ai/grok-3-mini-beta --editor-model openrouter/deepseek/deepseek-chat-v3-0324 --edit-format architect --threads 15 --exercises-dir polyglot-benchmark --reasoning-effort high
```
In this example:
- The main model is Grok-3-mini-beta (via OpenRouter)
- The editor model is DeepSeek Chat v3 (via OpenRouter)
- The edit format is set to "architect"
- Reasoning effort is set to "high"
- 15 threads are used for parallel processing
When running in architect mode, the benchmark report will include additional information about the editor model used.
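As a rough sketch of post-processing that extra information from a saved report entry, the snippet below assumes field names like `editor_model` and `editor_edit_format`, following the leaderboard data files linked above (the report filename is a placeholder):
```
# Sketch only: read architect-related fields from a saved report entry.
# The field names below are assumptions based on the leaderboard data files.
import yaml

with open("benchmark-report.yml") as f:  # placeholder filename
    entry = yaml.safe_load(f)

print(entry.get("model"))               # main (architect) model
print(entry.get("editor_model"))        # model that applied the edits
print(entry.get("editor_edit_format"))  # edit format used by the editor model
```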
## Limitations, notes

benchmark/benchmark.py

@@ -5,6 +5,7 @@ import os
import random
import re
import shutil
import signal
import subprocess
import sys
import time
@@ -13,7 +14,7 @@ from collections import defaultdict
from json.decoder import JSONDecodeError
from pathlib import Path
from types import SimpleNamespace
from typing import List, Optional
from typing import Dict, List, Optional, Set
import git
import importlib_resources
@@ -40,6 +41,61 @@ app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
load_dotenv(override=True)
# Global variable to track if we should pause the benchmark
PAUSE_REQUESTED = False
def save_checkpoint(dirname: Path, completed_tests: Set[str], pending_tests: List[str]) -> None:
"""Save the current benchmark progress to a checkpoint file."""
checkpoint_file = dirname / ".benchmark_checkpoint.json"
checkpoint_data = {
"timestamp": datetime.datetime.now().isoformat(),
"completed_tests": list(completed_tests),
"pending_tests": pending_tests,
}
with open(checkpoint_file, "w") as f:
json.dump(checkpoint_data, indent=4, sort_keys=True, default=str, fp=f)
print(f"\nCheckpoint saved to {checkpoint_file}")
print(f"Completed: {len(completed_tests)} tests")
print(f"Pending: {len(pending_tests)} tests")
def load_checkpoint(dirname: Path) -> Dict:
"""Load the benchmark progress from a checkpoint file."""
checkpoint_file = dirname / ".benchmark_checkpoint.json"
if not checkpoint_file.exists():
return {"completed_tests": [], "pending_tests": []}
try:
with open(checkpoint_file) as f:
checkpoint_data = json.load(f)
print(f"\nLoaded checkpoint from {checkpoint_file}")
print(f"Checkpoint timestamp: {checkpoint_data.get('timestamp', 'unknown')}")
print(f"Completed: {len(checkpoint_data.get('completed_tests', []))} tests")
print(f"Pending: {len(checkpoint_data.get('pending_tests', []))} tests")
return checkpoint_data
except (json.JSONDecodeError, KeyError) as e:
print(f"Error loading checkpoint file: {e}")
return {"completed_tests": [], "pending_tests": []}
def signal_handler(sig, frame):
"""Handle Ctrl+C by setting the pause flag instead of terminating immediately."""
global PAUSE_REQUESTED
if PAUSE_REQUESTED:
print("\nForce quitting...")
sys.exit(1)
else:
print("\nPause requested. Will stop after current test completes...")
print("(Press Ctrl+C again to force quit)")
PAUSE_REQUESTED = True
def find_latest_benchmark_dir():
benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()]
if not benchmark_dirs:
@@ -184,6 +240,7 @@ def main(
False, "--clean", "-c", help="Discard the existing testdir and make a clean copy"
),
cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"),
resume: bool = typer.Option(False, "--resume", help="Resume from checkpoint in the matching testdir"),
make_new: bool = typer.Option(False, "--new", "-n", help="Make a new dated testdir"),
no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"),
no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"),
@@ -225,6 +282,9 @@ def main(
latest_dir = find_latest_benchmark_dir()
dirnames = [str(latest_dir)]
# Register signal handler for graceful pausing
signal.signal(signal.SIGINT, signal_handler)
if dirnames is None:
dirnames = []
@@ -235,7 +295,7 @@ def main(
updated_dirnames = []
for dirname in dirnames:
dirname = Path(dirname)
dirname = resolve_dirname(dirname, stats_only or cont, make_new)
dirname = resolve_dirname(dirname, stats_only or cont or resume, make_new)
if not dirname:
return 1
updated_dirnames.append(dirname)
@@ -340,8 +400,28 @@ def main(
keywords = keywords.split(",")
test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn]
random.shuffle(test_dnames)
if num_tests > 0:
# Load checkpoint if resuming
checkpoint_data = {}
completed_tests = set()
if resume:
checkpoint_data = load_checkpoint(dirname)
completed_tests = set(checkpoint_data.get("completed_tests", []))
# If we have pending tests from a previous run, use those instead of shuffling
pending_tests = checkpoint_data.get("pending_tests", [])
if pending_tests:
print(f"Resuming with {len(pending_tests)} pending tests from checkpoint")
test_dnames = pending_tests
else:
# Filter out already completed tests
test_dnames = [dn for dn in test_dnames if dn not in completed_tests]
random.shuffle(test_dnames)
else:
# Normal operation - shuffle and limit tests
random.shuffle(test_dnames)
if num_tests > 0 and not resume:
test_dnames = test_dnames[:num_tests]
# Don't give up when benchmarking
@@ -352,7 +432,15 @@ def main(
if threads == 1:
all_results = []
remaining_tests = test_dnames.copy()
for test_path in test_dnames:
# Check if we should pause
if PAUSE_REQUESTED:
print("\nPausing benchmark as requested...")
save_checkpoint(dirname, completed_tests, remaining_tests)
return 0
results = run_test(
original_dname,
dirname / test_path,
@@ -373,10 +461,21 @@ def main(
)
all_results.append(results)
# Update completed and remaining tests
if results:
completed_tests.add(test_path)
remaining_tests.remove(test_path)
# Save checkpoint after each test
save_checkpoint(dirname, completed_tests, remaining_tests)
summarize_results(dirname)
if sleep:
time.sleep(sleep)
else:
# For threaded execution, we can't easily pause in the middle
# So we'll just run all tests and save a checkpoint at the end
run_test_threaded = lox.thread(threads)(run_test)
for test_path in test_dnames:
run_test_threaded.scatter(
@@ -399,11 +498,29 @@ def main(
)
all_results = run_test_threaded.gather(tqdm=True)
# Update completed tests based on results
for test_path, result in zip(test_dnames, all_results):
if result:
completed_tests.add(test_path)
print()
print()
print()
summarize_results(dirname)
# Save final checkpoint
remaining_tests = [t for t in test_dnames if t not in completed_tests]
save_checkpoint(dirname, completed_tests, remaining_tests)
if PAUSE_REQUESTED:
print("\nBenchmark paused. To resume, run:")
print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}")
elif not remaining_tests:
print("\nAll tests completed successfully!")
else:
print(f"\n{len(remaining_tests)} tests were not completed. To resume, run:")
print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}")
return 0
@@ -480,6 +597,11 @@ def summarize_results(dirname, stats_languages=None):
passed_tests = [0] * tries
# Initialize language-specific tracking
languages = set()
language_tests = defaultdict(int)
language_passed = defaultdict(lambda: [0] * tries)
res.completed_tests = 0
res.duration = 0
res.cost = 0
@@ -510,6 +632,15 @@ def summarize_results(dirname, stats_languages=None):
for i in range(len(tests_outcomes) - 1, tries):
passed_tests[i] += 1
# Track language-specific results
language = results.get("language")
if language:
languages.add(language)
language_tests[language] += 1
if passed:
for i in range(len(tests_outcomes) - 1, tries):
language_passed[language][i] += 1
res.cost += results.get("cost", 0)
res.duration += results.get("duration", 0)
res.test_timeouts += results.get("test_timeouts", 0)
@@ -587,6 +718,21 @@ def summarize_results(dirname, stats_languages=None):
pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests
print(f" percent_cases_well_formed: {pct_well_formed * 100:.1f}")
# Display language-specific pass rates
if languages:
# Process language-specific pass rates without breaking YAML format
for language in sorted(languages):
if language_tests[language] > 0:
# Only print pass rate 2 for each language
if tries >= 2: # Make sure we have at least 2 tries
i = 1 # Index for pass_rate_2 (0-based index)
lang_pass_rate = 100 * language_passed[language][i] / language_tests[language]
print(f" {language}_pass_rate_2: {lang_pass_rate:.1f}")
# Still store all the data in the result object for potential use in graphs
setattr(res, f"{language}_pass_rate_2", f"{lang_pass_rate:.1f}")
setattr(res, f"{language}_pass_num_2", language_passed[language][i])
setattr(res, f"{language}_tests", language_tests[language])
show("error_outputs")
show("num_malformed_responses")
show("num_with_malformed_responses")
@@ -658,9 +804,10 @@ def get_replayed_content(replay_dname, test_dname):
res = replay_fname.read_text()
return res
res = res.splitlines(keepends=True)
res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
return "".join(res)
# Note: The code below is unreachable but kept for reference
# res = res.splitlines(keepends=True)
# res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
# return "".join(res)
def run_test(original_dname, testdir, *args, **kwargs):
@@ -939,11 +1086,19 @@ def run_test_real(
if verbose:
print(f"Failed to clean up Node.js node_modules directory: {e}")
# Get language from the testdir path
language = None
for part in testdir.parts:
if part in ["python", "javascript", "java", "cpp", "go", "rust"]:
language = part
break
results = dict(
testdir=str(testdir),
testcase=testdir.name,
model=main_model.name,
edit_format=edit_format,
language=language, # Add language information
tests_outcomes=test_outcomes,
cost=coder.total_cost,
duration=dur,