diff --git a/benchmark/README.md b/benchmark/README.md
index 7765c00b7..e269c8de1 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -83,6 +83,21 @@ You can run `./benchmark/benchmark.py --help` for a list of all the arguments, b
 - `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
 - `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`).
 - `--read-model-settings=` specify model settings, see here: https://aider.chat/docs/config/adv-model-settings.html#model-settings
+- `--resume` resume a previously paused benchmark run from its checkpoint
+- `--edit-format architect` run in architect mode, which uses two models: one to propose changes and another to implement them
+- `--editor-model` specify the model to use for implementing changes in architect mode
+- `--reasoning-effort` set reasoning effort for models that support it (e.g., "high", "medium", "low")
+
+### Pausing and Resuming Benchmarks
+
+Benchmarks can take a long time to run. You can pause a running benchmark by pressing `Ctrl+C` once. The benchmark will complete the current test and then save a checkpoint before exiting. To resume the benchmark later, use the `--resume` flag:
+
+```
+# Resume a previously paused benchmark
+./benchmark/benchmark.py YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run --resume --model gpt-3.5-turbo --edit-format whole --threads 10
+```
+
+When you resume a benchmark, it will pick up where it left off, using the list of pending tests from the checkpoint file. This allows you to run benchmarks over multiple sessions.
 
 ### Benchmark report
 
@@ -137,6 +152,24 @@ should be enough to reliably reproduce any benchmark run.
 
 You can see examples of the benchmark report yaml in the
 [aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
+### Running benchmarks in architect mode
+
+Architect mode uses two models: a main model that proposes changes and an editor model that implements them. This can be particularly useful for models that are good at reasoning but struggle with precise code edits.
+
+Here's an example of running a benchmark in architect mode:
+
+```
+./benchmark/benchmark.py grook-mini-architect-deepseek-editor --model openrouter/x-ai/grok-3-mini-beta --editor-model openrouter/deepseek/deepseek-chat-v3-0324 --edit-format architect --threads 15 --exercises-dir polyglot-benchmark --reasoning-effort high
+```
+
+In this example:
+- The main model is Grok-3-mini-beta (via OpenRouter)
+- The editor model is DeepSeek Chat v3 (via OpenRouter)
+- The edit format is set to "architect"
+- Reasoning effort is set to "high"
+- 15 threads are used for parallel processing
+
+When running in architect mode, the benchmark report will include additional information about the editor model used.
 
 ## Limitations, notes
 
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index bf09bafef..f68e07b58 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -5,6 +5,7 @@ import os
 import random
 import re
 import shutil
+import signal
 import subprocess
 import sys
 import time
@@ -13,7 +14,7 @@ from collections import defaultdict
 from json.decoder import JSONDecodeError
 from pathlib import Path
 from types import SimpleNamespace
-from typing import List, Optional
+from typing import Dict, List, Optional, Set
 
 import git
 import importlib_resources
@@ -40,6 +41,61 @@ app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
 
 load_dotenv(override=True)
 
+# Global variable to track if we should pause the benchmark
+PAUSE_REQUESTED = False
+
+
+def save_checkpoint(dirname: Path, completed_tests: Set[str], pending_tests: List[str]) -> None:
+    """Save the current benchmark progress to a checkpoint file."""
+    checkpoint_file = dirname / ".benchmark_checkpoint.json"
+    checkpoint_data = {
+        "timestamp": datetime.datetime.now().isoformat(),
+        "completed_tests": list(completed_tests),
+        "pending_tests": pending_tests,
+    }
+
+    with open(checkpoint_file, "w") as f:
+        json.dump(checkpoint_data, indent=4, sort_keys=True, default=str, fp=f)
+
+    print(f"\nCheckpoint saved to {checkpoint_file}")
+    print(f"Completed: {len(completed_tests)} tests")
+    print(f"Pending: {len(pending_tests)} tests")
+
+
+def load_checkpoint(dirname: Path) -> Dict:
+    """Load the benchmark progress from a checkpoint file."""
+    checkpoint_file = dirname / ".benchmark_checkpoint.json"
+
+    if not checkpoint_file.exists():
+        return {"completed_tests": [], "pending_tests": []}
+
+    try:
+        with open(checkpoint_file) as f:
+            checkpoint_data = json.load(f)
+
+        print(f"\nLoaded checkpoint from {checkpoint_file}")
+        print(f"Checkpoint timestamp: {checkpoint_data.get('timestamp', 'unknown')}")
+        print(f"Completed: {len(checkpoint_data.get('completed_tests', []))} tests")
+        print(f"Pending: {len(checkpoint_data.get('pending_tests', []))} tests")
+
+        return checkpoint_data
+    except (json.JSONDecodeError, KeyError) as e:
+        print(f"Error loading checkpoint file: {e}")
+        return {"completed_tests": [], "pending_tests": []}
+
+
+def signal_handler(sig, frame):
+    """Handle Ctrl+C by setting the pause flag instead of terminating immediately."""
+    global PAUSE_REQUESTED
+    if PAUSE_REQUESTED:
+        print("\nForce quitting...")
+        sys.exit(1)
+    else:
+        print("\nPause requested. Will stop after current test completes...")
+        print("(Press Ctrl+C again to force quit)")
+        PAUSE_REQUESTED = True
+
+
 def find_latest_benchmark_dir():
     benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()]
     if not benchmark_dirs:
@@ -184,6 +240,7 @@ def main(
         False, "--clean", "-c", help="Discard the existing testdir and make a clean copy"
     ),
     cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"),
+    resume: bool = typer.Option(False, "--resume", help="Resume from checkpoint in the matching testdir"),
     make_new: bool = typer.Option(False, "--new", "-n", help="Make a new dated testdir"),
     no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"),
     no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"),
@@ -225,6 +282,9 @@ def main(
         latest_dir = find_latest_benchmark_dir()
         dirnames = [str(latest_dir)]
 
+    # Register signal handler for graceful pausing
+    signal.signal(signal.SIGINT, signal_handler)
+
     if dirnames is None:
         dirnames = []
 
@@ -235,7 +295,7 @@ def main(
     updated_dirnames = []
     for dirname in dirnames:
         dirname = Path(dirname)
-        dirname = resolve_dirname(dirname, stats_only or cont, make_new)
+        dirname = resolve_dirname(dirname, stats_only or cont or resume, make_new)
         if not dirname:
             return 1
         updated_dirnames.append(dirname)
@@ -340,8 +400,28 @@ def main(
         keywords = keywords.split(",")
         test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn]
 
-    random.shuffle(test_dnames)
-    if num_tests > 0:
+    # Load checkpoint if resuming
+    checkpoint_data = {}
+    completed_tests = set()
+
+    if resume:
+        checkpoint_data = load_checkpoint(dirname)
+        completed_tests = set(checkpoint_data.get("completed_tests", []))
+
+        # If we have pending tests from a previous run, use those instead of shuffling
+        pending_tests = checkpoint_data.get("pending_tests", [])
+        if pending_tests:
+            print(f"Resuming with {len(pending_tests)} pending tests from checkpoint")
+            test_dnames = pending_tests
+        else:
+            # Filter out already completed tests
+            test_dnames = [dn for dn in test_dnames if dn not in completed_tests]
+            random.shuffle(test_dnames)
+    else:
+        # Normal operation - shuffle and limit tests
+        random.shuffle(test_dnames)
+
+    if num_tests > 0 and not resume:
         test_dnames = test_dnames[:num_tests]
 
     # Don't give up when benchmarking
@@ -352,7 +432,15 @@ def main(
 
     if threads == 1:
         all_results = []
+        remaining_tests = test_dnames.copy()
+
         for test_path in test_dnames:
+            # Check if we should pause
+            if PAUSE_REQUESTED:
+                print("\nPausing benchmark as requested...")
+                save_checkpoint(dirname, completed_tests, remaining_tests)
+                return 0
+
             results = run_test(
                 original_dname,
                 dirname / test_path,
@@ -373,10 +461,21 @@ def main(
             )
 
             all_results.append(results)
+
+            # Update completed and remaining tests
+            if results:
+                completed_tests.add(test_path)
+                remaining_tests.remove(test_path)
+
+            # Save checkpoint after each test
+            save_checkpoint(dirname, completed_tests, remaining_tests)
+
             summarize_results(dirname)
             if sleep:
                 time.sleep(sleep)
     else:
+        # For threaded execution, we can't easily pause in the middle
+        # So we'll just run all tests and save a checkpoint at the end
        run_test_threaded = lox.thread(threads)(run_test)
         for test_path in test_dnames:
             run_test_threaded.scatter(
@@ -399,11 +498,29 @@ def main(
             )
         all_results = run_test_threaded.gather(tqdm=True)
 
+        # Update completed tests based on results
+        for test_path, result in zip(test_dnames, all_results):
+            if result:
+                completed_tests.add(test_path)
+
     print()
print() print() summarize_results(dirname) + # Save final checkpoint + remaining_tests = [t for t in test_dnames if t not in completed_tests] + save_checkpoint(dirname, completed_tests, remaining_tests) + + if PAUSE_REQUESTED: + print("\nBenchmark paused. To resume, run:") + print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}") + elif not remaining_tests: + print("\nAll tests completed successfully!") + else: + print(f"\n{len(remaining_tests)} tests were not completed. To resume, run:") + print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}") + return 0 @@ -480,6 +597,11 @@ def summarize_results(dirname, stats_languages=None): passed_tests = [0] * tries + # Initialize language-specific tracking + languages = set() + language_tests = defaultdict(int) + language_passed = defaultdict(lambda: [0] * tries) + res.completed_tests = 0 res.duration = 0 res.cost = 0 @@ -510,6 +632,15 @@ def summarize_results(dirname, stats_languages=None): for i in range(len(tests_outcomes) - 1, tries): passed_tests[i] += 1 + # Track language-specific results + language = results.get("language") + if language: + languages.add(language) + language_tests[language] += 1 + if passed: + for i in range(len(tests_outcomes) - 1, tries): + language_passed[language][i] += 1 + res.cost += results.get("cost", 0) res.duration += results.get("duration", 0) res.test_timeouts += results.get("test_timeouts", 0) @@ -587,6 +718,21 @@ def summarize_results(dirname, stats_languages=None): pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests print(f" percent_cases_well_formed: {pct_well_formed * 100:.1f}") + # Display language-specific pass rates + if languages: + # Process language-specific pass rates without breaking YAML format + for language in sorted(languages): + if language_tests[language] > 0: + # Only print pass rate 2 for each language + if tries >= 2: # Make sure we have at least 2 tries + i = 1 # Index for pass_rate_2 (0-based index) + lang_pass_rate = 100 * language_passed[language][i] / language_tests[language] + print(f" {language}_pass_rate_2: {lang_pass_rate:.1f}") + # Still store all the data in the result object for potential use in graphs + setattr(res, f"{language}_pass_rate_2", f"{lang_pass_rate:.1f}") + setattr(res, f"{language}_pass_num_2", language_passed[language][i]) + setattr(res, f"{language}_tests", language_tests[language]) + show("error_outputs") show("num_malformed_responses") show("num_with_malformed_responses") @@ -658,9 +804,10 @@ def get_replayed_content(replay_dname, test_dname): res = replay_fname.read_text() return res - res = res.splitlines(keepends=True) - res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")] - return "".join(res) + # Note: The code below is unreachable but kept for reference + # res = res.splitlines(keepends=True) + # res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")] + # return "".join(res) def run_test(original_dname, testdir, *args, **kwargs): @@ -939,11 +1086,19 @@ def run_test_real( if verbose: print(f"Failed to clean up Node.js node_modules directory: {e}") + # Get language from the testdir path + language = None + for part in testdir.parts: + if part in ["python", "javascript", "java", "cpp", "go", "rust"]: + language = part + break + results = dict( testdir=str(testdir), testcase=testdir.name, model=main_model.name, 
         edit_format=edit_format,
+        language=language,  # Add language information
         tests_outcomes=test_outcomes,
         cost=coder.total_cost,
         duration=dur,
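
A note on the checkpoint format introduced above: `save_checkpoint` writes `.benchmark_checkpoint.json` into the run directory with three keys (`completed_tests`, `pending_tests`, `timestamp`), serialized with `indent=4` and `sort_keys=True`. The sketch below shows roughly what that file would look like; the test names and timestamp are illustrative placeholders, not real benchmark entries.

```
{
    "completed_tests": [
        "some-exercise-dir-1",
        "some-exercise-dir-2"
    ],
    "pending_tests": [
        "some-exercise-dir-3",
        "some-exercise-dir-4"
    ],
    "timestamp": "2025-01-01T12:00:00.000000"
}
```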
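Similarly, the per-language reporting added to `summarize_results` prints one extra `<language>_pass_rate_2` key per detected language into the benchmark report output, alongside the existing stats. Based on the format string in the patch, those lines would look roughly like the sketch below; the language names come from the detection list in `run_test_real`, and the numbers are made up for illustration.

```
  cpp_pass_rate_2: 60.0
  go_pass_rate_2: 72.5
  python_pass_rate_2: 68.9
  rust_pass_rate_2: 55.6
```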