AJ (@techfren) 2025-05-13 15:03:58 -07:00 committed by GitHub
commit e7b2514c07
2 changed files with 195 additions and 7 deletions

benchmark/README.md

@@ -83,6 +83,21 @@ You can run `./benchmark/benchmark.py --help` for a list of all the arguments, b
- `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
- `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`).
- `--read-model-settings=<filename.yml>` specifies model settings; see https://aider.chat/docs/config/adv-model-settings.html#model-settings
- `--resume` resumes a previously paused benchmark run from its checkpoint
- `--edit-format architect` runs in architect mode, which uses two models: one to propose changes and another to implement them
- `--editor-model` specifies the model to use for implementing changes in architect mode
- `--reasoning-effort` sets the reasoning effort for models that support it (e.g., "high", "medium", "low")
### Pausing and Resuming Benchmarks
Benchmarks can take a long time to run. You can pause a running benchmark by pressing `Ctrl+C` once: the benchmark finishes the current test, saves a checkpoint, and exits (pressing `Ctrl+C` a second time force-quits). With more than one thread, the checkpoint is only saved after all scheduled tests have finished. To resume the benchmark later, use the `--resume` flag:
```
# Resume a previously paused benchmark
./benchmark/benchmark.py YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run --resume --model gpt-3.5-turbo --edit-format whole --threads 10
```
When you resume a benchmark, it will pick up where it left off, using the list of pending tests from the checkpoint file. This allows you to run benchmarks over multiple sessions.
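The checkpoint itself is a small JSON file named `.benchmark_checkpoint.json` inside the run's directory. Here is a minimal sketch of inspecting it from Python (the run directory below is a placeholder; the field names match `save_checkpoint` in `benchmark/benchmark.py`):
```
# Sketch only: inspect a paused run's checkpoint (the directory name is a placeholder).
import json
from pathlib import Path

run_dir = Path("tmp.benchmarks/YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run")
checkpoint = json.loads((run_dir / ".benchmark_checkpoint.json").read_text())

print(checkpoint["timestamp"])             # when the checkpoint was last saved
print(len(checkpoint["completed_tests"]))  # tests already finished
print(len(checkpoint["pending_tests"]))    # tests that --resume will still run
```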
### Benchmark report
@@ -137,6 +152,24 @@ should be enough to reliably reproduce any benchmark run.
You can see examples of the benchmark report yaml in the
[aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
### Running benchmarks in architect mode
Architect mode uses two models: a main model that proposes changes and an editor model that implements them. This can be particularly useful for models that are good at reasoning but struggle with precise code edits.
Here's an example of running a benchmark in architect mode:
```
./benchmark/benchmark.py grook-mini-architect-deepseek-editor --model openrouter/x-ai/grok-3-mini-beta --editor-model openrouter/deepseek/deepseek-chat-v3-0324 --edit-format architect --threads 15 --exercises-dir polyglot-benchmark --reasoning-effort high
```
In this example:
- The main model is Grok-3-mini-beta (via OpenRouter)
- The editor model is DeepSeek Chat v3 (via OpenRouter)
- The edit format is set to "architect"
- Reasoning effort is set to "high"
- 15 threads are used for parallel processing
When running in architect mode, the benchmark report will include additional information about the editor model used.
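As a rough sketch of post-processing that extra information from a saved report entry, the snippet below assumes field names like `editor_model` and `editor_edit_format`, following the leaderboard data files linked above (the report filename is a placeholder):
```
# Sketch only: read architect-related fields from a saved report entry.
# The field names below are assumptions based on the leaderboard data files.
import yaml

with open("benchmark-report.yml") as f:  # placeholder filename
    entry = yaml.safe_load(f)

print(entry.get("model"))               # main (architect) model
print(entry.get("editor_model"))        # model that applied the edits
print(entry.get("editor_edit_format"))  # edit format used by the editor model
```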
## Limitations, notes

benchmark/benchmark.py

@@ -5,6 +5,7 @@ import os
import random
import re
import shutil
import signal
import subprocess
import sys
import time
@@ -13,7 +14,7 @@ from collections import defaultdict
from json.decoder import JSONDecodeError
from pathlib import Path
from types import SimpleNamespace
from typing import List, Optional
from typing import Dict, List, Optional, Set
import git
import importlib_resources
@@ -40,6 +41,61 @@ app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
load_dotenv(override=True)
# Global variable to track if we should pause the benchmark
PAUSE_REQUESTED = False
def save_checkpoint(dirname: Path, completed_tests: Set[str], pending_tests: List[str]) -> None:
"""Save the current benchmark progress to a checkpoint file."""
checkpoint_file = dirname / ".benchmark_checkpoint.json"
checkpoint_data = {
"timestamp": datetime.datetime.now().isoformat(),
"completed_tests": list(completed_tests),
"pending_tests": pending_tests,
}
with open(checkpoint_file, "w") as f:
json.dump(checkpoint_data, indent=4, sort_keys=True, default=str, fp=f)
print(f"\nCheckpoint saved to {checkpoint_file}")
print(f"Completed: {len(completed_tests)} tests")
print(f"Pending: {len(pending_tests)} tests")
def load_checkpoint(dirname: Path) -> Dict:
"""Load the benchmark progress from a checkpoint file."""
checkpoint_file = dirname / ".benchmark_checkpoint.json"
if not checkpoint_file.exists():
return {"completed_tests": [], "pending_tests": []}
try:
with open(checkpoint_file) as f:
checkpoint_data = json.load(f)
print(f"\nLoaded checkpoint from {checkpoint_file}")
print(f"Checkpoint timestamp: {checkpoint_data.get('timestamp', 'unknown')}")
print(f"Completed: {len(checkpoint_data.get('completed_tests', []))} tests")
print(f"Pending: {len(checkpoint_data.get('pending_tests', []))} tests")
return checkpoint_data
except (json.JSONDecodeError, KeyError) as e:
print(f"Error loading checkpoint file: {e}")
return {"completed_tests": [], "pending_tests": []}
def signal_handler(sig, frame):
"""Handle Ctrl+C by setting the pause flag instead of terminating immediately."""
global PAUSE_REQUESTED
if PAUSE_REQUESTED:
print("\nForce quitting...")
sys.exit(1)
else:
print("\nPause requested. Will stop after current test completes...")
print("(Press Ctrl+C again to force quit)")
PAUSE_REQUESTED = True
def find_latest_benchmark_dir():
benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()]
if not benchmark_dirs:
@@ -184,6 +240,7 @@ def main(
False, "--clean", "-c", help="Discard the existing testdir and make a clean copy"
),
cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"),
resume: bool = typer.Option(False, "--resume", help="Resume from checkpoint in the matching testdir"),
make_new: bool = typer.Option(False, "--new", "-n", help="Make a new dated testdir"),
no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"),
no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"),
@@ -225,6 +282,9 @@ def main(
latest_dir = find_latest_benchmark_dir()
dirnames = [str(latest_dir)]
# Register signal handler for graceful pausing
signal.signal(signal.SIGINT, signal_handler)
if dirnames is None:
dirnames = []
@@ -235,7 +295,7 @@ def main(
updated_dirnames = []
for dirname in dirnames:
dirname = Path(dirname)
dirname = resolve_dirname(dirname, stats_only or cont, make_new)
dirname = resolve_dirname(dirname, stats_only or cont or resume, make_new)
if not dirname:
return 1
updated_dirnames.append(dirname)
@@ -340,8 +400,28 @@ def main(
keywords = keywords.split(",")
test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn]
random.shuffle(test_dnames)
if num_tests > 0:
# Load checkpoint if resuming
checkpoint_data = {}
completed_tests = set()
if resume:
checkpoint_data = load_checkpoint(dirname)
completed_tests = set(checkpoint_data.get("completed_tests", []))
# If we have pending tests from a previous run, use those instead of shuffling
pending_tests = checkpoint_data.get("pending_tests", [])
if pending_tests:
print(f"Resuming with {len(pending_tests)} pending tests from checkpoint")
test_dnames = pending_tests
else:
# Filter out already completed tests
test_dnames = [dn for dn in test_dnames if dn not in completed_tests]
random.shuffle(test_dnames)
else:
# Normal operation - shuffle and limit tests
random.shuffle(test_dnames)
if num_tests > 0 and not resume:
test_dnames = test_dnames[:num_tests]
# Don't give up when benchmarking
@@ -352,7 +432,15 @@ def main(
if threads == 1:
all_results = []
remaining_tests = test_dnames.copy()
for test_path in test_dnames:
# Check if we should pause
if PAUSE_REQUESTED:
print("\nPausing benchmark as requested...")
save_checkpoint(dirname, completed_tests, remaining_tests)
return 0
results = run_test(
original_dname,
dirname / test_path,
@@ -373,10 +461,21 @@ def main(
)
all_results.append(results)
# Update completed and remaining tests
if results:
completed_tests.add(test_path)
remaining_tests.remove(test_path)
# Save checkpoint after each test
save_checkpoint(dirname, completed_tests, remaining_tests)
summarize_results(dirname)
if sleep:
time.sleep(sleep)
else:
# For threaded execution, we can't easily pause in the middle
# So we'll just run all tests and save a checkpoint at the end
run_test_threaded = lox.thread(threads)(run_test)
for test_path in test_dnames:
run_test_threaded.scatter(
@@ -399,11 +498,29 @@ def main(
)
all_results = run_test_threaded.gather(tqdm=True)
# Update completed tests based on results
for test_path, result in zip(test_dnames, all_results):
if result:
completed_tests.add(test_path)
print()
print()
print()
summarize_results(dirname)
# Save final checkpoint
remaining_tests = [t for t in test_dnames if t not in completed_tests]
save_checkpoint(dirname, completed_tests, remaining_tests)
if PAUSE_REQUESTED:
print("\nBenchmark paused. To resume, run:")
print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}")
elif not remaining_tests:
print("\nAll tests completed successfully!")
else:
print(f"\n{len(remaining_tests)} tests were not completed. To resume, run:")
print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}")
return 0
@@ -480,6 +597,11 @@ def summarize_results(dirname, stats_languages=None):
passed_tests = [0] * tries
# Initialize language-specific tracking
languages = set()
language_tests = defaultdict(int)
language_passed = defaultdict(lambda: [0] * tries)
res.completed_tests = 0
res.duration = 0
res.cost = 0
@@ -510,6 +632,15 @@ def summarize_results(dirname, stats_languages=None):
for i in range(len(tests_outcomes) - 1, tries):
passed_tests[i] += 1
# Track language-specific results
language = results.get("language")
if language:
languages.add(language)
language_tests[language] += 1
if passed:
for i in range(len(tests_outcomes) - 1, tries):
language_passed[language][i] += 1
res.cost += results.get("cost", 0)
res.duration += results.get("duration", 0)
res.test_timeouts += results.get("test_timeouts", 0)
@@ -587,6 +718,21 @@ def summarize_results(dirname, stats_languages=None):
pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests
print(f" percent_cases_well_formed: {pct_well_formed * 100:.1f}")
# Display language-specific pass rates
if languages:
# Process language-specific pass rates without breaking YAML format
for language in sorted(languages):
if language_tests[language] > 0:
# Only print pass rate 2 for each language
if tries >= 2: # Make sure we have at least 2 tries
i = 1 # Index for pass_rate_2 (0-based index)
lang_pass_rate = 100 * language_passed[language][i] / language_tests[language]
print(f" {language}_pass_rate_2: {lang_pass_rate:.1f}")
# Still store all the data in the result object for potential use in graphs
setattr(res, f"{language}_pass_rate_2", f"{lang_pass_rate:.1f}")
setattr(res, f"{language}_pass_num_2", language_passed[language][i])
setattr(res, f"{language}_tests", language_tests[language])
show("error_outputs")
show("num_malformed_responses")
show("num_with_malformed_responses")
@@ -658,9 +804,10 @@ def get_replayed_content(replay_dname, test_dname):
res = replay_fname.read_text()
return res
res = res.splitlines(keepends=True)
res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
return "".join(res)
# Note: The code below is unreachable but kept for reference
# res = res.splitlines(keepends=True)
# res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
# return "".join(res)
def run_test(original_dname, testdir, *args, **kwargs):
@@ -939,11 +1086,19 @@ def run_test_real(
if verbose:
print(f"Failed to clean up Node.js node_modules directory: {e}")
# Get language from the testdir path
language = None
for part in testdir.parts:
if part in ["python", "javascript", "java", "cpp", "go", "rust"]:
language = part
break
results = dict(
testdir=str(testdir),
testcase=testdir.name,
model=main_model.name,
edit_format=edit_format,
language=language, # Add language information
tests_outcomes=test_outcomes,
cost=coder.total_cost,
duration=dur,