Merge 3a93da8f8d into 3caab85931
commit e7b2514c07
2 changed files with 195 additions and 7 deletions
@@ -83,6 +83,21 @@ You can run `./benchmark/benchmark.py --help` for a list of all the arguments, b
 - `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
 - `--keywords` filters the tests to run to only the ones whose name match the supplied argument (similar to `pytest -k xxxx`).
 - `--read-model-settings=<filename.yml>` specify model settings, see here: https://aider.chat/docs/config/adv-model-settings.html#model-settings
+- `--resume` resume a previously paused benchmark run from its checkpoint
+- `--edit-format architect` run in architect mode, which uses two models: one to propose changes and another to implement them
+- `--editor-model` specify the model to use for implementing changes in architect mode
+- `--reasoning-effort` set reasoning effort for models that support it (e.g., "high", "medium", "low")
+
+### Pausing and Resuming Benchmarks
+
+Benchmarks can take a long time to run. You can pause a running benchmark by pressing `Ctrl+C` once. The benchmark will complete the current test and then save a checkpoint before exiting. To resume the benchmark later, use the `--resume` flag:
+
+```
+# Resume a previously paused benchmark
+./benchmark/benchmark.py YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run --resume --model gpt-3.5-turbo --edit-format whole --threads 10
+```
+
+When you resume a benchmark, it will pick up where it left off, using the list of pending tests from the checkpoint file. This allows you to run benchmarks over multiple sessions.
 
 ### Benchmark report
 
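For concreteness, the checkpoint mentioned above is a small JSON file named `.benchmark_checkpoint.json` inside the run's directory, holding a timestamp plus the lists of completed and pending tests (see the `save_checkpoint()`/`load_checkpoint()` helpers added further down in this diff). Below is a minimal sketch of the round trip using those same field names; the run directory and test names here are purely illustrative and not taken from the commit:

```python
import datetime
import json
from pathlib import Path

# Hypothetical run directory; real runs use a dated name like the one shown above.
run_dir = Path("tmp.example-benchmark-run")
run_dir.mkdir(exist_ok=True)
checkpoint_file = run_dir / ".benchmark_checkpoint.json"

# Shape written by save_checkpoint(): a timestamp plus two lists of test names.
checkpoint = {
    "timestamp": datetime.datetime.now().isoformat(),
    "completed_tests": ["python/exercises/practice/anagram"],  # illustrative
    "pending_tests": ["python/exercises/practice/bowling"],    # illustrative
}
checkpoint_file.write_text(json.dumps(checkpoint, indent=4, sort_keys=True, default=str))

# load_checkpoint() reads the same file back, falling back to empty lists if it is missing.
data = json.loads(checkpoint_file.read_text())
print(f"{len(data['completed_tests'])} completed, {len(data['pending_tests'])} pending")
```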
@@ -137,6 +152,24 @@ should be enough to reliably reproduce any benchmark run.
 
 You can see examples of the benchmark report yaml in the
 [aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).
 
+### Running benchmarks in architect mode
+
+Architect mode uses two models: a main model that proposes changes and an editor model that implements them. This can be particularly useful for models that are good at reasoning but struggle with precise code edits.
+
+Here's an example of running a benchmark in architect mode:
+
+```
+./benchmark/benchmark.py grook-mini-architect-deepseek-editor --model openrouter/x-ai/grok-3-mini-beta --editor-model openrouter/deepseek/deepseek-chat-v3-0324 --edit-format architect --threads 15 --exercises-dir polyglot-benchmark --reasoning-effort high
+```
+
+In this example:
+
+- The main model is Grok-3-mini-beta (via OpenRouter)
+- The editor model is DeepSeek Chat v3 (via OpenRouter)
+- The edit format is set to "architect"
+- Reasoning effort is set to "high"
+- 15 threads are used for parallel processing
+
+When running in architect mode, the benchmark report will include additional information about the editor model used.
+
 ## Limitations, notes
 
@@ -5,6 +5,7 @@ import os
 import random
 import re
 import shutil
+import signal
 import subprocess
 import sys
 import time
@@ -13,7 +14,7 @@ from collections import defaultdict
 from json.decoder import JSONDecodeError
 from pathlib import Path
 from types import SimpleNamespace
-from typing import List, Optional
+from typing import Dict, List, Optional, Set
 
 import git
 import importlib_resources
@@ -40,6 +41,61 @@ app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
 load_dotenv(override=True)
 
 
+# Global variable to track if we should pause the benchmark
+PAUSE_REQUESTED = False
+
+
+def save_checkpoint(dirname: Path, completed_tests: Set[str], pending_tests: List[str]) -> None:
+    """Save the current benchmark progress to a checkpoint file."""
+    checkpoint_file = dirname / ".benchmark_checkpoint.json"
+    checkpoint_data = {
+        "timestamp": datetime.datetime.now().isoformat(),
+        "completed_tests": list(completed_tests),
+        "pending_tests": pending_tests,
+    }
+
+    with open(checkpoint_file, "w") as f:
+        json.dump(checkpoint_data, indent=4, sort_keys=True, default=str, fp=f)
+
+    print(f"\nCheckpoint saved to {checkpoint_file}")
+    print(f"Completed: {len(completed_tests)} tests")
+    print(f"Pending: {len(pending_tests)} tests")
+
+
+def load_checkpoint(dirname: Path) -> Dict:
+    """Load the benchmark progress from a checkpoint file."""
+    checkpoint_file = dirname / ".benchmark_checkpoint.json"
+
+    if not checkpoint_file.exists():
+        return {"completed_tests": [], "pending_tests": []}
+
+    try:
+        with open(checkpoint_file) as f:
+            checkpoint_data = json.load(f)
+
+        print(f"\nLoaded checkpoint from {checkpoint_file}")
+        print(f"Checkpoint timestamp: {checkpoint_data.get('timestamp', 'unknown')}")
+        print(f"Completed: {len(checkpoint_data.get('completed_tests', []))} tests")
+        print(f"Pending: {len(checkpoint_data.get('pending_tests', []))} tests")
+
+        return checkpoint_data
+    except (json.JSONDecodeError, KeyError) as e:
+        print(f"Error loading checkpoint file: {e}")
+        return {"completed_tests": [], "pending_tests": []}
+
+
+def signal_handler(sig, frame):
+    """Handle Ctrl+C by setting the pause flag instead of terminating immediately."""
+    global PAUSE_REQUESTED
+    if PAUSE_REQUESTED:
+        print("\nForce quitting...")
+        sys.exit(1)
+    else:
+        print("\nPause requested. Will stop after current test completes...")
+        print("(Press Ctrl+C again to force quit)")
+        PAUSE_REQUESTED = True
+
+
 def find_latest_benchmark_dir():
     benchmark_dirs = [d for d in BENCHMARK_DNAME.iterdir() if d.is_dir()]
     if not benchmark_dirs:
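The pause mechanism added above is cooperative: the first `Ctrl+C` only flips `PAUSE_REQUESTED`, the per-test loop in `main()` checks the flag between tests, and a second `Ctrl+C` exits immediately. Here is a minimal standalone sketch of that pattern, independent of `benchmark.py` (the loop body is a stand-in for running one test):

```python
import signal
import sys
import time

PAUSE_REQUESTED = False


def handler(sig, frame):
    global PAUSE_REQUESTED
    if PAUSE_REQUESTED:
        # Second Ctrl+C: stop waiting politely and exit right away.
        sys.exit(1)
    print("Pause requested; finishing the current item (Ctrl+C again to force quit)")
    PAUSE_REQUESTED = True


signal.signal(signal.SIGINT, handler)

for item in range(10):  # stands in for the per-test loop
    if PAUSE_REQUESTED:
        print("Would save a checkpoint here and exit")
        break
    time.sleep(1)  # stands in for running one test
    print(f"finished item {item}")
```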
@@ -184,6 +240,7 @@ def main(
         False, "--clean", "-c", help="Discard the existing testdir and make a clean copy"
     ),
     cont: bool = typer.Option(False, "--cont", help="Continue the (single) matching testdir"),
+    resume: bool = typer.Option(False, "--resume", help="Resume from checkpoint in the matching testdir"),
     make_new: bool = typer.Option(False, "--new", "-n", help="Make a new dated testdir"),
     no_unit_tests: bool = typer.Option(False, "--no-unit-tests", help="Do not run unit tests"),
     no_aider: bool = typer.Option(False, "--no-aider", help="Do not run aider"),
@@ -225,6 +282,9 @@ def main(
         latest_dir = find_latest_benchmark_dir()
         dirnames = [str(latest_dir)]
 
+    # Register signal handler for graceful pausing
+    signal.signal(signal.SIGINT, signal_handler)
+
     if dirnames is None:
         dirnames = []
 
@@ -235,7 +295,7 @@ def main(
     updated_dirnames = []
     for dirname in dirnames:
         dirname = Path(dirname)
-        dirname = resolve_dirname(dirname, stats_only or cont, make_new)
+        dirname = resolve_dirname(dirname, stats_only or cont or resume, make_new)
         if not dirname:
             return 1
         updated_dirnames.append(dirname)
@@ -340,8 +400,28 @@ def main(
         keywords = keywords.split(",")
         test_dnames = [dn for dn in test_dnames for keyword in keywords if keyword in dn]
 
-    random.shuffle(test_dnames)
-    if num_tests > 0:
+    # Load checkpoint if resuming
+    checkpoint_data = {}
+    completed_tests = set()
+
+    if resume:
+        checkpoint_data = load_checkpoint(dirname)
+        completed_tests = set(checkpoint_data.get("completed_tests", []))
+
+        # If we have pending tests from a previous run, use those instead of shuffling
+        pending_tests = checkpoint_data.get("pending_tests", [])
+        if pending_tests:
+            print(f"Resuming with {len(pending_tests)} pending tests from checkpoint")
+            test_dnames = pending_tests
+        else:
+            # Filter out already completed tests
+            test_dnames = [dn for dn in test_dnames if dn not in completed_tests]
+            random.shuffle(test_dnames)
+    else:
+        # Normal operation - shuffle and limit tests
+        random.shuffle(test_dnames)
+
+    if num_tests > 0 and not resume:
         test_dnames = test_dnames[:num_tests]
 
     # Don't give up when benchmarking
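To make the selection logic above concrete: with `--resume`, a non-empty pending list from the checkpoint is run as-is (in checkpoint order, no shuffle) and the `--num-tests` cap is skipped; without `--resume`, the list is shuffled and capped as before. A toy walk-through with made-up test names, not taken from the commit:

```python
import random

test_dnames = ["t1", "t2", "t3", "t4"]
checkpoint = {"completed_tests": ["t1"], "pending_tests": ["t3", "t4"]}
resume = True
num_tests = 2

if resume:
    completed = set(checkpoint["completed_tests"])
    pending = checkpoint["pending_tests"]
    if pending:
        test_dnames = pending  # run exactly what was left, in checkpoint order
    else:
        test_dnames = [t for t in test_dnames if t not in completed]
        random.shuffle(test_dnames)
else:
    random.shuffle(test_dnames)

if num_tests > 0 and not resume:
    test_dnames = test_dnames[:num_tests]

print(test_dnames)  # ['t3', 't4'] -- the --num-tests cap does not apply on resume
```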
@@ -352,7 +432,15 @@ def main(
 
     if threads == 1:
         all_results = []
+        remaining_tests = test_dnames.copy()
+
         for test_path in test_dnames:
+            # Check if we should pause
+            if PAUSE_REQUESTED:
+                print("\nPausing benchmark as requested...")
+                save_checkpoint(dirname, completed_tests, remaining_tests)
+                return 0
+
             results = run_test(
                 original_dname,
                 dirname / test_path,
@@ -373,10 +461,21 @@ def main(
             )
 
             all_results.append(results)
+
+            # Update completed and remaining tests
+            if results:
+                completed_tests.add(test_path)
+                remaining_tests.remove(test_path)
+
+            # Save checkpoint after each test
+            save_checkpoint(dirname, completed_tests, remaining_tests)
+
             summarize_results(dirname)
             if sleep:
                 time.sleep(sleep)
     else:
+        # For threaded execution, we can't easily pause in the middle
+        # So we'll just run all tests and save a checkpoint at the end
         run_test_threaded = lox.thread(threads)(run_test)
         for test_path in test_dnames:
             run_test_threaded.scatter(
@@ -399,11 +498,29 @@ def main(
             )
         all_results = run_test_threaded.gather(tqdm=True)
 
+        # Update completed tests based on results
+        for test_path, result in zip(test_dnames, all_results):
+            if result:
+                completed_tests.add(test_path)
+
     print()
     print()
     print()
     summarize_results(dirname)
+
+    # Save final checkpoint
+    remaining_tests = [t for t in test_dnames if t not in completed_tests]
+    save_checkpoint(dirname, completed_tests, remaining_tests)
+
+    if PAUSE_REQUESTED:
+        print("\nBenchmark paused. To resume, run:")
+        print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}")
+    elif not remaining_tests:
+        print("\nAll tests completed successfully!")
+    else:
+        print(f"\n{len(remaining_tests)} tests were not completed. To resume, run:")
+        print(f"./benchmark/benchmark.py {dirname.name} --resume --model {model} --edit-format {edit_format} --threads {threads}")
+
     return 0
 
@@ -480,6 +597,11 @@ def summarize_results(dirname, stats_languages=None):
 
     passed_tests = [0] * tries
 
+    # Initialize language-specific tracking
+    languages = set()
+    language_tests = defaultdict(int)
+    language_passed = defaultdict(lambda: [0] * tries)
+
     res.completed_tests = 0
     res.duration = 0
     res.cost = 0
@@ -510,6 +632,15 @@ def summarize_results(dirname, stats_languages=None):
             for i in range(len(tests_outcomes) - 1, tries):
                 passed_tests[i] += 1
 
+        # Track language-specific results
+        language = results.get("language")
+        if language:
+            languages.add(language)
+            language_tests[language] += 1
+            if passed:
+                for i in range(len(tests_outcomes) - 1, tries):
+                    language_passed[language][i] += 1
+
         res.cost += results.get("cost", 0)
         res.duration += results.get("duration", 0)
         res.test_timeouts += results.get("test_timeouts", 0)
@@ -587,6 +718,21 @@ def summarize_results(dirname, stats_languages=None):
         pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests
         print(f"  percent_cases_well_formed: {pct_well_formed * 100:.1f}")
 
+    # Display language-specific pass rates
+    if languages:
+        # Process language-specific pass rates without breaking YAML format
+        for language in sorted(languages):
+            if language_tests[language] > 0:
+                # Only print pass rate 2 for each language
+                if tries >= 2:  # Make sure we have at least 2 tries
+                    i = 1  # Index for pass_rate_2 (0-based index)
+                    lang_pass_rate = 100 * language_passed[language][i] / language_tests[language]
+                    print(f"  {language}_pass_rate_2: {lang_pass_rate:.1f}")
+                    # Still store all the data in the result object for potential use in graphs
+                    setattr(res, f"{language}_pass_rate_2", f"{lang_pass_rate:.1f}")
+                    setattr(res, f"{language}_pass_num_2", language_passed[language][i])
+                    setattr(res, f"{language}_tests", language_tests[language])
+
     show("error_outputs")
     show("num_malformed_responses")
     show("num_with_malformed_responses")
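The per-language lines above are indented like the other report keys so the output stays valid YAML. A toy example of what they compute, with illustrative numbers:

```python
tries = 2
language_tests = {"python": 8, "rust": 4}             # tests seen per language
language_passed = {"python": [3, 5], "rust": [1, 3]}  # index 1 corresponds to pass_rate_2

for language in sorted(language_tests):
    lang_pass_rate = 100 * language_passed[language][1] / language_tests[language]
    print(f"  {language}_pass_rate_2: {lang_pass_rate:.1f}")

# Output:
#   python_pass_rate_2: 62.5
#   rust_pass_rate_2: 75.0
```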
@@ -658,9 +804,10 @@ def get_replayed_content(replay_dname, test_dname):
     res = replay_fname.read_text()
     return res
 
-    res = res.splitlines(keepends=True)
-    res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
-    return "".join(res)
+    # Note: The code below is unreachable but kept for reference
+    # res = res.splitlines(keepends=True)
+    # res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
+    # return "".join(res)
 
 
 def run_test(original_dname, testdir, *args, **kwargs):
@@ -939,11 +1086,19 @@ def run_test_real(
         if verbose:
             print(f"Failed to clean up Node.js node_modules directory: {e}")
 
+    # Get language from the testdir path
+    language = None
+    for part in testdir.parts:
+        if part in ["python", "javascript", "java", "cpp", "go", "rust"]:
+            language = part
+            break
+
     results = dict(
         testdir=str(testdir),
         testcase=testdir.name,
         model=main_model.name,
         edit_format=edit_format,
+        language=language,  # Add language information
         tests_outcomes=test_outcomes,
         cost=coder.total_cost,
         duration=dur,
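The language detection above simply scans the components of the test directory path for a known language name. A small sketch with a hypothetical directory layout:

```python
from pathlib import Path

# Hypothetical polyglot-benchmark style path; only the "go" component matters here.
testdir = Path("tmp.benchmarks/example-run/go/exercises/practice/ledger")

language = None
for part in testdir.parts:
    if part in ["python", "javascript", "java", "cpp", "go", "rust"]:
        language = part
        break

print(language)  # go
```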