From 821f7d669445ac83219b13c5693ddb5507c7120c Mon Sep 17 00:00:00 2001
From: Paul Gauthier <paul@aider.chat>
Date: Thu, 19 Dec 2024 07:10:20 -0800
Subject: [PATCH] fix: Use extra_body for reasoning_effort, fix test counts

---
 aider/models.py        |  2 +-
 benchmark/benchmark.py | 38 +++++++++++++++++++++++++-------------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/aider/models.py b/aider/models.py
index c8dfa3ea1..52f9a8e47 100644
--- a/aider/models.py
+++ b/aider/models.py
@@ -773,7 +773,7 @@ MODEL_SETTINGS = [
         use_repo_map=True,
         streaming=False,
         use_temperature=False,
-        extra_params=dict(reasoning_effort="high"),
+        extra_params=dict(extra_body=dict(reasoning_effort="high")),
     ),
     ModelSettings(
         "openrouter/qwen/qwen-2.5-coder-32b-instruct",
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 11541c3c5..1a59e4ac8 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -36,8 +36,6 @@ EXERCISES_DIR_DEFAULT = "exercism-python"
 app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)
 
 
-NUM_TESTS = (89, 133)
-
 load_dotenv(override=True)
 
 
@@ -103,8 +101,10 @@ def show_stats(dirnames, graphs):
         if not row:
             continue
 
-        if row.completed_tests not in NUM_TESTS:
-            print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
+        if row.completed_tests != row.total_tests:
+            print(
+                f"Warning: {row.dir_name} is incomplete: {row.completed_tests} of {row.total_tests}"
+            )
 
         kind = (row.model, row.edit_format)
         if kind in seen:
@@ -509,7 +509,7 @@ def summarize_results(dirname):
         setattr(res, f"pass_num_{i + 1}", passed_tests[i])
 
     print(f"- dirname: {dirname.name}")
-    style = None if res.completed_tests in NUM_TESTS else "red"
+    style = None if res.completed_tests == res.total_tests else "red"
     console.print(f"  test_cases: {res.completed_tests}", style=style)
     for key, val in variants.items():
         if len(val) > 1:
@@ -537,7 +537,7 @@ def summarize_results(dirname):
     show("indentation_errors")
     show("exhausted_context_windows")
     show("test_timeouts")
-    show("total_tests")
+    print(f"  total_tests: {res.total_tests}")
 
     a_model = set(variants["model"]).pop()
     command = f"aider --model {a_model}"
@@ -660,14 +660,25 @@ def run_test_real(
     with open(config_file) as f:
         config = json.loads(f.read())
 
-    # Get solution and test files from config
-    solution_files = set(config.get("files", {}).get("solution", []))
-    solution_files.discard("Cargo.toml")
-
+    # Get file sets from config
     test_files = config.get("files", {}).get("test", [])
+    example_files = config.get("files", {}).get("example", [])
+    solution_files = set(config.get("files", {}).get("solution", []))
 
-    ignore_files = set(["Cargo.toml"])
+    # Forcibly ignore certain files not covered by test_files and example_files
+    ignore_files = set(
+        [
+            "CMakeLists.txt",
+            "Cargo.toml",
+        ]
+    )
+
+    # Also ignore test & example files
     ignore_files.update(test_files)
+    ignore_files.update(example_files)
+
+    # Drop every ignored file from the solution set (discard() only takes one element)
+    solution_files.difference_update(ignore_files)
 
     # Copy all solution files
     for file_path in solution_files:
@@ -868,10 +879,11 @@ def run_unit_tests(original_dname, testdir, history_fname, test_files):
     # Map of file extensions to test commands
     TEST_COMMANDS = {
         ".py": ["pytest"],
-        ".rs": ["cargo", "test", "--offline", "--", "--include-ignored"],
-        ".cs": ["dotnet", "test"],
+        ".rs": ["cargo", "test", "--", "--include-ignored"],
         ".go": ["go", "test", "./..."],
         ".js": ["/aider/benchmark/npm-test.sh"],
+        ".cpp": ["/aider/benchmark/cpp-test.sh"],
+        ".java": ["./gradlew", "test"],
     }
 
     # Get unique file extensions from test files