diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index eba05ff51..e5deb1c7a 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -239,8 +239,8 @@ def main(
     dirnames: List[str] = typer.Argument(..., help="Directory names"),
     model: str = typer.Option("gpt-3.5-turbo", "--model", "-m", help="Model name"),
     edit_format: str = typer.Option(None, "--edit-format", "-e", help="Edit format"),
-    keyword: str = typer.Option(
-        None, "--keyword", "-k", help="Only run tests that contain keyword"
+    keywords: str = typer.Option(
+        None, "--keywords", "-k", help="Only run tests that contain keywords (comma sep)"
     ),
     clean: bool = typer.Option(
         False, "--clean", "-c", help="Discard the existing testdir and make a clean copy"
@@ -311,8 +311,9 @@ def main(
 
     test_dnames = sorted(os.listdir(dirname))
 
-    if keyword:
-        test_dnames = [dn for dn in test_dnames if keyword in dn]
+    if keywords:  # keep each test dir that contains any keyword, listed only once
+        keywords = keywords.split(",")
+        test_dnames = [dn for dn in test_dnames if any(kw in dn for kw in keywords)]
 
     random.shuffle(test_dnames)
     if num_tests > 0:
@@ -367,6 +368,8 @@ def show_diffs(dirnames):
 
     testcases = sorted(testcases)
 
+    unchanged = set()
+
     for testcase in testcases:
         all_outcomes = []
         for dirname in dirnames:
@@ -377,6 +380,7 @@ def show_diffs(dirnames):
             all_outcomes.append(outcomes)
 
         if len(set(all_outcomes)) == 1:
+            unchanged.add(testcase)
             continue
 
         print()
@@ -384,6 +388,11 @@ def show_diffs(dirnames):
         for outcome, dirname in zip(all_outcomes, dirnames):
             print(outcome, f"{dirname}/{testcase}/.aider.chat.history.md")
 
+    changed = set(testcases) - unchanged
+    print()
+    print("changed:", len(changed), ",".join(sorted(changed)))
+    print("unchanged:", len(unchanged), ",".join(sorted(unchanged)))
+
 
 def load_results(dirname):
     dirname = Path(dirname)