diff --git a/benchmark/problem_stats.py b/benchmark/problem_stats.py
index b8718a5ac..8a11237b9 100755
--- a/benchmark/problem_stats.py
+++ b/benchmark/problem_stats.py
@@ -29,7 +29,7 @@ def load_results(dirname):
 
     all_results = []
     parse_errors = []  # Track which exercises had parse errors for this model
-    
+
     # Look in language subdirectories under exercises/practice
     for fname in benchmark_dir.glob("*/exercises/practice/*/.aider.results.json"):
         try:
@@ -38,7 +38,7 @@ def load_results(dirname):
             lang = fname.parts[-5]  # Get language from path
             results["language"] = lang
             all_results.append(results)
-            
+
         except json.JSONDecodeError:
             # Track the parse error for this exercise/model combination
             lang = fname.parts[-5]
@@ -46,7 +46,7 @@ def load_results(dirname):
             parse_errors.append(exercise)
             print(f"Failed to parse {fname}")
             continue
-            
+
     return all_results, parse_errors
 
 
@@ -63,7 +63,7 @@ def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
     # Filter out entries that don't load and sort by pass rate
     valid_entries = []
     parse_errors_by_model = {}  # Track which exercises had parse errors for each model
-    
+
     for dirname, model in dir_entries:
         results_data = load_results(dirname)
         if results_data:
@@ -203,12 +203,14 @@ def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
 
     # Find exercises to disqualify based on parse error threshold
     disqualified_exercises = {
-        exercise for exercise, count in parse_error_counts.items() 
-        if count >= PARSE_ERROR_M
+        exercise for exercise, count in parse_error_counts.items() if count >= PARSE_ERROR_M
     }
 
     if disqualified_exercises:
-        print(f"\nDisqualified {len(disqualified_exercises)} exercises with {PARSE_ERROR_M}+ parse errors:")
+        print(
+            f"\nDisqualified {len(disqualified_exercises)} exercises with {PARSE_ERROR_M}+ parse"
+            " errors:"
+        )
         for ex in sorted(disqualified_exercises):
             print(f"  {ex} ({parse_error_counts[ex]} parse errors)")
 
@@ -216,7 +218,8 @@ def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
     print(f"\nHard Set Analysis (exercises solved by ≤{HARD_SET_NUM} models):")
     print("-" * 60)
     hard_set = {
-        ex for ex, models in exercise_solutions.items() 
+        ex
+        for ex, models in exercise_solutions.items()
         if len(models) <= HARD_SET_NUM and ex not in disqualified_exercises
     }
     print(f"Total hard set exercises: {len(hard_set)}")