diff --git a/benchmark/problem_stats.py b/benchmark/problem_stats.py index 75a76728d..dbd3004ef 100755 --- a/benchmark/problem_stats.py +++ b/benchmark/problem_stats.py @@ -3,6 +3,9 @@ import argparse import json from collections import defaultdict +from typing import List, Optional + +HARD_SET_NUM = 4 # Number of models that defines the hard set threshold from pathlib import Path import yaml @@ -178,10 +181,10 @@ def analyze_exercise_solutions(dirs=None, topn=None): cumsum += count print(f"{i:>6d} {count:>9d} {cumsum:>10d}") - # Collect the hard set (exercises solved by 4 or fewer models) - print("\nHard Set Analysis (exercises solved by ≤4 models):") + # Collect the hard set (exercises solved by HARD_SET_NUM or fewer models) + print(f"\nHard Set Analysis (exercises solved by ≤{HARD_SET_NUM} models):") print("-" * 60) - hard_set = {ex for ex, models in exercise_solutions.items() if len(models) <= 4} + hard_set = {ex for ex, models in exercise_solutions.items() if len(models) <= HARD_SET_NUM} print(f"Total hard set exercises: {len(hard_set)}") # For each model, compute performance on hard set