diff --git a/benchmark/problem_stats.py b/benchmark/problem_stats.py
index 375163b41..b8718a5ac 100755
--- a/benchmark/problem_stats.py
+++ b/benchmark/problem_stats.py
@@ -28,6 +28,8 @@ def load_results(dirname):
             return None
 
     all_results = []
+    parse_errors = []  # Track which exercises had parse errors for this model
+
     # Look in language subdirectories under exercises/practice
     for fname in benchmark_dir.glob("*/exercises/practice/*/.aider.results.json"):
         try:
@@ -36,13 +38,21 @@ def load_results(dirname):
             lang = fname.parts[-5]  # Get language from path
             results["language"] = lang
             all_results.append(results)
+        except json.JSONDecodeError:
+            # Track the parse error for this exercise/model combination
+            lang = fname.parts[-5]
+            exercise = f"{fname.parts[-2]}/{lang}"  # Use directory name as testcase
+            parse_errors.append(exercise)
             print(f"Failed to parse {fname}")
             continue
-    return all_results
+
+    return all_results, parse_errors
 
 
 def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
+    PARSE_ERROR_M = 4  # Threshold for number of parse errors to DQ an exercise
+
     if dirs is None:
         # Use leaderboard data if no directories specified
         dir_entries = get_dirs_from_leaderboard()
@@ -52,9 +62,13 @@ def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
 
     # Filter out entries that don't load and sort by pass rate
     valid_entries = []
+    parse_errors_by_model = {}  # Track which exercises had parse errors for each model
+
     for dirname, model in dir_entries:
-        results = load_results(dirname)
-        if results:
+        results_data = load_results(dirname)
+        if results_data:
+            results, model_parse_errors = results_data
+            parse_errors_by_model[model] = set(model_parse_errors)
             # Calculate pass rate for sorting when using custom dirs
             if dirs is not None:
                 pass_rate = sum(
@@ -181,10 +195,30 @@ def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
         cumsum += count
         print(f"{i:>6d} {count:>9d} {cumsum:>10d}")
 
+    # Count parse errors per exercise
+    parse_error_counts = defaultdict(int)
+    for model_errors in parse_errors_by_model.values():
+        for exercise in model_errors:
+            parse_error_counts[exercise] += 1
+
+    # Find exercises to disqualify based on parse error threshold
+    disqualified_exercises = {
+        exercise for exercise, count in parse_error_counts.items()
+        if count >= PARSE_ERROR_M
+    }
+
+    if disqualified_exercises:
+        print(f"\nDisqualified {len(disqualified_exercises)} exercises with {PARSE_ERROR_M}+ parse errors:")
+        for ex in sorted(disqualified_exercises):
+            print(f"  {ex} ({parse_error_counts[ex]} parse errors)")
+
     # Collect the hard set (exercises solved by HARD_SET_NUM or fewer models)
     print(f"\nHard Set Analysis (exercises solved by ≤{HARD_SET_NUM} models):")
     print("-" * 60)
-    hard_set = {ex for ex, models in exercise_solutions.items() if len(models) <= HARD_SET_NUM}
+    hard_set = {
+        ex for ex, models in exercise_solutions.items()
+        if len(models) <= HARD_SET_NUM and ex not in disqualified_exercises
+    }
     print(f"Total hard set exercises: {len(hard_set)}")
 
     # Count total problems, unsolved problems, and hard set problems by language
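
For context, here is a minimal standalone sketch of how the disqualification logic above interacts with the hard-set filter. It is not part of the patch: the model names, exercise names, and the `HARD_SET_NUM` value are made up for illustration; only `PARSE_ERROR_M = 4` and the two set comprehensions mirror the patched code.

```python
# Illustrative sketch (hypothetical data) of the parse-error disqualification
# and hard-set filtering added in this diff.
from collections import defaultdict

PARSE_ERROR_M = 4  # same threshold as in the patch
HARD_SET_NUM = 3   # hypothetical value, just for this example

# Hypothetical inputs: parse errors per model, and solvers per exercise.
parse_errors_by_model = {
    "model-a": {"anagram/python"},
    "model-b": {"anagram/python"},
    "model-c": {"anagram/python"},
    "model-d": {"anagram/python"},
    "model-e": set(),
}
exercise_solutions = {
    "anagram/python": ["model-e"],             # rarely solved, but noisy results
    "two-fer/python": ["model-a", "model-b"],  # rarely solved, clean results
}

# Count how many models hit a parse error on each exercise.
parse_error_counts = defaultdict(int)
for model_errors in parse_errors_by_model.values():
    for exercise in model_errors:
        parse_error_counts[exercise] += 1

# Disqualify exercises whose results are too noisy to trust.
disqualified_exercises = {
    exercise for exercise, count in parse_error_counts.items()
    if count >= PARSE_ERROR_M
}

# The hard set keeps only rarely solved exercises that were not disqualified.
hard_set = {
    ex for ex, models in exercise_solutions.items()
    if len(models) <= HARD_SET_NUM and ex not in disqualified_exercises
}

print(disqualified_exercises)  # {'anagram/python'}
print(hard_set)                # {'two-fer/python'}
```

With four of the five hypothetical models failing to parse `anagram/python`, that exercise is dropped even though only one model solved it, while `two-fer/python` stays in the hard set.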