diff --git a/benchmark/problem_stats.py b/benchmark/problem_stats.py index adbf9721f..840094cb3 100755 --- a/benchmark/problem_stats.py +++ b/benchmark/problem_stats.py @@ -118,10 +118,12 @@ def analyze_exercise_solutions(dirs=None, topn=None): if exercise not in exercise_solutions: exercise_solutions[exercise] = [] - # Group exercises by language - by_language = defaultdict(list) + # Create list of (language, exercise) pairs with solution stats + exercise_stats = [] + total_models = len(valid_entries) + for testcase in all_exercises: - # Find language for this testcase from results + # Find language for this testcase lang = "unknown" for r in next(iter(valid_entries))[1]: try: @@ -130,26 +132,22 @@ def analyze_exercise_solutions(dirs=None, topn=None): break except KeyError: continue - by_language[lang].append(testcase) + + models = exercise_solutions[testcase] + num_solved = len(models) + percent = (num_solved / total_models) * 100 + exercise_stats.append((lang, testcase, num_solved, percent)) - # Sort languages - sorted_languages = sorted(by_language.keys()) + # Sort all exercises by solve rate + exercise_stats.sort(key=lambda x: x[2], reverse=True) # Calculate max lengths for alignment - max_name_len = max(len(testcase) for testcase in all_exercises) - total_models = len(valid_entries) + max_name_len = max(len(f"{lang}/{ex}") for lang, ex, _, _ in exercise_stats) - # Print exercises grouped by language - for lang in sorted_languages: - print(f"\n{lang.upper()}:") - lang_exercises = [(ex, exercise_solutions[ex]) for ex in by_language[lang]] - # Sort by number of models that solved each exercise - lang_exercises.sort(key=lambda x: len(x[1]), reverse=True) - - for i, (testcase, models) in enumerate(lang_exercises, 1): - num_solved = len(models) - percent = (num_solved / total_models) * 100 - print(f"{i:>3}. {testcase:<{max_name_len}} : {num_solved:>3} solved ({percent:>5.1f}%)") + # Print all exercises sorted by solve rate + print("\nAll Exercises (sorted by solve rate):") + for i, (lang, testcase, num_solved, percent) in enumerate(exercise_stats, 1): + print(f"{i:>3}. {lang}/{testcase:<{max_name_len}} : {num_solved:>3} solved ({percent:>5.1f}%)") print("\nSummary:") solved_at_least_once = len([ex for ex, models in exercise_solutions.items() if models])