diff --git a/benchmark/problem_stats.py b/benchmark/problem_stats.py
index e69de29bb..daa971fc9 100644
--- a/benchmark/problem_stats.py
+++ b/benchmark/problem_stats.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+import yaml
+from pathlib import Path
+from collections import defaultdict
+import json
+
+def load_results(dirname):
+    """Load all result files from a benchmark directory"""
+    dirname = Path(dirname)
+    benchmark_dir = Path("tmp.benchmarks") / dirname
+    if not benchmark_dir.exists():
+        return None
+
+    all_results = []
+    for fname in benchmark_dir.glob("*/.aider.results.json"):
+        try:
+            results = json.loads(fname.read_text())
+            all_results.append(results)
+        except json.JSONDecodeError:
+            print(f"Failed to parse {fname}")
+            continue
+    return all_results
+
+def analyze_exercise_solutions():
+    # Load the leaderboard data
+    with open("aider/website/_data/edit_leaderboard.yml") as f:
+        leaderboard = yaml.safe_load(f)
+
+    # Track which models solved each exercise
+    exercise_solutions = defaultdict(list)
+
+    for entry in leaderboard:
+        dirname = entry["dirname"]
+        model = entry["model"]
+
+        results = load_results(dirname)
+        if not results:
+            print(f"Could not load results for {dirname}")
+            continue
+
+        for result in results:
+            testcase = result.get("testcase")
+            if not testcase:
+                continue
+
+            # Consider it solved if the last test attempt passed
+            tests_outcomes = result.get("tests_outcomes", [])
+            if tests_outcomes and tests_outcomes[-1]:
+                exercise_solutions[testcase].append(model)
+
+    # Print statistics
+    print("\nExercise Solution Statistics:")
+    print("-" * 40)
+
+    # Sort by number of models that solved each exercise
+    sorted_exercises = sorted(
+        exercise_solutions.items(),
+        key=lambda x: len(x[1]),
+        reverse=True
+    )
+
+    for testcase, models in sorted_exercises:
+        print(f"{testcase}: solved by {len(models)} models")
+        #print(f" Models: {', '.join(models)}")
+
+    print("\nSummary:")
+    print(f"Total exercises solved at least once: {len(exercise_solutions)}")
+    never_solved = 133 - len(exercise_solutions)
+    print(f"Never solved by any model: {never_solved}")
+
+    # Distribution of solutions
+    solved_by_counts = defaultdict(int)
+    for models in exercise_solutions.values():
+        solved_by_counts[len(models)] += 1
+
+    print("\nDistribution of solutions:")
+    for count in sorted(solved_by_counts.keys()):
+        print(f"Solved by {count} models: {solved_by_counts[count]} exercises")
+
+if __name__ == "__main__":
+    analyze_exercise_solutions()