aider/benchmark/problem_stats.py

#!/usr/bin/env python
import json
from collections import defaultdict
from pathlib import Path

import yaml


def load_results(dirname):
    """Load all result files from a benchmark directory"""
    dirname = Path(dirname)
    benchmark_dir = Path("tmp.benchmarks") / dirname
    if not benchmark_dir.exists():
        return None

    all_results = []
    for fname in benchmark_dir.glob("*/.aider.results.json"):
        try:
            results = json.loads(fname.read_text())
            all_results.append(results)
        except json.JSONDecodeError:
            print(f"Failed to parse {fname}")
            continue

    return all_results
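

# Usage sketch (comments only; nothing here is executed by this script).
# load_results() returns a list of parsed .aider.results.json dicts for one
# leaderboard run, or None if the run directory is missing. The dirname below
# is hypothetical and must match a directory under tmp.benchmarks/.
#
#     results = load_results("2024-12-17-example-run")
#     if results:
#         passed = [
#             r.get("testcase")
#             for r in results
#             if r.get("tests_outcomes", [])[-1:] == [True]
#         ]
#         print(f"{len(passed)} of {len(results)} testcases passed")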


def analyze_exercise_solutions():
    # Load the leaderboard data
    with open("aider/website/_data/edit_leaderboard.yml") as f:
        leaderboard = yaml.safe_load(f)

    # Track which models solved each exercise
    exercise_solutions = defaultdict(list)

    for entry in leaderboard:
        dirname = entry["dirname"]
        model = entry["model"]

        results = load_results(dirname)
        if not results:
            print(f"Could not load results for {dirname}")
            continue

        for result in results:
            testcase = result.get("testcase")
            if not testcase:
                continue

            # Consider it solved if the last test attempt passed
            tests_outcomes = result.get("tests_outcomes", [])
            if tests_outcomes and tests_outcomes[-1]:
                exercise_solutions[testcase].append(model)

    # Print per-exercise statistics
    print("\nExercise Solution Statistics:")
    print("-" * 40)

    # Sort by number of models that solved each exercise
    sorted_exercises = sorted(exercise_solutions.items(), key=lambda x: len(x[1]), reverse=True)

    for testcase, models in sorted_exercises:
        print(f"{testcase}: {len(models)} solved")

    print("\nSummary:")
    print(f"Total exercises solved at least once: {len(exercise_solutions)}")
    # The benchmark contains 133 exercises in total
    never_solved = 133 - len(exercise_solutions)
    print(f"Never solved by any model: {never_solved}")


if __name__ == "__main__":
    analyze_exercise_solutions()
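

# For reference, a sketch of the only two fields this script reads from each
# .aider.results.json record (values shown are hypothetical, and real records
# contain additional fields that are ignored here):
#
#     {
#         "testcase": "example-exercise",
#         "tests_outcomes": [false, true]   # one boolean per test attempt
#     }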