style: Run linter on benchmark script

This commit is contained in:
Paul Gauthier (aider) 2024-12-17 14:06:56 -08:00
parent 9cc674c283
commit 7bfc2e0e74

View file

@ -1,9 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
import yaml
from pathlib import Path
from collections import defaultdict
import json import json
from collections import defaultdict
from pathlib import Path
import yaml
def load_results(dirname): def load_results(dirname):
"""Load all result files from a benchmark directory""" """Load all result files from a benchmark directory"""
@ -11,7 +13,7 @@ def load_results(dirname):
benchmark_dir = Path("tmp.benchmarks") / dirname benchmark_dir = Path("tmp.benchmarks") / dirname
if not benchmark_dir.exists(): if not benchmark_dir.exists():
return None return None
all_results = [] all_results = []
for fname in benchmark_dir.glob("*/.aider.results.json"): for fname in benchmark_dir.glob("*/.aider.results.json"):
try: try:
@ -22,61 +24,59 @@ def load_results(dirname):
continue continue
return all_results return all_results
def analyze_exercise_solutions(total_exercises=133):
    """Report how many models solved each benchmark exercise.

    Reads the edit leaderboard YAML, loads each entry's benchmark results
    via ``load_results``, and prints per-exercise solve counts plus summary
    statistics (never-solved count and a distribution of solve counts).

    Args:
        total_exercises: Total number of exercises in the benchmark suite,
            used to compute how many were never solved by any model.
            Defaults to 133 (the previously hard-coded suite size).

    Returns:
        None. All output is printed to stdout.
    """
    # Load the leaderboard data
    with open("aider/website/_data/edit_leaderboard.yml") as f:
        leaderboard = yaml.safe_load(f)

    # Track which models solved each exercise
    exercise_solutions = defaultdict(list)

    for entry in leaderboard:
        dirname = entry["dirname"]
        model = entry["model"]

        results = load_results(dirname)
        if not results:
            # Missing/empty results dir; note it and keep going.
            print(f"Could not load results for {dirname}")
            continue

        for result in results:
            testcase = result.get("testcase")
            if not testcase:
                continue

            # Consider it solved if the last test attempt passed
            tests_outcomes = result.get("tests_outcomes", [])
            if tests_outcomes and tests_outcomes[-1]:
                exercise_solutions[testcase].append(model)

    # Print statistics
    print("\nExercise Solution Statistics:")
    print("-" * 40)

    # Sort by number of models that solved each exercise
    sorted_exercises = sorted(exercise_solutions.items(), key=lambda x: len(x[1]), reverse=True)

    for testcase, models in sorted_exercises:
        print(f"{testcase}: solved by {len(models)} models")
        # print(f"  Models: {', '.join(models)}")

    print("\nSummary:")
    print(f"Total exercises solved at least once: {len(exercise_solutions)}")
    never_solved = total_exercises - len(exercise_solutions)
    print(f"Never solved by any model: {never_solved}")

    # Distribution of solutions
    solved_by_counts = defaultdict(int)
    for models in exercise_solutions.values():
        solved_by_counts[len(models)] += 1

    print("\nDistribution of solutions:")
    for count in sorted(solved_by_counts.keys()):
        print(f"Solved by {count} models: {solved_by_counts[count]} exercises")
# Script entry point: run the benchmark analysis when invoked directly,
# but stay import-safe for use as a module.
if __name__ == "__main__":
    analyze_exercise_solutions()