aider/benchmark/problem_stats.py
#!/usr/bin/env python
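"""Summarize how often each benchmark exercise was solved across the runs listed in the edit leaderboard.

Each run directory under tmp.benchmarks/ is expected to hold one subdirectory per
exercise containing a .aider.results.json file with at least a "testcase" name and
a "tests_outcomes" list of booleans (one entry per test attempt).
"""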
import argparse
import json
from collections import defaultdict
from pathlib import Path

import yaml


def load_results(dirname):
    """Load all .aider.results.json files from a benchmark run directory.

    Returns None if the run directory does not exist; result files that fail to
    parse are reported and skipped.
    """
    dirname = Path(dirname)
    benchmark_dir = Path("tmp.benchmarks") / dirname
    if not benchmark_dir.exists():
        return None

    all_results = []
    for fname in benchmark_dir.glob("*/.aider.results.json"):
        try:
            results = json.loads(fname.read_text())
            all_results.append(results)
        except json.JSONDecodeError:
            print(f"Failed to parse {fname}")
            continue

    return all_results


def analyze_exercise_solutions(topn=None):
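    """Print per-exercise solve statistics across the leaderboard runs,
    optionally limited to the top N models by pass rate."""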
    # Load the leaderboard data
    with open("aider/website/_data/edit_leaderboard.yml") as f:
        leaderboard = yaml.safe_load(f)

    # Filter out entries that don't load and sort by pass rate
    valid_entries = []
    for entry in leaderboard:
        dirname = entry["dirname"]
        results = load_results(dirname)
        if results:
            valid_entries.append((entry, results))

    # Sort by pass rate and take top N if specified
    valid_entries.sort(key=lambda x: float(x[0].get("pass_rate_2", 0)), reverse=True)
    if topn:
        valid_entries = valid_entries[:topn]

    # Unpack the filtered and sorted entries
    leaderboard = [entry for entry, _ in valid_entries]
    # Get all exercise names from a complete run (one covering all 133 exercises)
    all_exercises = set()
    exercise_solutions = defaultdict(list)

    for entry in leaderboard:
        dirname = entry["dirname"]
        results = load_results(dirname)
        if results and len(results) == 133:  # complete run
            all_exercises = {result["testcase"] for result in results}
            break
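
    # Record which models solved each exercise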
    for entry in leaderboard:
        dirname = entry["dirname"]
        model = entry["model"]
        results = load_results(dirname)

        if not results:
            print(f"Could not load results for {dirname}")
            continue

        for result in results:
            testcase = result.get("testcase")
            if not testcase:
                continue

            # Consider it solved if the last test attempt passed
            tests_outcomes = result.get("tests_outcomes", [])
            if tests_outcomes and tests_outcomes[-1]:
                exercise_solutions[testcase].append(model)
    # Print per-exercise statistics
    print("\nExercise Solution Statistics:")
    print("-" * 40)

    # Add exercises that were never solved
    for exercise in all_exercises:
        if exercise not in exercise_solutions:
            exercise_solutions[exercise] = []

    # Sort by number of models that solved each exercise
    sorted_exercises = sorted(exercise_solutions.items(), key=lambda x: len(x[1]), reverse=True)

    # Calculate max name length for alignment (exercise_solutions now covers every tracked exercise)
    max_name_len = max(len(testcase) for testcase in exercise_solutions)
    total_models = len(leaderboard)

    for testcase, models in sorted_exercises:
        num_solved = len(models)
        percent = (num_solved / total_models) * 100
        print(f"{testcase:<{max_name_len}} : {num_solved:>3} solved ({percent:>5.1f}%)")
print("\nSummary:")
print(f"Total exercises solved at least once: {len(exercise_solutions)}")
never_solved = 133 - len(exercise_solutions)
print(f"Never solved by any model: {never_solved}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--topn", type=int, help="Only consider top N models by pass rate")
    args = parser.parse_args()
    analyze_exercise_solutions(args.topn)
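
# Example invocation, assuming this script lives at benchmark/problem_stats.py and is run
# from the repository root so the relative tmp.benchmarks/ and aider/website/ paths resolve:
#
#   python benchmark/problem_stats.py --topn 10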