feat: Allow specifying dirs on cmd line for problem_stats
parent 2aa4615c78
commit a19f1fbc67
1 changed file with 29 additions and 21 deletions
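
For context, a hypothetical usage sketch of the two call paths this commit enables. The module name problem_stats and the directory names below are assumptions for illustration; this page does not show the file path:

    from problem_stats import analyze_exercise_solutions  # assumed module name

    # No dirs given: fall back to the leaderboard entries, keep the top 5 by pass rate.
    analyze_exercise_solutions(topn=5)

    # Explicit dirs: each dirname doubles as the model label in the report.
    analyze_exercise_solutions(dirs=["tmp.benchmarks/run-a", "tmp.benchmarks/run-b"])

From the shell, the equivalent would presumably be: python problem_stats.py tmp.benchmarks/run-a tmp.benchmarks/run-b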
@@ -8,6 +8,13 @@ from pathlib import Path
 import yaml
 
 
+def get_dirs_from_leaderboard():
+    # Load the leaderboard data
+    with open("aider/website/_data/edit_leaderboard.yml") as f:
+        leaderboard = yaml.safe_load(f)
+    return [(entry["dirname"], entry["model"]) for entry in leaderboard]
+
+
 def load_results(dirname):
     """Load all result files from a benchmark directory"""
     dirname = Path(dirname)
@@ -26,44 +33,44 @@ def load_results(dirname):
     return all_results
 
 
-def analyze_exercise_solutions(topn=None):
-    # Load the leaderboard data
-    with open("aider/website/_data/edit_leaderboard.yml") as f:
-        leaderboard = yaml.safe_load(f)
+def analyze_exercise_solutions(dirs=None, topn=None):
+    if dirs is None:
+        # Use leaderboard data if no directories specified
+        dir_entries = get_dirs_from_leaderboard()
+    else:
+        # Use provided directories, with dirname as model name
+        dir_entries = [(d, d) for d in dirs]
 
     # Filter out entries that don't load and sort by pass rate
     valid_entries = []
-    for entry in leaderboard:
-        dirname = entry["dirname"]
+    for dirname, model in dir_entries:
         results = load_results(dirname)
         if results:
-            valid_entries.append((entry, results))
+            # Calculate pass rate for sorting when using custom dirs
+            if dirs is not None:
+                pass_rate = sum(1 for r in results if r.get("tests_outcomes", []) and r["tests_outcomes"][-1]) / len(results)
+            else:
+                # Use existing pass rate from leaderboard
+                pass_rate = next((entry["pass_rate_2"] for entry in yaml.safe_load(open("aider/website/_data/edit_leaderboard.yml"))
+                                  if entry["dirname"] == dirname), 0)
+            valid_entries.append(((dirname, model), results, float(pass_rate)))
 
     # Sort by pass rate and take top N if specified
-    valid_entries.sort(key=lambda x: float(x[0].get("pass_rate_2", 0)), reverse=True)
+    valid_entries.sort(key=lambda x: x[2], reverse=True)
    if topn:
         valid_entries = valid_entries[:topn]
 
-    # Unpack the filtered and sorted entries
-    leaderboard = [entry for entry, _ in valid_entries]
-
     # Get all exercise names from a complete run
     all_exercises = set()
     exercise_solutions = defaultdict(list)
 
     # Find a complete run to get all exercise names
-    for entry in leaderboard:
-        dirname = entry["dirname"]
-        results = load_results(dirname)
+    for (dirname, model), results, _ in valid_entries:
         if results and len(results) == 133:  # Complete run
             all_exercises = {result["testcase"] for result in results}
             break
 
-    for entry in leaderboard:
-        dirname = entry["dirname"]
-        model = entry["model"]
-
-        results = load_results(dirname)
+    for (dirname, model), results, _ in valid_entries:
         if not results:
             print(f"Could not load results for {dirname}")
             continue
@@ -95,7 +102,7 @@ def analyze_exercise_solutions(topn=None):
 
     # Calculate max length for alignment
     max_name_len = max(len(testcase) for testcase in all_exercises)
-    total_models = len(leaderboard)
+    total_models = len(valid_entries)
 
     for i, (testcase, models) in enumerate(sorted_exercises, 1):
         num_solved = len(models)
@@ -111,6 +118,7 @@ def analyze_exercise_solutions(topn=None):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--topn", type=int, help="Only consider top N models by pass rate")
+    parser.add_argument("dirs", nargs="*", help="Directories to analyze (optional, defaults to leaderboard entries)")
     args = parser.parse_args()
 
-    analyze_exercise_solutions(args.topn)
+    analyze_exercise_solutions(args.dirs if args.dirs else None, args.topn)
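
The densest new code is the pass-rate lookup inside the filtering loop. Below is a minimal sketch of the same logic written out as a standalone helper, for readability only; the helper name pass_rate_for is hypothetical, while the YAML path, the tests_outcomes convention, and the pass_rate_2 field all come from the diff itself:

    import yaml

    def pass_rate_for(dirname, results, dirs):
        # Custom dirs have no leaderboard entry, so derive the rate from the
        # result files: a test counts as passed when the final attempt
        # recorded in tests_outcomes is True.
        if dirs is not None:
            passed = sum(1 for r in results if r.get("tests_outcomes", []) and r["tests_outcomes"][-1])
            return passed / len(results)
        # Leaderboard runs reuse the precomputed pass_rate_2 field.
        with open("aider/website/_data/edit_leaderboard.yml") as f:
            leaderboard = yaml.safe_load(f)
        return next((e["pass_rate_2"] for e in leaderboard if e["dirname"] == dirname), 0)

Unlike the inline version in the diff, this sketch opens the leaderboard file in a with block rather than a bare open() inside the generator, so the handle is closed deterministically; it also rereads the YAML on every call, which a caller could avoid by loading it once up front.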