mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-20 12:24:59 +00:00
feat: Disqualify exercises with >=4 parse errors
This commit is contained in:
parent
5a0d4eff71
commit
14a8759b82
1 changed files with 38 additions and 4 deletions
|
@ -28,6 +28,8 @@ def load_results(dirname):
|
|||
return None
|
||||
|
||||
all_results = []
|
||||
parse_errors = [] # Track which exercises had parse errors for this model
|
||||
|
||||
# Look in language subdirectories under exercises/practice
|
||||
for fname in benchmark_dir.glob("*/exercises/practice/*/.aider.results.json"):
|
||||
try:
|
||||
|
@ -36,13 +38,21 @@ def load_results(dirname):
|
|||
lang = fname.parts[-5] # Get language from path
|
||||
results["language"] = lang
|
||||
all_results.append(results)
|
||||
|
||||
except json.JSONDecodeError:
|
||||
# Track the parse error for this exercise/model combination
|
||||
lang = fname.parts[-5]
|
||||
exercise = f"{fname.parts[-2]}/{lang}" # Use directory name as testcase
|
||||
parse_errors.append(exercise)
|
||||
print(f"Failed to parse {fname}")
|
||||
continue
|
||||
return all_results
|
||||
|
||||
return all_results, parse_errors
|
||||
|
||||
|
||||
def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
|
||||
PARSE_ERROR_M = 4 # Threshold for number of parse errors to DQ an exercise
|
||||
|
||||
if dirs is None:
|
||||
# Use leaderboard data if no directories specified
|
||||
dir_entries = get_dirs_from_leaderboard()
|
||||
|
@ -52,9 +62,13 @@ def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
|
|||
|
||||
# Filter out entries that don't load and sort by pass rate
|
||||
valid_entries = []
|
||||
parse_errors_by_model = {} # Track which exercises had parse errors for each model
|
||||
|
||||
for dirname, model in dir_entries:
|
||||
results = load_results(dirname)
|
||||
if results:
|
||||
results_data = load_results(dirname)
|
||||
if results_data:
|
||||
results, model_parse_errors = results_data
|
||||
parse_errors_by_model[model] = set(model_parse_errors)
|
||||
# Calculate pass rate for sorting when using custom dirs
|
||||
if dirs is not None:
|
||||
pass_rate = sum(
|
||||
|
@ -181,10 +195,30 @@ def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False):
|
|||
cumsum += count
|
||||
print(f"{i:>6d} {count:>9d} {cumsum:>10d}")
|
||||
|
||||
# Count parse errors per exercise
|
||||
parse_error_counts = defaultdict(int)
|
||||
for model_errors in parse_errors_by_model.values():
|
||||
for exercise in model_errors:
|
||||
parse_error_counts[exercise] += 1
|
||||
|
||||
# Find exercises to disqualify based on parse error threshold
|
||||
disqualified_exercises = {
|
||||
exercise for exercise, count in parse_error_counts.items()
|
||||
if count >= PARSE_ERROR_M
|
||||
}
|
||||
|
||||
if disqualified_exercises:
|
||||
print(f"\nDisqualified {len(disqualified_exercises)} exercises with {PARSE_ERROR_M}+ parse errors:")
|
||||
for ex in sorted(disqualified_exercises):
|
||||
print(f" {ex} ({parse_error_counts[ex]} parse errors)")
|
||||
|
||||
# Collect the hard set (exercises solved by HARD_SET_NUM or fewer models)
|
||||
print(f"\nHard Set Analysis (exercises solved by ≤{HARD_SET_NUM} models):")
|
||||
print("-" * 60)
|
||||
hard_set = {ex for ex, models in exercise_solutions.items() if len(models) <= HARD_SET_NUM}
|
||||
hard_set = {
|
||||
ex for ex, models in exercise_solutions.items()
|
||||
if len(models) <= HARD_SET_NUM and ex not in disqualified_exercises
|
||||
}
|
||||
print(f"Total hard set exercises: {len(hard_set)}")
|
||||
|
||||
# Count total problems, unsolved problems, and hard set problems by language
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue