diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 9f1ff2b4c..6b7417c5d 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -506,6 +506,7 @@ def summarize_results(dirname): percents[i] = pass_rate # console.print(f"{pass_rate:.1f}% correct after try {i+1}") setattr(res, f"pass_rate_{i + 1}", f"{pass_rate:.1f}") + setattr(res, f"pass_num_{i + 1}", passed_tests[i]) print(f"- dirname: {dirname.name}") style = None if res.completed_tests in NUM_TESTS else "red" @@ -521,6 +522,8 @@ def summarize_results(dirname): for i in range(tries): print(f" pass_rate_{i + 1}: {percents[i]:.1f}") + for i in range(tries): + print(f" pass_num_{i + 1}: {passed_tests[i]}") pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests print(f" percent_cases_well_formed: {pct_well_formed * 100:.1f}") @@ -661,6 +664,9 @@ def run_test_real( test_files = config.get("files", {}).get("test", []) + ignore_files = set(["Cargo.toml"]) + ignore_files.update(test_files) + # Copy all solution files for file_path in solution_files: src = testdir / Path(file_path) @@ -746,7 +752,7 @@ def run_test_real( # auto_lint=False, # disabled for code-in-json experiments cache_prompts=True, suggest_shell_commands=False, - ignore_mentions=set(test_files), + ignore_mentions=ignore_files, ) dump(coder.ignore_mentions) @@ -852,7 +858,7 @@ def run_test_real( def run_unit_tests(original_dname, testdir, history_fname, test_files): - timeout = 60 + timeout = 60 * 3 # Map of file extensions to test commands TEST_COMMANDS = { diff --git a/benchmark/problem_stats.py b/benchmark/problem_stats.py index f69578629..ca4e48ed9 100755 --- a/benchmark/problem_stats.py +++ b/benchmark/problem_stats.py @@ -201,13 +201,13 @@ def analyze_exercise_solutions(dirs=None, topn=None, copy_hard_set=False): lang_hard_set[lang] += 1 print("\nUnsolved and hard set problems by language:") - print(f"{'Language':<12} {'Unsolved':>8} {'Hard Set':>9} {'Total':>7} {'Percent':>8}") + print(f"{'Language':<12} {'Unsolved':>8} {'Hard Set':>9} {'Total':>7} {'%hardUnsolved':>8}") print("-" * 47) for lang in sorted(lang_totals.keys()): count = lang_unsolved[lang] hard = lang_hard_set[lang] total = lang_totals[lang] - pct = (count / total) * 100 + pct = (count / hard) * 100 print(f"{lang:<12} {count:>8} {hard:>9} {total:>7} {pct:>7.1f}%") print()