diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index a729e0c19..6261e00f9 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -317,6 +317,7 @@ def summarize_results(dirname): res.test_timeouts = 0 res.exhausted_context_windows = 0 res.num_malformed_responses = 0 + res.num_with_malformed_responses = 0 res.syntax_errors = 0 res.indentation_errors = 0 res.lazy_comments = 0 @@ -342,6 +343,8 @@ def summarize_results(dirname): res.user_asks += results.get("num_user_asks", 0) res.exhausted_context_windows += results.get("num_exhausted_context_windows", 0) res.num_malformed_responses += results.get("num_malformed_responses", 0) + if results.get("num_malformed_responses"): + res.num_with_malformed_responses += 1 res.lazy_comments += results.get("lazy_comments", 0) res.syntax_errors += results.get("syntax_errors", 0) @@ -392,11 +395,12 @@ def summarize_results(dirname): for i in range(tries): print(f" pass_rate_{i+1}: {percents[i]:.1f}") - pct_well_formed = 1.0 - res.num_malformed_responses / res.completed_tests + pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests print(f" percent_cases_well_formed: {pct_well_formed*100:.1f}") show("error_outputs") show("num_malformed_responses") + show("num_with_malformed_responses") show("user_asks") show("lazy_comments") show("syntax_errors")