From 6fe4e0497a8a2cbbdb65f96b9888b0244d3e1800 Mon Sep 17 00:00:00 2001 From: Michal Mikolas Date: Fri, 11 Apr 2025 15:25:20 +0200 Subject: [PATCH 1/2] Benchmark: Improved stats, now also printing stats for each individual test above the benchmark summary. --- benchmark/benchmark.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index f05c4b039..ec5a239dd 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -456,6 +456,47 @@ def load_results(dirname, stats_languages=None): def summarize_results(dirname, stats_languages=None): all_results = load_results(dirname, stats_languages) + # Print summary for each individual test + column_names = { + # key: name-for-table + 'testdir': 'testdir', + 'tests_outcomes': 'pass/fail', + 'test_timeouts': 'timeouts', + 'syntax_errors': 'syn_err', + 'num_user_asks': 'user_asks', + 'num_malformed_responses': 'malformed', + 'num_exhausted_context_windows': 'exhausted', + 'num_error_outputs': 'error', + 'lazy_comments': 'lazy', + 'indentation_errors': 'ind_err', + } + + table_data = {} + for result in all_results: + for column_key, column_name in column_names.items(): + if column_name not in table_data: + table_data[column_name] = [] + + value = result[column_key] if column_key in result else '' + if column_key == 'testdir': + value = re.sub(r'^.+((/[^/]+){4})$', '\\1', value).strip('/') # shorten the long path to fit into the console + value = value.replace('exercises/practice', '...') + if column_key == 'tests_outcomes': + value = ', '.join([('P' if v else 'f') for v in value]) # Pass or Fail + + table_data[column_name].append(value) + + df = pd.DataFrame(table_data) + df.index = df.index + 1 # Print index starting from 1 + print(df.to_string( + justify='left', # align left for HEADER + formatters={ # align left for string VALUES must be handled like this + 'testdir': lambda x: str(x).ljust( max(df['testdir'].astype(str).map(len).max(), len('testdir')) ), + 'pass/fail': lambda x: str(x).ljust( max(df['pass/fail'].astype(str).map(len).max(), len('pass/fail')) ), + } + )) + + # Print overall summary for whole benchmark res = SimpleNamespace() res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*"))) From 7928820d186e85d67ccbb319cffc7e27817c6f8a Mon Sep 17 00:00:00 2001 From: Michal Mikolas Date: Sat, 19 Apr 2025 00:31:29 +0200 Subject: [PATCH 2/2] Benchmark: Improved stats, now also printing stats for each language above the benchmark summary. --- benchmark/benchmark.py | 75 ++++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 17 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index ec5a239dd..109113238 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -456,7 +456,7 @@ def load_results(dirname, stats_languages=None): def summarize_results(dirname, stats_languages=None): all_results = load_results(dirname, stats_languages) - # Print summary for each individual test + # Each test & Each language summary column_names = { # key: name-for-table 'testdir': 'testdir', @@ -471,32 +471,73 @@ def summarize_results(dirname, stats_languages=None): 'indentation_errors': 'ind_err', } - table_data = {} + # Tests data + tests_data = {} for result in all_results: for column_key, column_name in column_names.items(): - if column_name not in table_data: - table_data[column_name] = [] + if column_name not in tests_data: + tests_data[column_name] = [] value = result[column_key] if column_key in result else '' if column_key == 'testdir': - value = re.sub(r'^.+((/[^/]+){4})$', '\\1', value).strip('/') # shorten the long path to fit into the console + value = re.sub(r'^.+((/[^/]+){4})$', '\\1', value).strip('/') value = value.replace('exercises/practice', '...') if column_key == 'tests_outcomes': - value = ', '.join([('P' if v else 'f') for v in value]) # Pass or Fail + value = ', '.join([('P' if v else 'f') for v in value]) - table_data[column_name].append(value) + tests_data[column_name].append(value) - df = pd.DataFrame(table_data) - df.index = df.index + 1 # Print index starting from 1 - print(df.to_string( - justify='left', # align left for HEADER - formatters={ # align left for string VALUES must be handled like this - 'testdir': lambda x: str(x).ljust( max(df['testdir'].astype(str).map(len).max(), len('testdir')) ), - 'pass/fail': lambda x: str(x).ljust( max(df['pass/fail'].astype(str).map(len).max(), len('pass/fail')) ), - } - )) + # Languages data + langs_data = {} + for column_name in tests_data.keys(): + if column_name not in langs_data: + langs_data[column_name] = [] + + sum1, sum2 = 0, 0 + for i, column_value in enumerate(tests_data[column_name]): + is_next_same_lang = tests_data['testdir'][i].split('/')[0] == tests_data['testdir'][i+1].split('/')[0] if (i + 1) < len(tests_data['testdir']) else False - # Print overall summary for whole benchmark + if column_name == 'testdir': + if not is_next_same_lang: + langs_data[column_name].append( column_value.split('/')[0] + '/...' ) + + elif column_name == 'pass/fail': + sum1 += 1 if column_value[-1] == 'P' else 0 + sum2 += 1 if column_value[-1] == 'f' else 0 + if not is_next_same_lang: + langs_data[column_name].append(f'{sum1} / {sum2}') + sum1, sum2 = 0, 0 + + else: + sum1 += column_value + if not is_next_same_lang: + langs_data[column_name].append(sum1) + sum1 = 0 + + # Print + tests_data_df = pd.DataFrame(tests_data) + tests_data_df.index = tests_data_df.index + 1 # Print index starting from 1 + langs_data_df = pd.DataFrame(langs_data) + langs_data_df.index = langs_data_df.index + 1 # Print index starting from 1 + print( + '\n\n' + tests_data_df.to_string( + justify='left', # align left for HEADER + formatters={ # align left for string VALUES must be handled like this + 'testdir': lambda x: str(x).ljust( max(tests_data_df['testdir'].astype(str).map(len).max(), len('testdir')) ), + 'pass/fail': lambda x: str(x).ljust( max(tests_data_df['pass/fail'].astype(str).map(len).max(), len('pass/fail')) ), + } + ) + + '\n\n' + langs_data_df.to_string( + justify='left', # align left for HEADER + formatters={ # align left for string VALUES must be handled like this + 'testdir': lambda x: str(x).ljust( max(langs_data_df['testdir'].astype(str).map(len).max(), len('testdir')) ), + 'pass/fail': lambda x: str(x).ljust( max(langs_data_df['pass/fail'].astype(str).map(len).max(), len('pass/fail')) ), + } + ) + + '\n' + ) + + # Overall summary for whole benchmark res = SimpleNamespace() res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*")))