From 6fe4e0497a8a2cbbdb65f96b9888b0244d3e1800 Mon Sep 17 00:00:00 2001
From: Michal Mikolas <nanuqcz@gmail.com>
Date: Fri, 11 Apr 2025 15:25:20 +0200
Subject: [PATCH 1/2] Benchmark: Improve stats by also printing stats for each
 individual test above the benchmark summary.

---
 benchmark/benchmark.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index f05c4b039..ec5a239dd 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -456,6 +456,47 @@ def load_results(dirname, stats_languages=None):
 def summarize_results(dirname, stats_languages=None):
     all_results = load_results(dirname, stats_languages)
 
+    # Print summary for each individual test
+    column_names = {
+        # key: name-for-table
+        'testdir': 'testdir',
+        'tests_outcomes': 'pass/fail',
+        'test_timeouts': 'timeouts',
+        'syntax_errors': 'syn_err',
+        'num_user_asks': 'user_asks',
+        'num_malformed_responses': 'malformed',
+        'num_exhausted_context_windows': 'exhausted',
+        'num_error_outputs': 'error',
+        'lazy_comments': 'lazy',
+        'indentation_errors': 'ind_err',
+    }
+
+    table_data = {}
+    for result in all_results:
+        for column_key, column_name in column_names.items():
+            if column_name not in table_data:
+                table_data[column_name] = []
+            
+            value = result[column_key] if column_key in result else ''
+            if column_key == 'testdir':
+                value = re.sub(r'^.+((/[^/]+){4})$', '\\1', value).strip('/')  # shorten the long path to fit into the console
+                value = value.replace('exercises/practice', '...')
+            if column_key == 'tests_outcomes':
+                value = ', '.join([('P' if v else 'f') for v in value])  # Pass or Fail
+                
+            table_data[column_name].append(value)
+
+    df = pd.DataFrame(table_data)
+    df.index = df.index + 1  # Print index starting from 1
+    print(df.to_string(
+        justify='left',  # align left for HEADER
+        formatters={     # align left for string VALUES must be handled like this
+            'testdir': lambda x: str(x).ljust( max(df['testdir'].astype(str).map(len).max(), len('testdir')) ),
+            'pass/fail': lambda x: str(x).ljust( max(df['pass/fail'].astype(str).map(len).max(), len('pass/fail')) ),
+        }
+    ))
+
+    # Print overall summary for whole benchmark
     res = SimpleNamespace()
     res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*")))
 

From 7928820d186e85d67ccbb319cffc7e27817c6f8a Mon Sep 17 00:00:00 2001
From: Michal Mikolas <nanuqcz@gmail.com>
Date: Sat, 19 Apr 2025 00:31:29 +0200
Subject: [PATCH 2/2] Benchmark: Improve stats by also printing stats for each
 language above the benchmark summary.

---
 benchmark/benchmark.py | 75 ++++++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index ec5a239dd..109113238 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -456,7 +456,7 @@ def load_results(dirname, stats_languages=None):
 def summarize_results(dirname, stats_languages=None):
     all_results = load_results(dirname, stats_languages)
 
-    # Print summary for each individual test
+    # Per-test and per-language summaries
     column_names = {
         # key: name-for-table
         'testdir': 'testdir',
@@ -471,32 +471,73 @@ def summarize_results(dirname, stats_languages=None):
         'indentation_errors': 'ind_err',
     }
 
-    table_data = {}
+    # Per-test data: one list of values per table column, one entry per result
+    tests_data = {}
     for result in all_results:
         for column_key, column_name in column_names.items():
-            if column_name not in table_data:
-                table_data[column_name] = []
+            if column_name not in tests_data:
+                tests_data[column_name] = []
             
             value = result[column_key] if column_key in result else ''
             if column_key == 'testdir':
-                value = re.sub(r'^.+((/[^/]+){4})$', '\\1', value).strip('/')  # shorten the long path to fit into the console
+                value = re.sub(r'^.+((/[^/]+){4})$', '\\1', value).strip('/')
                 value = value.replace('exercises/practice', '...')
             if column_key == 'tests_outcomes':
-                value = ', '.join([('P' if v else 'f') for v in value])  # Pass or Fail
+                value = ', '.join([('P' if v else 'f') for v in value])
                 
-            table_data[column_name].append(value)
+            tests_data[column_name].append(value)
 
-    df = pd.DataFrame(table_data)
-    df.index = df.index + 1  # Print index starting from 1
-    print(df.to_string(
-        justify='left',  # align left for HEADER
-        formatters={     # align left for string VALUES must be handled like this
-            'testdir': lambda x: str(x).ljust( max(df['testdir'].astype(str).map(len).max(), len('testdir')) ),
-            'pass/fail': lambda x: str(x).ljust( max(df['pass/fail'].astype(str).map(len).max(), len('pass/fail')) ),
-        }
-    ))
+    # Per-language data: per-test values aggregated over consecutive rows sharing the same language prefix
+    langs_data = {}
+    for column_name in tests_data.keys():
+        if column_name not in langs_data:
+            langs_data[column_name] = []
+    
+        sum1, sum2 = 0, 0
+        for i, column_value in enumerate(tests_data[column_name]):
+            is_next_same_lang = tests_data['testdir'][i].split('/')[0] == tests_data['testdir'][i+1].split('/')[0] if (i + 1) < len(tests_data['testdir']) else False
 
-    # Print overall summary for whole benchmark
+            if column_name == 'testdir':
+                if not is_next_same_lang:
+                    langs_data[column_name].append( column_value.split('/')[0] + '/...' )
+
+            elif column_name == 'pass/fail':
+                sum1 += 1 if column_value[-1] == 'P' else 0
+                sum2 += 1 if column_value[-1] == 'f' else 0
+                if not is_next_same_lang:
+                    langs_data[column_name].append(f'{sum1} / {sum2}')
+                    sum1, sum2 = 0, 0
+
+            else:
+                sum1 += column_value
+                if not is_next_same_lang:
+                    langs_data[column_name].append(sum1)
+                    sum1 = 0
+
+    # Print the per-test table followed by the per-language table
+    tests_data_df = pd.DataFrame(tests_data)
+    tests_data_df.index = tests_data_df.index + 1  # Print index starting from 1
+    langs_data_df = pd.DataFrame(langs_data)
+    langs_data_df.index = langs_data_df.index + 1  # Print index starting from 1
+    print(
+        '\n\n' + tests_data_df.to_string(
+            justify='left',  # align left for HEADER
+            formatters={     # align left for string VALUES must be handled like this
+                'testdir': lambda x: str(x).ljust( max(tests_data_df['testdir'].astype(str).map(len).max(), len('testdir')) ),
+                'pass/fail': lambda x: str(x).ljust( max(tests_data_df['pass/fail'].astype(str).map(len).max(), len('pass/fail')) ),
+            }
+        ) +
+        '\n\n' + langs_data_df.to_string(
+            justify='left',  # align left for HEADER
+            formatters={     # align left for string VALUES must be handled like this
+                'testdir': lambda x: str(x).ljust( max(langs_data_df['testdir'].astype(str).map(len).max(), len('testdir')) ),
+                'pass/fail': lambda x: str(x).ljust( max(langs_data_df['pass/fail'].astype(str).map(len).max(), len('pass/fail')) ),
+            }
+        ) +
+        '\n'
+    )
+
+    # Overall summary for whole benchmark
     res = SimpleNamespace()
     res.total_tests = len(list(Path(dirname).glob("*/exercises/practice/*")))