Add enhanced benchmark metrics including API calls, retries, and language-specific pass rates

AJ 2025-04-23 19:27:24 -07:00
parent 45413ce815
commit 04abec4c10


@@ -480,6 +480,14 @@ def summarize_results(dirname, stats_languages=None):
    passed_tests = [0] * tries

    # Initialize language-specific tracking
    languages = set()
    language_tests = defaultdict(int)
    language_passed = defaultdict(lambda: [0] * tries)

    # Initialize new metrics
    res.total_api_calls = 0
    res.completed_tests = 0

    res.duration = 0
    res.cost = 0
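
The language counters above lean on defaultdict so that a language seen for the first time starts from a zero-filled list with one slot per allowed attempt. A minimal standalone sketch of that pattern (the tries value and the "python" key are invented for illustration, not taken from the diff):

from collections import defaultdict

tries = 2  # illustrative number of attempts allowed per test
language_tests = defaultdict(int)                   # total tests seen per language
language_passed = defaultdict(lambda: [0] * tries)  # passes within 1, 2, ... attempts

language_tests["python"] += 1     # first "python" test appears: int() -> 0, then += 1
print(language_passed["python"])  # [0, 0] created automatically on first access
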
@@ -508,6 +516,18 @@ def summarize_results(dirname, stats_languages=None):
            for i in range(len(tests_outcomes) - 1, tries):
                passed_tests[i] += 1

        # Track language-specific results
        language = results.get("language")
        if language:
            languages.add(language)
            language_tests[language] += 1
            if passed:
                for i in range(len(tests_outcomes) - 1, tries):
                    language_passed[language][i] += 1

        # Track API calls
        res.total_api_calls += results.get("num_api_calls", 0)

        res.cost += results.get("cost", 0)
        res.duration += results.get("duration", 0)
        res.test_timeouts += results.get("test_timeouts", 0)
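
The range(len(tests_outcomes) - 1, tries) loop gives the counters a cumulative meaning: a test that passes on attempt k is counted as passed for attempt k and every later attempt, so the pass rate within two tries can never be lower than the rate within one. A hedged standalone illustration of that counting (the outcome lists and tries value are made-up example data):

# Invented example data: three "python" tests, two attempts allowed each.
tries = 2
language_passed = {"python": [0] * tries}

for tests_outcomes in ([True], [False, True], [False, False]):
    passed = tests_outcomes[-1]
    if passed:
        for i in range(len(tests_outcomes) - 1, tries):
            language_passed["python"][i] += 1

print(language_passed["python"])  # [1, 2]: one pass on try 1, two passes within 2 tries
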
@@ -582,6 +602,24 @@ def summarize_results(dirname, stats_languages=None):
        pct_well_formed = 1.0 - res.num_with_malformed_responses / res.completed_tests
        print(f" percent_cases_well_formed: {pct_well_formed * 100:.1f}")

    # Display API calls
    print(f" total_api_calls: {res.total_api_calls}")
    if res.completed_tests > 0:
        print(f" avg_api_calls_per_test: {res.total_api_calls / res.completed_tests:.2f}")

    # Display language-specific pass rates
    if languages:
        print("\n Language-specific pass rates:")
        for language in sorted(languages):
            for i in range(tries):
                if language_tests[language] > 0:
                    lang_pass_rate = 100 * language_passed[language][i] / language_tests[language]
                    print(f" {language}_pass_rate_{i + 1}: {lang_pass_rate:.1f}")
                    # Store in the result object for potential use in graphs
                    setattr(res, f"{language}_pass_rate_{i + 1}", f"{lang_pass_rate:.1f}")
                    setattr(res, f"{language}_pass_num_{i + 1}", language_passed[language][i])
                    setattr(res, f"{language}_tests", language_tests[language])

    show("error_outputs")
    show("num_malformed_responses")
    show("num_with_malformed_responses")
@@ -932,11 +970,19 @@ def run_test_real(
            if verbose:
                print(f"Failed to clean up Node.js node_modules directory: {e}")

    # Get language from the testdir path
    language = None
    for part in testdir.parts:
        if part in ["python", "javascript", "java", "cpp", "go", "rust"]:
            language = part
            break

    results = dict(
        testdir=str(testdir),
        testcase=testdir.name,
        model=main_model.name,
        edit_format=edit_format,
        language=language,  # Add language information
        tests_outcomes=test_outcomes,
        cost=coder.total_cost,
        duration=dur,
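
The language is inferred purely from the directory layout: the first path component matching a known language name wins, and tests outside those directories are recorded with language=None. A quick usage sketch of that lookup (the example path is hypothetical, not taken from the benchmark):

from pathlib import Path

testdir = Path("tmp.benchmarks/run-2025-04-23/python/exercises/practice/anagram")  # hypothetical
language = None
for part in testdir.parts:
    if part in ["python", "javascript", "java", "cpp", "go", "rust"]:
        language = part
        break

print(language)  # python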