diff --git a/benchmark/over_time.py b/benchmark/over_time.py index afae1d26e..7195d52c3 100644 --- a/benchmark/over_time.py +++ b/benchmark/over_time.py @@ -12,6 +12,9 @@ def get_model_color(model): if model == "gpt-4o-mini": return default + if "qwen" in model.lower(): + return "purple" + if "haiku" in model.lower(): return "pink" @@ -85,6 +88,7 @@ def plot_over_time(yaml_file): orange_points = [(d, r) for d, r, c in zip(dates, pass_rates, colors) if c == "orange"] brown_points = [(d, r) for d, r, c in zip(dates, pass_rates, colors) if c == "brown"] pink_points = [(d, r) for d, r, c in zip(dates, pass_rates, colors) if c == "pink"] + qwen_points = [(d, r) for d, r, c in zip(dates, pass_rates, colors) if c == "purple"] # Plot lines for purple, red, green, orange and brown points if purple_points: @@ -105,6 +109,9 @@ def plot_over_time(yaml_file): if pink_points: pink_dates, pink_rates = zip(*sorted(pink_points)) ax.plot(pink_dates, pink_rates, c="pink", alpha=0.5, linewidth=1) + if qwen_points: + qwen_dates, qwen_rates = zip(*sorted(qwen_points)) + ax.plot(qwen_dates, qwen_rates, c="purple", alpha=0.5, linewidth=1) # Plot all points ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120)