# aider/benchmark/over_time.py

from dataclasses import dataclass
from datetime import date
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import yaml
from imgcat import imgcat
from matplotlib import rc


@dataclass
class ModelData:
    """A single benchmarked model: its name, release date and pass rate.

    The ``color`` and ``legend_label`` properties are both derived from
    substrings of the lower-cased ``name``. Rules are tried in order and the
    first one that matches wins, so rule order is significant.
    """

    name: str
    release_date: date
    pass_rate: float

    @property
    def color(self) -> str:
        """Return the plot color associated with this model's family.

        Falls back to ``"lightblue"`` when no rule matches the name.
        """
        lowered = self.name.lower()

        # (substrings that must ALL occur in the name, color).
        # Order matters: "-4o" has to be tested before the broader "gpt-4".
        palette = (
            (("gemini", "pro"), "magenta"),
            (("qwen",), "darkblue"),
            (("mistral",), "cyan"),
            (("haiku",), "pink"),
            (("deepseek",), "brown"),
            (("sonnet",), "orange"),
            (("-4o",), "purple"),
            (("gpt-4",), "red"),
            (("gpt-3.5",), "green"),
        )
        for needles, shade in palette:
            if all(needle in lowered for needle in needles):
                return shade
        return "lightblue"

    @property
    def legend_label(self) -> str:
        """Return the human-readable label for this model's family.

        Falls back to the lower-cased model name when no rule matches.
        """
        lowered = self.name.lower()

        # (substrings that must ALL occur, substrings that must NOT occur, label).
        # Evaluated top to bottom; the first matching rule decides the label.
        rules = (
            (("gemini", "pro"), (), "Gemini 1.5 Pro"),
            (("claude-3-sonnet",), (), "Sonnet"),
            (("o1-preview",), (), "O1 Preview"),
            (("gpt-3.5",), (), "GPT-3.5 Turbo"),
            (("gpt-4-",), ("-4o",), "GPT-4"),
            (("qwen",), (), "Qwen"),
            (("-4o",), (), "GPT-4o"),
            (("haiku",), (), "Haiku"),
            (("deepseek",), (), "DeepSeek"),
            (("mistral",), (), "Mistral"),
        )
        for required, forbidden, label in rules:
            has_required = all(part in lowered for part in required)
            has_forbidden = any(part in lowered for part in forbidden)
            if has_required and not has_forbidden:
                return label
        return lowered


class BenchmarkPlotter:
    """Plot benchmark pass rates of models against their release dates.

    Typical use is ``BenchmarkPlotter().plot(yaml_file)``, which loads the
    data, draws one line-plus-scatter series per model color, styles the
    axes and writes the resulting figure to disk.
    """

    # Font size of the text annotation placed next to each series' first point.
    LABEL_FONT_SIZE = 16

    def __init__(self):
        """Create the plotter and apply the global matplotlib style."""
        self.setup_plot_style()

    def setup_plot_style(self):
        """Configure process-wide matplotlib rcParams (hatching, font, text color)."""
        plt.rcParams["hatch.linewidth"] = 0.5
        plt.rcParams["hatch.color"] = "#444444"
        rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
        plt.rcParams["text.color"] = "#444444"

    def load_data(self, yaml_file: str) -> List[ModelData]:
        """Load benchmark entries from ``yaml_file`` as ``ModelData`` objects.

        Entries lacking either the ``released`` or the ``pass_rate_2`` key are
        skipped. The model name is the ``model`` value truncated at the first
        ``"("`` and stripped of surrounding whitespace.

        Args:
            yaml_file: Path of the YAML file to read.

        Returns:
            The usable entries, in file order. Empty if the file holds no data.
        """
        # Decode explicitly as UTF-8 instead of relying on the platform locale.
        with open(yaml_file, "r", encoding="utf-8") as file:
            # safe_load() yields None for an empty document; treat that as
            # "no entries" rather than failing when iterating below.
            data = yaml.safe_load(file) or []

        # NOTE(review): assumes the document is a sequence of mappings and that
        # "released" values are comparable with each other (they are used as
        # sort keys when plotting) - confirm against the data file.
        return [
            ModelData(
                name=entry["model"].split("(")[0].strip(),
                release_date=entry["released"],
                pass_rate=entry["pass_rate_2"],
            )
            for entry in data
            if "released" in entry and "pass_rate_2" in entry
        ]

    def create_figure(self) -> Tuple[plt.Figure, plt.Axes]:
        """Create a 12x8 figure with a faint y-axis grid and light gray spines.

        Returns:
            The ``(figure, axes)`` pair.
        """
        fig, ax = plt.subplots(figsize=(12, 8))
        ax.grid(axis="y", zorder=0, lw=0.2)
        for spine in ax.spines.values():
            spine.set_edgecolor("#DDDDDD")
            spine.set_linewidth(0.5)
        return fig, ax

    def plot_model_series(self, ax: plt.Axes, models: List[ModelData]):
        """Draw one series per model color on ``ax``.

        Models sharing a ``color`` form one series. Each series is sorted by
        release date, drawn as a thin line with scatter points on top, and
        annotated with the ``legend_label`` of its earliest model.

        Args:
            ax: Axes to draw on.
            models: Models to plot; an empty list draws nothing.
        """
        # Group models by color, keeping first-seen order of the colors.
        color_groups: Dict[str, List[ModelData]] = {}
        for model in models:
            color_groups.setdefault(model.color, []).append(model)

        for color, group in color_groups.items():
            sorted_group = sorted(group, key=lambda x: x.release_date)
            dates = [m.release_date for m in sorted_group]
            rates = [m.pass_rate for m in sorted_group]

            # Connecting line, then the individual data points.
            ax.plot(dates, rates, c=color, alpha=0.5, linewidth=1)
            ax.scatter(dates, rates, c=color, alpha=0.5, s=120)

            # Label the series next to its earliest point.
            first_model = sorted_group[0]
            ax.annotate(
                first_model.legend_label,
                (first_model.release_date, first_model.pass_rate),
                xytext=(10, 5),
                textcoords="offset points",
                color=color,
                alpha=0.8,
                fontsize=self.LABEL_FONT_SIZE,
            )

    def set_labels_and_style(self, ax: plt.Axes):
        """Set axis labels, title, y-range and tick-label styling on ``ax``.

        Args:
            ax: Axes to style. Its parent figure gets a tight layout applied.
        """
        ax.set_xlabel("Model release date", fontsize=18, color="#555")
        ax.set_ylabel(
            "Aider code editing benchmark,\npercent completed correctly", fontsize=18, color="#555"
        )
        ax.set_title("LLM code editing skill by model release date", fontsize=20)
        ax.set_ylim(30, 90)
        # Style the tick labels and layout of the axes/figure that were passed
        # in, rather than whatever pyplot currently considers "current".
        plt.setp(ax.get_xticklabels(), fontsize=14, rotation=45, ha="right")
        ax.figure.tight_layout(pad=1.0)

    def save_and_display(self, fig: plt.Figure):
        """Save ``fig`` as PNG and SVG, then pass it to ``imgcat``.

        Args:
            fig: Figure to write out. It is saved explicitly, so the result
                does not depend on pyplot's notion of the current figure.
        """
        for path in (
            "aider/website/assets/models-over-time.png",
            "aider/website/assets/models-over-time.svg",
        ):
            fig.savefig(path)
        imgcat(fig)

    def plot(self, yaml_file: str):
        """Run the full pipeline: load, draw, style, save and display.

        Args:
            yaml_file: Path of the YAML file with the benchmark entries.
        """
        models = self.load_data(yaml_file)
        fig, ax = self.create_figure()
        self.plot_model_series(ax, models)
        self.set_labels_and_style(ax)
        self.save_and_display(fig)


def main():
    """Print each model with its release date in date order, then render the plot."""
    leaderboard_path = "aider/website/_data/edit_leaderboard.yml"
    plotter = BenchmarkPlotter()

    # Chronological listing of the models read from the leaderboard file.
    entries = plotter.load_data(leaderboard_path)
    entries.sort(key=lambda entry: entry.release_date)
    for entry in entries:
        print(f"{entry.release_date}: {entry.name}")

    plotter.plot(leaderboard_path)


# Run the plotting workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()