aider/benchmark/rungrid.py

#!/usr/bin/env python

import subprocess
import sys

from aider.dump import dump  # noqa: F401


def main():
    models = [
        "gpt-3.5-turbo-0301",
        "gpt-3.5-turbo-0613",
        # "gpt-3.5-turbo-16k-0613",
        "gpt-3.5-turbo-1106",
        # "gpt-4-0314",
        # "gpt-4-0613",
    ]
    edit_formats = [
        "diff",
        # "diff-func",
        # "whole",
        # "whole-func",
    ]

    # for repeat in range(1, 2, 1):
    for model in models:
        for edit_format in edit_formats:
            # dump(model, edit_format)

            if "-func" in edit_format and "-03" in model:
                continue

            # if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"):
            #    # sublist reliably hangs the API?
            #    continue

            dirname = f"rungrid-nov-{model}-{edit_format}"
            # dirname = f"rungrid-{model}-{edit_format}-repeat-{repeat}"
            run(dirname, model, edit_format)


def run(dirname, model, edit_format):
    cmd = [
        "./benchmark/benchmark.py",
        dirname,
        "--model",
        model,
        "--edit-format",
        edit_format,
        "--threads",
        "10",
        "--cont",
    ]
    print(" ".join(cmd))

    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    status = main()
    sys.exit(status)