This commit is contained in:
Paul Gauthier 2024-05-15 11:54:00 -07:00
parent 590cbbddfd
commit 74c0df8df8
4 changed files with 1768 additions and 16 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 86 KiB

1742
assets/models-over-time.svg Normal file

File diff suppressed because it is too large Load diff

After

Width:  |  Height:  |  Size: 53 KiB

View file

@ -1,11 +1,11 @@
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import yaml import yaml
from datetime import datetime
from matplotlib import rc
from imgcat import imgcat from imgcat import imgcat
from matplotlib import rc
def plot_over_time(yaml_file): def plot_over_time(yaml_file):
with open(yaml_file, 'r') as file: with open(yaml_file, "r") as file:
data = yaml.safe_load(file) data = yaml.safe_load(file)
dates = [] dates = []
@ -13,10 +13,10 @@ def plot_over_time(yaml_file):
models = [] models = []
for entry in data: for entry in data:
if 'released' in entry and 'pass_rate_2' in entry: if "released" in entry and "pass_rate_2" in entry:
dates.append(entry['released']) dates.append(entry["released"])
pass_rates.append(entry['pass_rate_2']) pass_rates.append(entry["pass_rate_2"])
models.append(entry['model'].split('(')[0].strip()) models.append(entry["model"].split("(")[0].strip())
plt.rcParams["hatch.linewidth"] = 0.5 plt.rcParams["hatch.linewidth"] = 0.5
plt.rcParams["hatch.color"] = "#444444" plt.rcParams["hatch.color"] = "#444444"
@ -26,22 +26,32 @@ def plot_over_time(yaml_file):
fig, ax = plt.subplots(figsize=(10, 5)) fig, ax = plt.subplots(figsize=(10, 5))
ax.grid(axis="y", zorder=0, lw=0.2) ax.grid(axis="y", zorder=0, lw=0.2)
for spine in ax.spines.values(): for spine in ax.spines.values():
spine.set_edgecolor('#DDDDDD') spine.set_edgecolor("#DDDDDD")
spine.set_linewidth(0.5) spine.set_linewidth(0.5)
colors = ['red' if 'gpt-4' in model else 'green' if 'gpt-3.5' in model else 'blue' for model in models] colors = [
"red" if "gpt-4" in model else "green" if "gpt-3.5" in model else "blue" for model in models
]
ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120) ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120)
for i, model in enumerate(models): for i, model in enumerate(models):
ax.annotate(model, (dates[i], pass_rates[i]), fontsize=12, alpha=0.75, ax.annotate(
xytext=(5, 5), textcoords='offset points') model,
(dates[i], pass_rates[i]),
fontsize=12,
alpha=0.75,
xytext=(5, 5),
textcoords="offset points",
)
ax.set_xlabel('Model release date', fontsize=18) ax.set_xlabel("Model release date", fontsize=18)
ax.set_ylabel('Aider code editing benchmark,\npercent completed correctly', fontsize=18) ax.set_ylabel("Aider code editing benchmark,\npercent completed correctly", fontsize=18)
ax.set_title('LLM code editing skill by model release date', fontsize=20) ax.set_title("LLM code editing skill by model release date", fontsize=20)
plt.tight_layout() plt.tight_layout()
plt.savefig("tmp_over_time.png") plt.savefig("tmp_over_time.png")
plt.savefig("tmp_over_time.svg")
imgcat(fig) imgcat(fig)
# Example usage # Example usage
plot_over_time('_data/edit_leaderboard.yml') plot_over_time("_data/edit_leaderboard.yml")

View file

@ -187,7 +187,7 @@ Therefore, results are available for fewer models.
## LLM code editing skill by model release date ## LLM code editing skill by model release date
[![LLM code editing skill by model release date](/assets/models-over-time.png)](https://aider.chat/assets/models-over-time.png) [![LLM code editing skill by model release date](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg)
## Notes on benchmarking results ## Notes on benchmarking results