diff --git a/_data/edit_leaderboard.yml b/_data/edit_leaderboard.yml index a08f596f7..0d90394b3 100644 --- a/_data/edit_leaderboard.yml +++ b/_data/edit_leaderboard.yml @@ -343,7 +343,6 @@ - dirname: 2024-05-07-12-55-06--deepseek-chat-v2-whole test_cases: 133 model: deepseek-chat v2 (whole) - released: 2024-05-06 edit_format: whole commit_hash: b1cae73, db994fb pass_rate_1: 50.4 diff --git a/benchmark/over_time.py b/benchmark/over_time.py index 485982f74..d1aa5ca92 100644 --- a/benchmark/over_time.py +++ b/benchmark/over_time.py @@ -23,22 +23,22 @@ def plot_over_time(yaml_file): rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10}) - fig, ax = plt.subplots(figsize=(10, 4)) + fig, ax = plt.subplots(figsize=(10, 5)) ax.grid(axis="y", zorder=0, lw=0.2) for spine in ax.spines.values(): spine.set_edgecolor('#DDDDDD') spine.set_linewidth(0.5) colors = ['red' if 'gpt-4' in model else 'green' if 'gpt-3.5' in model else 'blue' for model in models] - ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=100) + ax.scatter(dates, pass_rates, c=colors, alpha=0.5, s=120) for i, model in enumerate(models): ax.annotate(model, (dates[i], pass_rates[i]), fontsize=12, alpha=0.75, xytext=(5, 5), textcoords='offset points') - ax.set_xlabel('Model release date', fontsize=14) - ax.set_ylabel('Aider code editing benchmark,\npercent completed correctly', fontsize=14) - ax.set_title('LLM code editing skill by model release date') + ax.set_xlabel('Model release date', fontsize=18) + ax.set_ylabel('Aider code editing benchmark,\npercent completed correctly', fontsize=18) + ax.set_title('LLM code editing skill by model release date', fontsize=20) plt.tight_layout() plt.savefig("tmp_over_time.png") imgcat(fig)