diff --git a/assets/2024-03-07-claude-3.svg b/assets/2024-03-07-claude-3.svg index 9e0d4e17d..c0404565b 100644 --- a/assets/2024-03-07-claude-3.svg +++ b/assets/2024-03-07-claude-3.svg @@ -6,7 +6,7 @@ - 2024-03-08T08:22:32.649856 + 2024-03-09T08:19:34.532985 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -363,7 +363,7 @@ z - + @@ -428,7 +428,7 @@ z - + @@ -461,7 +461,7 @@ z - + @@ -521,11 +521,11 @@ z - + - + - + - + - + @@ -669,7 +669,7 @@ z - + @@ -702,7 +702,7 @@ z - + @@ -936,7 +936,7 @@ z - + @@ -976,16 +976,16 @@ z +" clip-path="url(#p5df431dcb0)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -999,11 +999,11 @@ L -3.5 0 +" clip-path="url(#p5df431dcb0)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1018,11 +1018,11 @@ L 421.137924 161.262708 +" clip-path="url(#p5df431dcb0)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1037,11 +1037,11 @@ L 421.137924 127.370313 +" clip-path="url(#p5df431dcb0)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1056,11 +1056,11 @@ L 421.137924 93.477917 +" clip-path="url(#p5df431dcb0)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1116,11 +1116,11 @@ z +" clip-path="url(#p5df431dcb0)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1302,7 +1302,7 @@ L 87.352583 195.155104 L 87.352583 97.045537 L 64.779451 97.045537 z -" clip-path="url(#p26985c7f5f)" style="fill: url(#h44c145aa42); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#h6479e354e1); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#h6479e354e1); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#h6479e354e1); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#h6479e354e1); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#hbe79494d0d); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: #d1b3e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#h6479e354e1); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#h6479e354e1); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#h6479e354e1); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#h6479e354e1); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: url(#hbe79494d0d); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> +" clip-path="url(#p5df431dcb0)" style="fill: #d1b3e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> + - + - + 1: + edge = dict( + edgecolor="#ffffff", + linewidth=1.5, + ) + else: + edge = dict() + if zorder == 2: + edge["label"] = "??" + + color = [ + "#b3e6a8", + "#b3e6a8", + "#b3e6a8", + "#b3e6a8", + "#b3d1e6", + "#b3d1e6", + "#b3d1e6", + "#e6b3b3", + "#d1b3e6", + ] + hatch = [ + "", + "", + "", + "", + "////", + "////", + "////", + "", + "////", + ] + hatch = [ + "////", + "////", + "////", + "////", + "", + "", + "", + "////", + "", + ] + rects = ax.bar( + pos + 0.5 * width, + df.iloc[:, 1], + width * 0.95, + color=color, + hatch=hatch, + zorder=zorder, + **edge, + ) + if zorder == 2: + ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df.iloc[:, 1]], size=6) + + ax.set_xticks([p + 0.5 * width for p in pos]) + model_labels = [] + for model in df.iloc[:, 0]: + pieces = model.split("-") + N = 3 + ml = "-".join(pieces[:N]) + if pieces[N:]: + ml += "-\n" + "-".join(pieces[N:]) + model_labels.append(ml) + + ax.set_xticklabels(model_labels, rotation=60) + + top = 95 + ax.annotate( + "First attempt,\nbased on\nnatural language\ninstructions", + xy=(2.0, 41), + xytext=(1.75, top), + horizontalalignment="center", + verticalalignment="top", + arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"}, + ) + ax.annotate( + "Second attempt,\nincluding unit test\nerror output", + xy=(2.55, 56), + xytext=(3.9, top), + horizontalalignment="center", + verticalalignment="top", + arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"}, + ) + + ax.set_ylabel("Percent of exercises completed successfully") + # ax.set_xlabel("Model") + ax.set_title("Code Editing Skill") + # ax.legend( + # title="Model family", + # loc="upper left", + # ) + ax.set_ylim(top=100) + + plt.tight_layout() + plt.savefig("tmp.svg") + imgcat(fig) + + # df.to_csv("tmp.benchmarks.csv") + + def plot_refactoring(df): tries = [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()]