From 9ad41e922938bc023babf69b5e7f60b17efb0db9 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Mon, 26 Aug 2024 20:36:38 -0700 Subject: [PATCH] docs: Expand details on Sonnet performance graph --- aider/website/_posts/2024-08-26-sonnet-seems-fine.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/aider/website/_posts/2024-08-26-sonnet-seems-fine.md b/aider/website/_posts/2024-08-26-sonnet-seems-fine.md index 06b48084f..e915253d8 100644 --- a/aider/website/_posts/2024-08-26-sonnet-seems-fine.md +++ b/aider/website/_posts/2024-08-26-sonnet-seems-fine.md @@ -19,7 +19,12 @@ dumbed-down, nerfed or otherwise performing worse lately. Sonnet seems as good as ever, at least when accessed via the API. -Here's a graph showing the performance of Claude 3.5 Sonnet over time: +Here's a graph showing the performance of Claude 3.5 Sonnet over time. +It shows every benchmark run performed since Sonnet launched. +Benchmarks were performed for various reasons, usually +to evaluate the effects of small changes to aider's system prompts. +There is always some variance in benchmark results, typically +/- 1-2% +between runs with identical prompts.
@@ -87,6 +92,8 @@ document.addEventListener('DOMContentLoaded', function() { }); -This graph shows the performance of Claude 3.5 Sonnet on the aider code editing benchmark over time. 'Pass Rate 1' represents the initial success rate, while 'Pass Rate 2' shows the success rate after a second attempt. As you can see, there's no significant decline in performance, suggesting that Sonnet's capabilities have remained stable since its launch. +This graph shows the performance of Claude 3.5 Sonnet on the +[aider code editing benchmark](https://aider.chat/docs/leaderboards/) +over time. 'Pass Rate 1' represents the initial success rate, while 'Pass Rate 2' shows the success rate after a second attempt, in which Sonnet gets a chance to fix testing errors.