diff --git a/_posts/2024-05-22-swe-bench-lite.md b/_posts/2024-05-22-swe-bench-lite.md index d51a33ea5..1ad664c74 100644 --- a/_posts/2024-05-22-swe-bench-lite.md +++ b/_posts/2024-05-22-swe-bench-lite.md @@ -23,21 +23,21 @@ The best result reported elsewhere seems to be Aider achieved this result mainly through its focus on static code analysis, reliable LLM code editing, and pragmatic workflows for interactive pair programming with AI. -Aider intentionally has quite limited and narrow "agentic behavior": -it doesn't require a highly detailed upfront "spec" from the user, -use RAG or vector search, farm out sub-problems to an army of LLMs, -allow the LLM to use tools, -or perform web searches, -etc. +Aider intentionally has quite limited and narrow "agentic behavior" +to avoid long delays, high token costs +and the need for users to repeatedly code review incorrect solutions. +It's also worth noting that aider currently does not use +RAG, vector search or tools, nor does it give the LLM access to execute code +or run web searches. -Aider is first and foremost a tool for engineers to get real work done in -real code bases through a pair programming chat style interface. -When a user asks aider for a change, they see the edits performed in real-time, -and aider may also then offer additional -help like fixing lint or test errors. -In normal use, the user is in full interactive control. +Aider is first and foremost an interactive tool for engineers to get real work done in +real code bases using a chat interface. +Aider provides a pair programming experience where users can ask for a change +and see the edits performed in real-time. +Aider can also offer additional help like fixing lint or test errors, +but the user is always in full interactive control. This lets them quickly steer misunderstandings back on course and -avoid wasted time, code reviews and token costs. +avoid wasted time and unnecessary token costs. 
## Benchmark methodology diff --git a/assets/swe_bench_lite.jpg b/assets/swe_bench_lite.jpg index c940d2604..9c525c7c4 100644 Binary files a/assets/swe_bench_lite.jpg and b/assets/swe_bench_lite.jpg differ diff --git a/assets/swe_bench_lite.svg b/assets/swe_bench_lite.svg index fe7cecc1a..f5c8faf2d 100644 --- a/assets/swe_bench_lite.svg +++ b/assets/swe_bench_lite.svg @@ -6,7 +6,7 @@ - 2024-05-23T07:38:15.931243 + 2024-05-23T07:52:54.138893 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -453,7 +453,7 @@ z - + @@ -479,7 +479,7 @@ z - + @@ -601,7 +601,7 @@ z - + @@ -674,7 +674,7 @@ z - + @@ -886,7 +886,7 @@ z - + @@ -1007,7 +1007,7 @@ z - + @@ -1043,16 +1043,16 @@ z +" clip-path="url(#pdd89e16c85)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -1089,11 +1089,11 @@ z +" clip-path="url(#pdd89e16c85)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1135,11 +1135,11 @@ z +" clip-path="url(#pdd89e16c85)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1167,11 +1167,11 @@ z +" clip-path="url(#pdd89e16c85)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1186,11 +1186,11 @@ L 690 158.200121 +" clip-path="url(#pdd89e16c85)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1232,11 +1232,11 @@ z +" clip-path="url(#pdd89e16c85)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1411,7 +1411,7 @@ L 163.368917 273.70025 L 163.368917 71.190023 L 96.917045 71.190023 z -" clip-path="url(#p535a156c8f)" style="fill: #b3e6a8; opacity: 0.75"/> +" clip-path="url(#pdd89e16c85)" style="fill: #b3e6a8; opacity: 0.75"/> +" clip-path="url(#pdd89e16c85)" style="fill: #b3e6a8; opacity: 0.75"/> +" clip-path="url(#pdd89e16c85)" style="fill: #b3d1e6; opacity: 0.75"/> +" clip-path="url(#pdd89e16c85)" style="fill: #b3d1e6; opacity: 0.75"/> +" 
clip-path="url(#pdd89e16c85)" style="fill: #b3d1e6; opacity: 0.75"/> +" clip-path="url(#pdd89e16c85)" style="fill: #b3d1e6; opacity: 0.75"/> +" clip-path="url(#pdd89e16c85)" style="fill: #b3d1e6; opacity: 0.75"/> - + - + @@ -1556,7 +1556,7 @@ z - + @@ -1566,7 +1566,7 @@ z - + @@ -1576,7 +1576,7 @@ z - + @@ -1586,7 +1586,7 @@ z - + - + + diff --git a/benchmark/swe_bench_lite.py b/benchmark/swe_bench_lite.py index 26aca75f6..bac10552a 100644 --- a/benchmark/swe_bench_lite.py +++ b/benchmark/swe_bench_lite.py @@ -38,7 +38,7 @@ def plot_swe_bench_lite(data_file): yval = bar.get_height() ax.text( bar.get_x() + bar.get_width() / 2, - yval - 1.5, + yval - 1.25, f"{yval}%", ha="center", va="top",