From 308007a8e996e3706986d36695aebb00f68b62c3 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Mon, 18 Dec 2023 18:43:15 -0800
Subject: [PATCH 01/22] laziness24-turbo-udiff-never2x

---
 aider/coders/udiff_prompts.py | 8 +++++++-
 benchmark/refactor_tools.py   | 9 ++++++---
 docs/unified-diffs.md         | 6 ++++--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/aider/coders/udiff_prompts.py b/aider/coders/udiff_prompts.py
index 4ab30bfc4..8608152ad 100644
--- a/aider/coders/udiff_prompts.py
+++ b/aider/coders/udiff_prompts.py
@@ -5,7 +5,9 @@ from .base_prompts import CoderPrompts
 
 class UnifiedDiffPrompts(CoderPrompts):
     main_system = """Act as an expert software developer.
-You are diligent and tireless, and you always COMPLETELY IMPLEMENT the needed code.
+You are diligent and tireless!
+You NEVER leave comments describing code without implementing it!
+You always COMPLETELY IMPLEMENT the needed code!
 Always use best practices when coding.
 Respect and use existing conventions, libraries, etc that are already present in the code base.
 
@@ -95,6 +97,10 @@ Delete the entire existing version with `-` lines and then add a new, updated ve
 This will help you generate correct code and correct diffs.
 
 To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ext`.
+
+You are diligent and tireless!
+You NEVER leave comments describing code without implementing it!
+You always COMPLETELY IMPLEMENT the needed code!
 """
 
     files_content_prefix = "These are the *read-write* files:\n"
diff --git a/benchmark/refactor_tools.py b/benchmark/refactor_tools.py
index a54663377..a29aa6e9a 100755
--- a/benchmark/refactor_tools.py
+++ b/benchmark/refactor_tools.py
@@ -132,7 +132,10 @@ def find_non_self_methods(path):
     non_self_methods = []
     for filename in python_files:
         with open(filename, "r") as file:
-            node = ast.parse(file.read(), filename=filename)
+            try:
+                node = ast.parse(file.read(), filename=filename)
+            except:
+                pass
             checker = SelfUsageChecker()
             checker.visit(node)
             for method in checker.non_self_methods:
@@ -145,7 +148,7 @@ def process(entry):
     fname, class_name, method_name, class_children, method_children = entry
     if method_children > class_children / 2:
         return
-    if method_children < 100:
+    if method_children < 250:
         return
 
     fname = Path(fname)
@@ -154,7 +157,7 @@ def process(entry):
 
     print(f"{fname} {class_name} {method_name} {class_children} {method_children}")
 
-    dname = Path("tmp.benchmarks/refactor-benchmark")
+    dname = Path("tmp.benchmarks/refactor-benchmark-pylint")
     dname.mkdir(exist_ok=True)
 
     dname = dname / f"{fname.stem}_{class_name}_{method_name}"
diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index fe28fd8f5..9cc328155 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -79,8 +79,8 @@ code edits, because it's the
 default output format of `git diff`:
 
 ```diff
---- a/hello.py
-+++ b/hello.py
+--- a/greeting.py
++++ b/greeting.py
 @@ -1,5 +1,5 @@
  def main(args):
      # show a greeting
@@ -246,6 +246,7 @@ They exhibit a variety of problems:
 
 - GPT forgets things like comments, docstrings, blank lines, etc. Or it skips over some code that it doesn't intend to change.
 - GPT forgets the leading *plus* `+` character to mark novel lines that it wants to add to the file. It incorrectly includes them with a leading *space* as if they were already there.
+- GPT outdents all of the code, removing all the leading white space which is shared across the lines. So a chunk of deeply indented code is shown in a diff with only the leading white space that changes between the lines in the chunk.
 - GPT jumps ahead to show edits to a different part of the file without starting a new hunk with a `@@ ... @@` divider.
 
 As an example of the first issue, consider this source code:
@@ -285,6 +286,7 @@ If a hunk doesn't apply cleanly, aider uses a number of strategies:
 
 - Normalize the hunk, by taking the *minus* `-` and *space* lines as one version of the hunk and the *space* and *plus* `+` lines as a second version and doing an actual unified diff on them.
 - Try and discover new lines that GPT is trying to add but which it forgot to mark with *plus* `+` markers. This is done by diffing the *minus* `-` and *space* lines back against the original file.
+- Try and apply the hunk using "relative leading white space", so we can match and patch correctly even if the hunk has been uniformly indented or outdented.
 - Break a large hunk apart into an overlapping sequence of smaller hunks, which each contain only one contiguous run of *plus* `+` and *minus* `-` lines. Try and apply each of these sub-hunks independently.
 - Vary the size and offset of the "context window" of *space*  lines from the hunk that are used to localize the edit to a specific part of the file.
 - Combine the above mechanisms to progressively become more permissive about how to apply the hunk.

From d9a301c9f826eb190db7961f06fe281cc2ed86f7 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Mon, 18 Dec 2023 18:49:30 -0800
Subject: [PATCH 02/22] laziness24-turbo-diff-never2x

---
 aider/coders/editblock_prompts.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/aider/coders/editblock_prompts.py b/aider/coders/editblock_prompts.py
index da27cde6d..b606aa554 100644
--- a/aider/coders/editblock_prompts.py
+++ b/aider/coders/editblock_prompts.py
@@ -5,9 +5,11 @@ from .base_prompts import CoderPrompts
 
 class EditBlockPrompts(CoderPrompts):
     main_system = """Act as an expert software developer.
+You are diligent and tireless!
+You NEVER leave comments describing code without implementing it!
+You always COMPLETELY IMPLEMENT the needed code!
 Always use best practices when coding.
-When you edit or add code, respect and use existing conventions, libraries, etc.
-Always COMPLETELY IMPLEMENT the needed code.
+Respect and use existing conventions, libraries, etc that are already present in the code base.
 
 Take requests for changes to the supplied code.
 If the request is ambiguous, ask questions.
@@ -176,6 +178,10 @@ If you want to put code in a new file, use a *SEARCH/REPLACE block* with:
 - A new file path, including dir name if needed
 - An empty `SEARCH` section
 - The new file's contents in the `REPLACE` section
+
+You are diligent and tireless!
+You NEVER leave comments describing code without implementing it!
+You always COMPLETELY IMPLEMENT the needed code!
 """
 
     files_content_prefix = "These are the *read-write* files:\n"

From ef2a1f38751918dbb472a123d4ef0b65892b07a2 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Mon, 18 Dec 2023 19:09:32 -0800
Subject: [PATCH 03/22] diff with move hint

---
 aider/coders/editblock_prompts.py | 2 ++
 aider/coders/udiff_prompts.py     | 2 ++
 benchmark/refactor_tools.py       | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/aider/coders/editblock_prompts.py b/aider/coders/editblock_prompts.py
index b606aa554..896670d1f 100644
--- a/aider/coders/editblock_prompts.py
+++ b/aider/coders/editblock_prompts.py
@@ -174,6 +174,8 @@ Include *ALL* the code being searched and replaced!
 
 Only *SEARCH/REPLACE* files that are *read-write*.
 
+To move code within a file, use 2 *SEARCH/REPLACE* blocks: 1 to delete it from its current location, 1 to insert it in the new location.
+
 If you want to put code in a new file, use a *SEARCH/REPLACE block* with:
 - A new file path, including dir name if needed
 - An empty `SEARCH` section
diff --git a/aider/coders/udiff_prompts.py b/aider/coders/udiff_prompts.py
index 8608152ad..14d1a73ac 100644
--- a/aider/coders/udiff_prompts.py
+++ b/aider/coders/udiff_prompts.py
@@ -96,6 +96,8 @@ When editing a function, method, loop, etc use a hunk to replace the *entire* co
 Delete the entire existing version with `-` lines and then add a new, updated version with `+` lines.
 This will help you generate correct code and correct diffs.
 
+To move code within a file, use 2 hunks: 1 to delete it from its current location, 1 to insert it in the new location.
+
 To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ext`.
 
 You are diligent and tireless!
diff --git a/benchmark/refactor_tools.py b/benchmark/refactor_tools.py
index a29aa6e9a..1046efb11 100755
--- a/benchmark/refactor_tools.py
+++ b/benchmark/refactor_tools.py
@@ -157,7 +157,7 @@ def process(entry):
 
     print(f"{fname} {class_name} {method_name} {class_children} {method_children}")
 
-    dname = Path("tmp.benchmarks/refactor-benchmark-pylint")
+    dname = Path("tmp.benchmarks/refactor-benchmark-spyder")
     dname.mkdir(exist_ok=True)
 
     dname = dname / f"{fname.stem}_{class_name}_{method_name}"

From 29150bc6f45f32624b42453c952f9eaaf0d8590d Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 09:55:49 -0800
Subject: [PATCH 04/22] folk prompting

---
 aider/coders/editblock_prompts.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/aider/coders/editblock_prompts.py b/aider/coders/editblock_prompts.py
index 896670d1f..134f78d22 100644
--- a/aider/coders/editblock_prompts.py
+++ b/aider/coders/editblock_prompts.py
@@ -8,6 +8,8 @@ class EditBlockPrompts(CoderPrompts):
 You are diligent and tireless!
 You NEVER leave comments describing code without implementing it!
 You always COMPLETELY IMPLEMENT the needed code!
+Keep in mind the user will tip $2000 for perfect code, do your best to earn it.
+The user is blind, has no fingers and suffers from truncated code trauma.
 Always use best practices when coding.
 Respect and use existing conventions, libraries, etc that are already present in the code base.
 
@@ -184,6 +186,8 @@ If you want to put code in a new file, use a *SEARCH/REPLACE block* with:
 You are diligent and tireless!
 You NEVER leave comments describing code without implementing it!
 You always COMPLETELY IMPLEMENT the needed code!
+Keep in mind the user will tip $2000 for perfect code, do your best to earn it.
+The user is blind, has no fingers and suffers from truncated code trauma.
 """
 
     files_content_prefix = "These are the *read-write* files:\n"

From cb55165c81c624b6af6f239bfdd7c295535527ef Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 10:12:05 -0800
Subject: [PATCH 05/22] folk prompting for udiff

---
 aider/coders/udiff_prompts.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/aider/coders/udiff_prompts.py b/aider/coders/udiff_prompts.py
index 14d1a73ac..7cd633472 100644
--- a/aider/coders/udiff_prompts.py
+++ b/aider/coders/udiff_prompts.py
@@ -8,6 +8,8 @@ class UnifiedDiffPrompts(CoderPrompts):
 You are diligent and tireless!
 You NEVER leave comments describing code without implementing it!
 You always COMPLETELY IMPLEMENT the needed code!
+Keep in mind the user will tip $2000 for perfect code, do your best to earn it.
+The user is blind, has no fingers and suffers from truncated code trauma.
 Always use best practices when coding.
 Respect and use existing conventions, libraries, etc that are already present in the code base.
 
@@ -103,6 +105,8 @@ To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ex
 You are diligent and tireless!
 You NEVER leave comments describing code without implementing it!
 You always COMPLETELY IMPLEMENT the needed code!
+Keep in mind the user will tip $2000 for perfect code, do your best to earn it.
+The user is blind, has no fingers and suffers from truncated code trauma.
 """
 
     files_content_prefix = "These are the *read-write* files:\n"

From 80d631281748e9c5add8367912f8a515513d8724 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 10:40:35 -0800
Subject: [PATCH 06/22] Revert "folk prompting for udiff"

This reverts commit cb55165c81c624b6af6f239bfdd7c295535527ef.
---
 aider/coders/udiff_prompts.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/aider/coders/udiff_prompts.py b/aider/coders/udiff_prompts.py
index 7cd633472..14d1a73ac 100644
--- a/aider/coders/udiff_prompts.py
+++ b/aider/coders/udiff_prompts.py
@@ -8,8 +8,6 @@ class UnifiedDiffPrompts(CoderPrompts):
 You are diligent and tireless!
 You NEVER leave comments describing code without implementing it!
 You always COMPLETELY IMPLEMENT the needed code!
-Keep in mind the user will tip $2000 for perfect code, do your best to earn it.
-The user is blind, has no fingers and suffers from truncated code trauma.
 Always use best practices when coding.
 Respect and use existing conventions, libraries, etc that are already present in the code base.
 
@@ -105,8 +103,6 @@ To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ex
 You are diligent and tireless!
 You NEVER leave comments describing code without implementing it!
 You always COMPLETELY IMPLEMENT the needed code!
-Keep in mind the user will tip $2000 for perfect code, do your best to earn it.
-The user is blind, has no fingers and suffers from truncated code trauma.
 """
 
     files_content_prefix = "These are the *read-write* files:\n"

From 4c330bcd48b2d210dff793e1dd7a1a30f8d3b8d9 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 10:40:46 -0800
Subject: [PATCH 07/22] Revert "folk prompting"

This reverts commit 29150bc6f45f32624b42453c952f9eaaf0d8590d.
---
 aider/coders/editblock_prompts.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/aider/coders/editblock_prompts.py b/aider/coders/editblock_prompts.py
index 134f78d22..896670d1f 100644
--- a/aider/coders/editblock_prompts.py
+++ b/aider/coders/editblock_prompts.py
@@ -8,8 +8,6 @@ class EditBlockPrompts(CoderPrompts):
 You are diligent and tireless!
 You NEVER leave comments describing code without implementing it!
 You always COMPLETELY IMPLEMENT the needed code!
-Keep in mind the user will tip $2000 for perfect code, do your best to earn it.
-The user is blind, has no fingers and suffers from truncated code trauma.
 Always use best practices when coding.
 Respect and use existing conventions, libraries, etc that are already present in the code base.
 
@@ -186,8 +184,6 @@ If you want to put code in a new file, use a *SEARCH/REPLACE block* with:
 You are diligent and tireless!
 You NEVER leave comments describing code without implementing it!
 You always COMPLETELY IMPLEMENT the needed code!
-Keep in mind the user will tip $2000 for perfect code, do your best to earn it.
-The user is blind, has no fingers and suffers from truncated code trauma.
 """
 
     files_content_prefix = "These are the *read-write* files:\n"

From 755b3858eb85d38041f41c937934f4edf061e9b3 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 11:11:58 -0800
Subject: [PATCH 08/22] copy

---
 assets/benchmarks-udiff.svg | 408 ++++++++++++++++++++++++------------
 benchmark/benchmark.py      |  13 +-
 docs/unified-diffs.md       |  36 ++--
 3 files changed, 306 insertions(+), 151 deletions(-)

diff --git a/assets/benchmarks-udiff.svg b/assets/benchmarks-udiff.svg
index c2b3dda8a..f210e1767 100644
--- a/assets/benchmarks-udiff.svg
+++ b/assets/benchmarks-udiff.svg
@@ -6,7 +6,7 @@
   <rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <cc:Work>
     <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
-    <dc:date>2023-12-18T10:29:22.506290</dc:date>
+    <dc:date>2023-12-19T10:53:27.651517</dc:date>
     <dc:format>image/svg+xml</dc:format>
     <dc:creator>
      <cc:Agent>
@@ -41,17 +41,17 @@ z
     <g id="xtick_1">
      <g id="line2d_1">
       <defs>
-       <path id="mcae1dcd414" d="M 0 0 
+       <path id="ma02f6a44d0" d="M 0 0 
 L 0 3.5 
 " style="stroke: #000000; stroke-width: 0.8"/>
       </defs>
       <g>
-       <use xlink:href="#mcae1dcd414" x="234.505" y="260.84" style="stroke: #000000; stroke-width: 0.8"/>
+       <use xlink:href="#ma02f6a44d0" x="191.537221" y="260.84" style="stroke: #000000; stroke-width: 0.8"/>
       </g>
      </g>
      <g id="text_1">
       <!-- gpt-4-1106-preview -->
-      <g transform="translate(191.43 275.013438) scale(0.1 -0.1)">
+      <g transform="translate(148.462221 275.013438) scale(0.1 -0.1)">
        <defs>
         <path id="Helvetica-67" d="M 1594 3406 
 Q 1988 3406 2281 3213 
@@ -341,16 +341,16 @@ z
      <g id="line2d_2">
       <path d="M 47.81 260.84 
 L 421.2 260.84 
-" clip-path="url(#p74111aa2fb)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
+" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
      </g>
      <g id="line2d_3">
       <defs>
-       <path id="m49aec14c68" d="M 0 0 
+       <path id="mff77fa9b35" d="M 0 0 
 L -3.5 0 
 " style="stroke: #000000; stroke-width: 0.8"/>
       </defs>
       <g>
-       <use xlink:href="#m49aec14c68" x="47.81" y="260.84" style="stroke: #000000; stroke-width: 0.8"/>
+       <use xlink:href="#mff77fa9b35" x="47.81" y="260.84" style="stroke: #000000; stroke-width: 0.8"/>
       </g>
      </g>
      <g id="text_2">
@@ -364,11 +364,11 @@ L -3.5 0
      <g id="line2d_4">
       <path d="M 47.81 216.3376 
 L 421.2 216.3376 
-" clip-path="url(#p74111aa2fb)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
+" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
      </g>
      <g id="line2d_5">
       <g>
-       <use xlink:href="#m49aec14c68" x="47.81" y="216.3376" style="stroke: #000000; stroke-width: 0.8"/>
+       <use xlink:href="#mff77fa9b35" x="47.81" y="216.3376" style="stroke: #000000; stroke-width: 0.8"/>
       </g>
      </g>
      <g id="text_3">
@@ -410,11 +410,11 @@ z
      <g id="line2d_6">
       <path d="M 47.81 171.8352 
 L 421.2 171.8352 
-" clip-path="url(#p74111aa2fb)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
+" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
      </g>
      <g id="line2d_7">
       <g>
-       <use xlink:href="#m49aec14c68" x="47.81" y="171.8352" style="stroke: #000000; stroke-width: 0.8"/>
+       <use xlink:href="#mff77fa9b35" x="47.81" y="171.8352" style="stroke: #000000; stroke-width: 0.8"/>
       </g>
      </g>
      <g id="text_4">
@@ -429,11 +429,11 @@ L 421.2 171.8352
      <g id="line2d_8">
       <path d="M 47.81 127.3328 
 L 421.2 127.3328 
-" clip-path="url(#p74111aa2fb)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
+" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
      </g>
      <g id="line2d_9">
       <g>
-       <use xlink:href="#m49aec14c68" x="47.81" y="127.3328" style="stroke: #000000; stroke-width: 0.8"/>
+       <use xlink:href="#mff77fa9b35" x="47.81" y="127.3328" style="stroke: #000000; stroke-width: 0.8"/>
       </g>
      </g>
      <g id="text_5">
@@ -448,11 +448,11 @@ L 421.2 127.3328
      <g id="line2d_10">
       <path d="M 47.81 82.8304 
 L 421.2 82.8304 
-" clip-path="url(#p74111aa2fb)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
+" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
      </g>
      <g id="line2d_11">
       <g>
-       <use xlink:href="#m49aec14c68" x="47.81" y="82.8304" style="stroke: #000000; stroke-width: 0.8"/>
+       <use xlink:href="#mff77fa9b35" x="47.81" y="82.8304" style="stroke: #000000; stroke-width: 0.8"/>
       </g>
      </g>
      <g id="text_6">
@@ -508,11 +508,11 @@ z
      <g id="line2d_12">
       <path d="M 47.81 38.328 
 L 421.2 38.328 
-" clip-path="url(#p74111aa2fb)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
+" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/>
      </g>
      <g id="line2d_13">
       <g>
-       <use xlink:href="#m49aec14c68" x="47.81" y="38.328" style="stroke: #000000; stroke-width: 0.8"/>
+       <use xlink:href="#mff77fa9b35" x="47.81" y="38.328" style="stroke: #000000; stroke-width: 0.8"/>
       </g>
      </g>
      <g id="text_7">
@@ -851,78 +851,60 @@ z
    </g>
    <g id="patch_3">
     <path d="M 64.782273 260.84 
-L 174.095216 260.84 
-L 174.095216 226.607385 
-L 64.782273 226.607385 
+L 146.421053 260.84 
+L 146.421053 215.837573 
+L 64.782273 215.837573 
 z
-" clip-path="url(#p74111aa2fb)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
+" clip-path="url(#p479ce647ef)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
    </g>
    <g id="patch_4">
-    <path d="M 179.848529 260.84 
-L 289.161471 260.84 
-L 289.161471 226.607385 
-L 179.848529 226.607385 
+    <path d="M 150.717831 260.84 
+L 232.356611 260.84 
+L 232.356611 225.838112 
+L 150.717831 225.838112 
 z
-" clip-path="url(#p74111aa2fb)" style="fill: url(#h3ccb2da400); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
+" clip-path="url(#p479ce647ef)" style="fill: url(#h762c7e11f2); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
    </g>
    <g id="patch_5">
-    <path d="M 294.914784 260.84 
-L 404.227727 260.84 
-L 404.227727 123.909538 
-L 294.914784 123.909538 
+    <path d="M 236.653389 260.84 
+L 318.292169 260.84 
+L 318.292169 125.832719 
+L 236.653389 125.832719 
 z
-" clip-path="url(#p74111aa2fb)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
+" clip-path="url(#p479ce647ef)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
    </g>
    <g id="patch_6">
+    <path d="M 322.588947 260.84 
+L 404.227727 260.84 
+L 404.227727 150.834067 
+L 322.588947 150.834067 
+z
+" clip-path="url(#p479ce647ef)" style="fill: url(#h26d9048a8e); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
+   </g>
+   <g id="patch_7">
     <path d="M 47.81 260.84 
 L 47.81 38.328 
 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
    </g>
-   <g id="patch_7">
+   <g id="patch_8">
     <path d="M 421.2 260.84 
 L 421.2 38.328 
 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
    </g>
-   <g id="patch_8">
+   <g id="patch_9">
     <path d="M 47.81 260.84 
 L 421.2 260.84 
 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
    </g>
-   <g id="patch_9">
+   <g id="patch_10">
     <path d="M 47.81 38.328 
 L 421.2 38.328 
 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
    </g>
    <g id="text_9">
-    <!-- 15% -->
-    <g transform="translate(113.434525 221.35676) scale(0.06 -0.06)">
+    <!-- 20% -->
+    <g transform="translate(99.597444 210.586948) scale(0.06 -0.06)">
      <defs>
-      <path id="Helvetica-35" d="M 791 1141 
-Q 847 659 1238 475 
-Q 1438 381 1700 381 
-Q 2200 381 2440 700 
-Q 2681 1019 2681 1406 
-Q 2681 1875 2395 2131 
-Q 2109 2388 1709 2388 
-Q 1419 2388 1211 2275 
-Q 1003 2163 856 1963 
-L 369 1991 
-L 709 4400 
-L 3034 4400 
-L 3034 3856 
-L 1131 3856 
-L 941 2613 
-Q 1097 2731 1238 2791 
-Q 1488 2894 1816 2894 
-Q 2431 2894 2859 2497 
-Q 3288 2100 3288 1491 
-Q 3288 856 2895 371 
-Q 2503 -113 1644 -113 
-Q 1097 -113 676 195 
-Q 256 503 206 1141 
-L 791 1141 
-z
-" transform="scale(0.015625)"/>
       <path id="Helvetica-25" d="M 4363 2175 
 Q 4813 2175 5131 1856 
 Q 5450 1538 5450 1088 
@@ -971,28 +953,68 @@ Q 4094 444 4363 444
 z
 " transform="scale(0.015625)"/>
      </defs>
-     <use xlink:href="#Helvetica-31"/>
-     <use xlink:href="#Helvetica-35" x="55.615234"/>
+     <use xlink:href="#Helvetica-32"/>
+     <use xlink:href="#Helvetica-30" x="55.615234"/>
      <use xlink:href="#Helvetica-25" x="111.230469"/>
     </g>
    </g>
    <g id="text_10">
-    <!-- 15% -->
-    <g transform="translate(228.500781 221.35676) scale(0.06 -0.06)">
+    <!-- 16% -->
+    <g transform="translate(185.533002 220.587487) scale(0.06 -0.06)">
      <use xlink:href="#Helvetica-31"/>
-     <use xlink:href="#Helvetica-35" x="55.615234"/>
+     <use xlink:href="#Helvetica-36" x="55.615234"/>
      <use xlink:href="#Helvetica-25" x="111.230469"/>
     </g>
    </g>
    <g id="text_11">
-    <!-- 62% -->
-    <g transform="translate(343.567037 118.658913) scale(0.06 -0.06)">
+    <!-- 61% -->
+    <g transform="translate(271.46856 120.582094) scale(0.06 -0.06)">
      <use xlink:href="#Helvetica-36"/>
-     <use xlink:href="#Helvetica-32" x="55.615234"/>
+     <use xlink:href="#Helvetica-31" x="55.615234"/>
      <use xlink:href="#Helvetica-25" x="111.230469"/>
     </g>
    </g>
    <g id="text_12">
+    <!-- 49% -->
+    <g transform="translate(357.404118 145.583442) scale(0.06 -0.06)">
+     <defs>
+      <path id="Helvetica-39" d="M 850 1081 
+Q 875 616 1209 438 
+Q 1381 344 1597 344 
+Q 2000 344 2284 680 
+Q 2569 1016 2688 2044 
+Q 2500 1747 2223 1626 
+Q 1947 1506 1628 1506 
+Q 981 1506 604 1909 
+Q 228 2313 228 2947 
+Q 228 3556 600 4018 
+Q 972 4481 1697 4481 
+Q 2675 4481 3047 3600 
+Q 3253 3116 3253 2388 
+Q 3253 1566 3006 931 
+Q 2597 -125 1619 -125 
+Q 963 -125 622 219 
+Q 281 563 281 1081 
+L 850 1081 
+z
+M 1703 2000 
+Q 2038 2000 2314 2220 
+Q 2591 2441 2591 2991 
+Q 2591 3484 2342 3726 
+Q 2094 3969 1709 3969 
+Q 1297 3969 1055 3692 
+Q 813 3416 813 2953 
+Q 813 2516 1025 2258 
+Q 1238 2000 1703 2000 
+z
+" transform="scale(0.015625)"/>
+     </defs>
+     <use xlink:href="#Helvetica-34"/>
+     <use xlink:href="#Helvetica-39" x="55.615234"/>
+     <use xlink:href="#Helvetica-25" x="111.230469"/>
+    </g>
+   </g>
+   <g id="text_13">
     <!-- Refactoring "Laziness" Benchmark -->
     <g transform="translate(142.207188 19.1745) scale(0.12 -0.12)">
      <defs>
@@ -1280,20 +1302,20 @@ z
     </g>
    </g>
    <g id="legend_1">
-    <g id="patch_10">
-     <path d="M 54.81 89.799875 
-L 275.800625 89.799875 
-Q 277.800625 89.799875 277.800625 87.799875 
-L 277.800625 45.328 
-Q 277.800625 43.328 275.800625 43.328 
+    <g id="patch_11">
+     <path d="M 54.81 104.638937 
+L 272.189688 104.638937 
+Q 274.189688 104.638937 274.189688 102.638937 
+L 274.189688 45.328 
+Q 274.189688 43.328 272.189688 43.328 
 L 54.81 43.328 
 Q 52.81 43.328 52.81 45.328 
-L 52.81 87.799875 
-Q 52.81 89.799875 54.81 89.799875 
+L 52.81 102.638937 
+Q 52.81 104.638937 54.81 104.638937 
 z
 " style="fill: #ffffff; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter"/>
     </g>
-    <g id="patch_11">
+    <g id="patch_12">
      <path d="M 56.81 54.618625 
 L 76.81 54.618625 
 L 76.81 47.618625 
@@ -1301,7 +1323,7 @@ L 56.81 47.618625
 z
 " style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
     </g>
-    <g id="text_13">
+    <g id="text_14">
      <!-- Baseline (search/replace blocks) -->
      <g transform="translate(84.81 54.618625) scale(0.1 -0.1)">
       <defs>
@@ -1373,18 +1395,33 @@ z
       <use xlink:href="#Helvetica-29" x="1411.816406"/>
      </g>
     </g>
-    <g id="patch_12">
+    <g id="patch_13">
      <path d="M 56.81 69.457687 
 L 76.81 69.457687 
 L 76.81 62.457687 
 L 56.81 62.457687 
 z
-" style="fill: url(#h3ccb2da400); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
+" style="fill: url(#h762c7e11f2); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
     </g>
-    <g id="text_14">
-     <!-- Prompt with blind, no hands, tip $2000, etc -->
+    <g id="text_15">
+     <!-- Baseline + blind, no hands, $2k tip, etc -->
      <g transform="translate(84.81 69.457687) scale(0.1 -0.1)">
       <defs>
+       <path id="Helvetica-2b" d="M 288 1369 
+L 288 1894 
+L 1650 1894 
+L 1650 3266 
+L 2184 3266 
+L 2184 1894 
+L 3547 1894 
+L 3547 1369 
+L 2184 1369 
+L 2184 0 
+L 1650 0 
+L 1650 1369 
+L 288 1369 
+z
+" transform="scale(0.015625)"/>
        <path id="Helvetica-2c" d="M 531 -653 
 Q 747 -616 834 -350 
 Q 881 -209 881 -78 
@@ -1443,52 +1480,49 @@ Q 288 2772 288 3303
 z
 " transform="scale(0.015625)"/>
       </defs>
-      <use xlink:href="#Helvetica-50"/>
-      <use xlink:href="#Helvetica-72" x="66.699219"/>
-      <use xlink:href="#Helvetica-6f" x="100"/>
-      <use xlink:href="#Helvetica-6d" x="155.615234"/>
-      <use xlink:href="#Helvetica-70" x="238.916016"/>
-      <use xlink:href="#Helvetica-74" x="294.53125"/>
-      <use xlink:href="#Helvetica-20" x="322.314453"/>
-      <use xlink:href="#Helvetica-77" x="350.097656"/>
-      <use xlink:href="#Helvetica-69" x="422.314453"/>
-      <use xlink:href="#Helvetica-74" x="444.53125"/>
-      <use xlink:href="#Helvetica-68" x="472.314453"/>
-      <use xlink:href="#Helvetica-20" x="527.929688"/>
-      <use xlink:href="#Helvetica-62" x="555.712891"/>
-      <use xlink:href="#Helvetica-6c" x="611.328125"/>
-      <use xlink:href="#Helvetica-69" x="633.544922"/>
-      <use xlink:href="#Helvetica-6e" x="655.761719"/>
-      <use xlink:href="#Helvetica-64" x="711.376953"/>
-      <use xlink:href="#Helvetica-2c" x="766.992188"/>
-      <use xlink:href="#Helvetica-20" x="794.775391"/>
-      <use xlink:href="#Helvetica-6e" x="822.558594"/>
-      <use xlink:href="#Helvetica-6f" x="878.173828"/>
-      <use xlink:href="#Helvetica-20" x="933.789062"/>
-      <use xlink:href="#Helvetica-68" x="961.572266"/>
-      <use xlink:href="#Helvetica-61" x="1017.1875"/>
-      <use xlink:href="#Helvetica-6e" x="1072.802734"/>
-      <use xlink:href="#Helvetica-64" x="1128.417969"/>
-      <use xlink:href="#Helvetica-73" x="1184.033203"/>
-      <use xlink:href="#Helvetica-2c" x="1234.033203"/>
-      <use xlink:href="#Helvetica-20" x="1261.816406"/>
-      <use xlink:href="#Helvetica-74" x="1289.599609"/>
-      <use xlink:href="#Helvetica-69" x="1317.382812"/>
-      <use xlink:href="#Helvetica-70" x="1339.599609"/>
-      <use xlink:href="#Helvetica-20" x="1395.214844"/>
-      <use xlink:href="#Helvetica-24" x="1422.998047"/>
-      <use xlink:href="#Helvetica-32" x="1478.613281"/>
-      <use xlink:href="#Helvetica-30" x="1534.228516"/>
-      <use xlink:href="#Helvetica-30" x="1589.84375"/>
-      <use xlink:href="#Helvetica-30" x="1645.458984"/>
-      <use xlink:href="#Helvetica-2c" x="1701.074219"/>
-      <use xlink:href="#Helvetica-20" x="1728.857422"/>
-      <use xlink:href="#Helvetica-65" x="1756.640625"/>
-      <use xlink:href="#Helvetica-74" x="1812.255859"/>
-      <use xlink:href="#Helvetica-63" x="1840.039062"/>
+      <use xlink:href="#Helvetica-42"/>
+      <use xlink:href="#Helvetica-61" x="66.699219"/>
+      <use xlink:href="#Helvetica-73" x="122.314453"/>
+      <use xlink:href="#Helvetica-65" x="172.314453"/>
+      <use xlink:href="#Helvetica-6c" x="227.929688"/>
+      <use xlink:href="#Helvetica-69" x="250.146484"/>
+      <use xlink:href="#Helvetica-6e" x="272.363281"/>
+      <use xlink:href="#Helvetica-65" x="327.978516"/>
+      <use xlink:href="#Helvetica-20" x="383.59375"/>
+      <use xlink:href="#Helvetica-2b" x="411.376953"/>
+      <use xlink:href="#Helvetica-20" x="469.775391"/>
+      <use xlink:href="#Helvetica-62" x="497.558594"/>
+      <use xlink:href="#Helvetica-6c" x="553.173828"/>
+      <use xlink:href="#Helvetica-69" x="575.390625"/>
+      <use xlink:href="#Helvetica-6e" x="597.607422"/>
+      <use xlink:href="#Helvetica-64" x="653.222656"/>
+      <use xlink:href="#Helvetica-2c" x="708.837891"/>
+      <use xlink:href="#Helvetica-20" x="736.621094"/>
+      <use xlink:href="#Helvetica-6e" x="764.404297"/>
+      <use xlink:href="#Helvetica-6f" x="820.019531"/>
+      <use xlink:href="#Helvetica-20" x="875.634766"/>
+      <use xlink:href="#Helvetica-68" x="903.417969"/>
+      <use xlink:href="#Helvetica-61" x="959.033203"/>
+      <use xlink:href="#Helvetica-6e" x="1014.648438"/>
+      <use xlink:href="#Helvetica-64" x="1070.263672"/>
+      <use xlink:href="#Helvetica-73" x="1125.878906"/>
+      <use xlink:href="#Helvetica-2c" x="1175.878906"/>
+      <use xlink:href="#Helvetica-20" x="1203.662109"/>
+      <use xlink:href="#Helvetica-24" x="1231.445312"/>
+      <use xlink:href="#Helvetica-32" x="1287.060547"/>
+      <use xlink:href="#Helvetica-6b" x="1342.675781"/>
+      <use xlink:href="#Helvetica-20" x="1392.675781"/>
+      <use xlink:href="#Helvetica-74" x="1420.458984"/>
+      <use xlink:href="#Helvetica-69" x="1448.242188"/>
+      <use xlink:href="#Helvetica-70" x="1470.458984"/>
+      <use xlink:href="#Helvetica-2c" x="1526.074219"/>
+      <use xlink:href="#Helvetica-20" x="1553.857422"/>
+      <use xlink:href="#Helvetica-65" x="1581.640625"/>
+      <use xlink:href="#Helvetica-74" x="1637.255859"/>
+      <use xlink:href="#Helvetica-63" x="1665.039062"/>
      </g>
     </g>
-    <g id="patch_13">
+    <g id="patch_14">
      <path d="M 56.81 83.7155 
 L 76.81 83.7155 
 L 76.81 76.7155 
@@ -1496,7 +1530,7 @@ L 56.81 76.7155
 z
 " style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
     </g>
-    <g id="text_15">
+    <g id="text_16">
      <!-- Unified diffs -->
      <g transform="translate(84.81 83.7155) scale(0.1 -0.1)">
       <defs>
@@ -1536,16 +1570,74 @@ z
       <use xlink:href="#Helvetica-73" x="472.460938"/>
      </g>
     </g>
+    <g id="patch_15">
+     <path d="M 56.81 98.554562 
+L 76.81 98.554562 
+L 76.81 91.554562 
+L 56.81 91.554562 
+z
+" style="fill: url(#h26d9048a8e); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/>
+    </g>
+    <g id="text_17">
+     <!-- Unified diffs + blind, no hands, $2k tip, etc -->
+     <g transform="translate(84.81 98.554562) scale(0.1 -0.1)">
+      <use xlink:href="#Helvetica-55"/>
+      <use xlink:href="#Helvetica-6e" x="72.216797"/>
+      <use xlink:href="#Helvetica-69" x="127.832031"/>
+      <use xlink:href="#Helvetica-66" x="150.048828"/>
+      <use xlink:href="#Helvetica-69" x="177.832031"/>
+      <use xlink:href="#Helvetica-65" x="200.048828"/>
+      <use xlink:href="#Helvetica-64" x="255.664062"/>
+      <use xlink:href="#Helvetica-20" x="311.279297"/>
+      <use xlink:href="#Helvetica-64" x="339.0625"/>
+      <use xlink:href="#Helvetica-69" x="394.677734"/>
+      <use xlink:href="#Helvetica-66" x="416.894531"/>
+      <use xlink:href="#Helvetica-66" x="444.677734"/>
+      <use xlink:href="#Helvetica-73" x="472.460938"/>
+      <use xlink:href="#Helvetica-20" x="522.460938"/>
+      <use xlink:href="#Helvetica-2b" x="550.244141"/>
+      <use xlink:href="#Helvetica-20" x="608.642578"/>
+      <use xlink:href="#Helvetica-62" x="636.425781"/>
+      <use xlink:href="#Helvetica-6c" x="692.041016"/>
+      <use xlink:href="#Helvetica-69" x="714.257812"/>
+      <use xlink:href="#Helvetica-6e" x="736.474609"/>
+      <use xlink:href="#Helvetica-64" x="792.089844"/>
+      <use xlink:href="#Helvetica-2c" x="847.705078"/>
+      <use xlink:href="#Helvetica-20" x="875.488281"/>
+      <use xlink:href="#Helvetica-6e" x="903.271484"/>
+      <use xlink:href="#Helvetica-6f" x="958.886719"/>
+      <use xlink:href="#Helvetica-20" x="1014.501953"/>
+      <use xlink:href="#Helvetica-68" x="1042.285156"/>
+      <use xlink:href="#Helvetica-61" x="1097.900391"/>
+      <use xlink:href="#Helvetica-6e" x="1153.515625"/>
+      <use xlink:href="#Helvetica-64" x="1209.130859"/>
+      <use xlink:href="#Helvetica-73" x="1264.746094"/>
+      <use xlink:href="#Helvetica-2c" x="1314.746094"/>
+      <use xlink:href="#Helvetica-20" x="1342.529297"/>
+      <use xlink:href="#Helvetica-24" x="1370.3125"/>
+      <use xlink:href="#Helvetica-32" x="1425.927734"/>
+      <use xlink:href="#Helvetica-6b" x="1481.542969"/>
+      <use xlink:href="#Helvetica-20" x="1531.542969"/>
+      <use xlink:href="#Helvetica-74" x="1559.326172"/>
+      <use xlink:href="#Helvetica-69" x="1587.109375"/>
+      <use xlink:href="#Helvetica-70" x="1609.326172"/>
+      <use xlink:href="#Helvetica-2c" x="1664.941406"/>
+      <use xlink:href="#Helvetica-20" x="1692.724609"/>
+      <use xlink:href="#Helvetica-65" x="1720.507812"/>
+      <use xlink:href="#Helvetica-74" x="1776.123047"/>
+      <use xlink:href="#Helvetica-63" x="1803.90625"/>
+     </g>
+    </g>
    </g>
   </g>
  </g>
  <defs>
-  <clipPath id="p74111aa2fb">
+  <clipPath id="p479ce647ef">
    <rect x="47.81" y="38.328" width="373.39" height="222.512"/>
   </clipPath>
  </defs>
  <defs>
-  <pattern id="h3ccb2da400" patternUnits="userSpaceOnUse" x="0" y="0" width="72" height="72">
+  <pattern id="h762c7e11f2" patternUnits="userSpaceOnUse" x="0" y="0" width="72" height="72">
    <rect x="0" y="0" width="73" height="73" fill="#b3e6a8"/>
    <path d="M -36 36 
 L 36 -36 
@@ -1597,6 +1689,60 @@ M 33 105
 L 105 33 
 M 36 108 
 L 108 36 
+" style="fill: #ffffff; stroke: #ffffff; stroke-width: 0.5; stroke-linecap: butt; stroke-linejoin: miter"/>
+  </pattern>
+  <pattern id="h26d9048a8e" patternUnits="userSpaceOnUse" x="0" y="0" width="72" height="72">
+   <rect x="0" y="0" width="73" height="73" fill="#b3d1e6"/>
+   <path d="M -36 36 
+L 36 -36 
+M -33 39 
+L 39 -33 
+M -30 42 
+L 42 -30 
+M -27 45 
+L 45 -27 
+M -24 48 
+L 48 -24 
+M -21 51 
+L 51 -21 
+M -18 54 
+L 54 -18 
+M -15 57 
+L 57 -15 
+M -12 60 
+L 60 -12 
+M -9 63 
+L 63 -9 
+M -6 66 
+L 66 -6 
+M -3 69 
+L 69 -3 
+M 0 72 
+L 72 0 
+M 3 75 
+L 75 3 
+M 6 78 
+L 78 6 
+M 9 81 
+L 81 9 
+M 12 84 
+L 84 12 
+M 15 87 
+L 87 15 
+M 18 90 
+L 90 18 
+M 21 93 
+L 93 21 
+M 24 96 
+L 96 24 
+M 27 99 
+L 99 27 
+M 30 102 
+L 102 30 
+M 33 105 
+L 105 33 
+M 36 108 
+L 108 36 
 " style="fill: #ffffff; stroke: #ffffff; stroke-width: 0.5; stroke-linecap: butt; stroke-linejoin: miter"/>
   </pattern>
  </defs>
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 1a22c0f62..20c3d7afb 100755
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -77,12 +77,12 @@ def show_stats(dirnames, graphs):
         #    row.model = gpt4 + "\n" + row.model[len(gpt4) :]
 
         if "folk" in row.dir_name:
-            row.edit_format = "folk"
+            row.edit_format += "folk"
 
         if row.model == "gpt-4-0613":
             row.model += "\n(8k context window is\ntoo small for benchmark)"
 
-        if row.completed_tests < 133:
+        if row.completed_tests < 89:
             print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")
 
         # if "repeat" in row.dir_name:
@@ -311,6 +311,7 @@ def plot_refactoring(df):
         formats = df.columns
         models = df.index
 
+        dump(formats)
         for i, fmt in enumerate(formats):
             hatch = ""
 
@@ -320,10 +321,14 @@ def plot_refactoring(df):
             elif fmt == "udiff":
                 color = "#b3d1e6"
                 label = "Unified diffs"
-            elif fmt == "folk":
-                label = "Prompt with blind, no hands, tip $2000, etc"
+            elif fmt == "difffolk":
+                label = "Baseline + blind, no hands, $2k tip, etc"
                 color = "#b3e6a8"
                 hatch = "////"
+            elif fmt == "udifffolk":
+                label = "Unified diffs + blind, no hands, $2k tip, etc"
+                color = "#b3d1e6"
+                hatch = "////"
 
             if zorder > 1:
                 edge = dict(
diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index 9cc328155..fee1fdf69 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -1,5 +1,5 @@
 
-# Fixing GPT-4 Turbo laziness with unified diffs
+# Reducing GPT-4 Turbo laziness with unified diffs
 
 ![robot flowchart](../assets/benchmarks-udiff.svg)
 
@@ -7,23 +7,25 @@
 Aider now asks GPT-4 Turbo to use
 [unified diffs](https://www.gnu.org/software/diffutils/manual/html_node/Example-Unified.html)
 to edit your code.
-This massively reduces GPT-4 Turbo's bad habit of "lazy" coding,
-where it writes half completed code filled with comments
+This massively improves GPT-4 Turbo's performance on a complex benchmark 
+and significantly reduces its bad habit of "lazy" coding,
+where it writes
+code filled with comments
 like "...add logic here...".
 
-Aider also has a new benchmarking suite 
+Aider also has a new "laziness" benchmark suite 
 designed to both provoke and quantify lazy coding.
 It consists of
-39 python refactoring tasks,
-which tend to make GPT-4 Turbo very lazy,
-often resulting in comments like
+89 python refactoring tasks
+which tend to make GPT-4 Turbo very lazy.
+On these tasks it often produces comments like
 "...include the original method body...".
 
 This new laziness benchmark produced the following results with `gpt-4-1106-preview`:
 
-- **GPT-4 Turbo only scored 15% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format.
-- **Aider's new unified diff edit format raised the score to 65%**.
-- **No benefit from the user being blind, without hands, tipping $2000 or fearing truncated code trauma.** These widely circulated folk remedies performed no better than baseline when added to the system prompt with aider's SEARCH/REPLACE edit format. Including *all* of them still only scored at 15%
+- **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It output "lazy comments" on 12 of the tasks.
+- **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
+- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did *slightly* reduce the amount of laziness, but at a large cost to successful benchmark outcomes.
 
 The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs:
 
@@ -31,7 +33,7 @@ The older `gpt-4-0613` also did better on the laziness benchmark using unified d
 - **Aider's new unified diff edit format raised June GPT-4's score to 59%**. 
 - The benchmark was designed to use large files, and
 28% of them are too large to fit in June GPT-4's 8k context window.
-This significantly harmed the benchmark results.
+This puts a hard ceiling of 72% on how well the June model could possibly score.
 
 Before settling on unified diffs,
 I explored many other approaches including:
@@ -311,12 +313,14 @@ the ones with the most code and which involve refactoring.
 
 Based on this observation, I set out to build a benchmark based on refactoring
 a non-trivial amount of code found in fairly large files.
-To do this, I used python's `ast` module to analyze the
-[Django repository](https://github.com/django/django) to:
+To do this, I used python's `ast` module to analyze
+[9 popular open source python repositories](https://github.com/paul-gauthier/refactor-benchmark)
+to identify challenging refactoring tasks.
+The goal was to find:
 
-- Find source files that contain class methods which are non-trivial, having more than 100 AST nodes in their implementation.
+- Source files that contain class methods which are non-trivial, having 100-250+ AST nodes in their implementation.
 - Focus on methods that are part of a larger class, which has at least twice as much code as the method itself.
-- Find methods that don't use their `self` parameter, so they can be trivially refactored out of the class.
+- Select methods that don't use their `self` parameter, so they can be trivially refactored out of the class.
 
 We can then turn each of these source files into a task for the benchmark,
 where we ask GPT to do something like:
@@ -326,7 +330,7 @@ where we ask GPT to do something like:
 > Update any existing `self._set_csrf_cookie` calls to work with the new `_set_csrf_cookie` function.
 
 A [simple python AST scanning script](https://github.com/paul-gauthier/aider/blob/main/benchmark/refactor_tools.py)
-found 39 suitable files
+found 89 suitable files
 and packaged them up as benchmark tasks.
 Each task has a test
 that checks if refactor

From 3e639639d5f13f86c39134f152a9cf58f06e826a Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 11:43:42 -0800
Subject: [PATCH 09/22] copy

---
 docs/unified-diffs.md | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index fee1fdf69..70774a471 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -5,12 +5,12 @@
 
 
 Aider now asks GPT-4 Turbo to use
-[unified diffs](https://www.gnu.org/software/diffutils/manual/html_node/Example-Unified.html)
+[unified diffs](#choose-a-familiar-editing-format)
 to edit your code.
-This massively improves GPT-4 Turbo's performance on a complex benchmark 
+This dramatically improves GPT-4 Turbo's performance on a complex benchmark 
 and significantly reduces its bad habit of "lazy" coding,
 where it writes
-code filled with comments
+code with comments
 like "...add logic here...".
 
 Aider also has a new "laziness" benchmark suite 
@@ -25,7 +25,7 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev
 
 - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It output "lazy comments" on 12 of the tasks.
 - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
-- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did *slightly* reduce the amount of laziness, but at a large cost to successful benchmark outcomes.
+- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the system prompt for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did *slightly* reduce the amount of laziness, but at a large cost to successful benchmark outcomes.
 
 The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs:
 
@@ -296,11 +296,7 @@ If a hunk doesn't apply cleanly, aider uses a number of strategies:
 These flexible patching strategies are critical, and 
 removing them
 radically increases the number of hunks which fail to apply.
-
-**Experiments where flexible patching is disabled show**:
-
-- **GPT-4 Turbo's performance drops from 65% down to 56%** on the refactoring benchmark.
-- **A 9X increase in editing errors** on aider's original Exercism benchmark.
+**Experiments where flexible patching is disabled show a 9X increase in editing errors** on aider's original Exercism benchmark.
 
 ## Refactoring benchmark
 
@@ -355,8 +351,10 @@ The result is a pragmatic
 ## Conclusions and future work
 
 Based on the refactor benchmark results,
-aider's new unified diff format seems very effective at stopping
-GPT-4 Turbo from being a lazy coder.
+aider's new unified diff format seems
+to dramatically increase GPT-4 Turbo's skill at more complex coding tasks.
+It also seems very effective at reducing the lazy coding
+which has been widely noted as a problem with GPT-4 Turbo.
 
 Unified diffs was one of the very first edit formats I tried
 when originally building aider.

From 837fd9e30bd89d883fd8fe524de45bc8e584f04c Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 14:59:55 -0800
Subject: [PATCH 10/22] copy

---
 docs/unified-diffs.md | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index 70774a471..1e0a6646b 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -7,7 +7,9 @@
 Aider now asks GPT-4 Turbo to use
 [unified diffs](#choose-a-familiar-editing-format)
 to edit your code.
-This dramatically improves GPT-4 Turbo's performance on a complex benchmark 
+This dramatically improves GPT-4 Turbo's performance on a
+challenging
+new benchmark 
 and significantly reduces its bad habit of "lazy" coding,
 where it writes
 code with comments
@@ -17,15 +19,15 @@ Aider also has a new "laziness" benchmark suite
 designed to both provoke and quantify lazy coding.
 It consists of
 89 python refactoring tasks
-which tend to make GPT-4 Turbo very lazy.
-On these tasks it often produces comments like
+which tend to make GPT-4 Turbo lazy
+and write comments like
 "...include the original method body...".
 
 This new laziness benchmark produced the following results with `gpt-4-1106-preview`:
 
-- **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It output "lazy comments" on 12 of the tasks.
+- **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks.
 - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
-- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the system prompt for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did *slightly* reduce the amount of laziness, but at a large cost to successful benchmark outcomes.
+- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the system prompt for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did slightly reduce the amount of laziness against baseline (to 8 lazy tasks). It increased the lazy tasks to 5 when added to the unified diff prompt.
 
 The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs:
 

From d36c18f9dc616a873c724b0f4ce0597fc13907c0 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 15:10:18 -0800
Subject: [PATCH 11/22] copy

---
 docs/unified-diffs.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index 1e0a6646b..a44cded33 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -27,7 +27,19 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev
 
 - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks.
 - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
-- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the system prompt for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did slightly reduce the amount of laziness against baseline (to 8 lazy tasks). It increased the lazy tasks to 5 when added to the unified diff prompt.
+- **It's worse to add a prompt that the user is blind, has no hands, will tip $2000 and fears truncated code trauma.**
+
+The widely circulated "blind with no hands" type of folk remedies 
+performed worse on the benchmark when added to the system prompt.
+The benchmark scores dropped
+for the baseline SEARCH/REPLACE and new unified diff editing formats.
+These prompts did somewhat reduce the amount of laziness when used
+with the SEARCH/REPLACE edit format,
+from 12 to 8 lazy tasks.
+They slightly increased the lazy tasks from 4 to 5 when added to the unified diff prompt,
+which means they had roughly no effect on this format.
+But again, they seem to harm the overall ability of GPT-4 Turbo to complete
+the benchmark's refactoring coding tasks.
 
 The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs:
 

From 7028a533f1837aff0d7fa35dec3c8b0cdac7858a Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 15:30:15 -0800
Subject: [PATCH 12/22] copy

---
 docs/unified-diffs.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index a44cded33..b4a2fb27b 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -3,7 +3,6 @@
 
 ![robot flowchart](../assets/benchmarks-udiff.svg)
 
-
 Aider now asks GPT-4 Turbo to use
 [unified diffs](#choose-a-familiar-editing-format)
 to edit your code.
@@ -29,9 +28,10 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev
 - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
 - **It's worse to add a prompt that the user is blind, has no hands, will tip $2000 and fears truncated code trauma.**
 
-The widely circulated "blind with no hands" type of folk remedies 
-performed worse on the benchmark when added to the system prompt.
-The benchmark scores dropped
+These widely circulated "emotional appeal" folk remedies 
+produced worse benchmark scores.
+Adding *all* of these claims to the system prompt
+resulted in worse benchmark scores
 for the baseline SEARCH/REPLACE and new unified diff editing formats.
 These prompts did somewhat reduce the amount of laziness when used
 with the SEARCH/REPLACE edit format,

From 76c1deae6a7fa41fef1b0b958a46969a1fdd660e Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 15:58:24 -0800
Subject: [PATCH 13/22] improved test for toplevel refactored func

---
 benchmark/refactor_tools.py | 32 +++++++++++++++-----------------
 docs/unified-diffs.md       | 12 ++++++------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/benchmark/refactor_tools.py b/benchmark/refactor_tools.py
index 1046efb11..117770a67 100755
--- a/benchmark/refactor_tools.py
+++ b/benchmark/refactor_tools.py
@@ -21,25 +21,23 @@ class ParentNodeTransformer(ast.NodeTransformer):
 
 
 def verify_full_func_at_top_level(tree, func, func_children):
-    func_node = next(
-        (
-            item
-            for item in ast.walk(tree)
-            if isinstance(item, ast.FunctionDef) and item.name == func
-        ),
-        None,
-    )
-    assert func_node is not None, f"Function {func} not found"
+    func_nodes = [
+        item for item in ast.walk(tree) if isinstance(item, ast.FunctionDef) and item.name == func
+    ]
+    assert func_nodes, f"Function {func} not found"
 
-    assert isinstance(
-        func_node.parent, ast.Module
-    ), f"{func} is not a top level function, it has parent {func_node.parent}"
+    for func_node in func_nodes:
+        if not isinstance(func_node.parent, ast.Module):
+            continue
 
-    num_children = sum(1 for _ in ast.walk(func_node))
-    pct_diff_children = abs(num_children - func_children) * 100 / func_children
-    assert (
-        pct_diff_children < 10
-    ), f"Old method had {func_children} children, new method has {num_children}"
+        num_children = sum(1 for _ in ast.walk(func_node))
+        pct_diff_children = abs(num_children - func_children) * 100 / func_children
+        assert (
+            pct_diff_children < 10
+        ), f"Old method had {func_children} children, new method has {num_children}"
+        return
+
+    assert False, f"{func} is not a top level function"
 
 
 def verify_old_class_children(tree, old_class, old_class_children):
diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index b4a2fb27b..cfc1e5020 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -14,19 +14,19 @@ where it writes
 code with comments
 like "...add logic here...".
 
-Aider also has a new "laziness" benchmark suite 
+Aider's new "laziness" benchmark suite 
 designed to both provoke and quantify lazy coding.
 It consists of
 89 python refactoring tasks
 which tend to make GPT-4 Turbo lazy
 and write comments like
-"...include the original method body...".
+"...include original method body...".
 
 This new laziness benchmark produced the following results with `gpt-4-1106-preview`:
 
 - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks.
 - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
-- **It's worse to add a prompt that the user is blind, has no hands, will tip $2000 and fears truncated code trauma.**
+- **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.**
 
 These widely circulated "emotional appeal" folk remedies 
 produced worse benchmark scores.
@@ -328,7 +328,7 @@ To do this, I used python's `ast` module to analyze
 to identify challenging refactoring tasks.
 The goal was to find:
 
-- Source files that contain class methods which are non-trivial, having 100-250+ AST nodes in their implementation.
+- Source files that contain classes with non-trivial methods, having 100-250+ AST nodes in their implementation.
 - Focus on methods that are part of a larger class, which has at least twice as much code as the method itself.
 - Select methods that don't use their `self` parameter, so they can be trivially refactored out of the class.
 
@@ -343,10 +343,10 @@ A [simple python AST scanning script](https://github.com/paul-gauthier/aider/blo
 found 89 suitable files
 and packaged them up as benchmark tasks.
 Each task has a test
-that checks if refactor
+that checks if the refactor
 was performed roughly correctly:
 
-- The updated source file must parse as valid python, to surface misapplied edits which corrupt the file.
+- The updated source file must parse as valid python, to detect misapplied edits which produce invalid code.
 - The target method must now exist as a top-level function in the file.
 - This new top-level function must contain approximately the same number of AST nodes as the original class method. This ensures that GPT didn't elide code and replace it with comments.
 - The original class must still be present in the file, and it must be smaller by about the number of AST nodes in the method which was removed. This helps confirm that the method was removed from the class, without other significant modifications.

From ac280f54b3427c7641721dfcea106125a8fb3f88 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 20:12:27 -0600
Subject: [PATCH 14/22] version bump to 0.19.0

---
 aider/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aider/__init__.py b/aider/__init__.py
index eda726119..11ac8e1a9 100644
--- a/aider/__init__.py
+++ b/aider/__init__.py
@@ -1 +1 @@
-__version__ = "0.18.2-dev"
+__version__ = "0.19.0"

From 3a7998fdc6c646f07a68b356f25a46be7c55b359 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 20:13:09 -0600
Subject: [PATCH 15/22] set version to 0.19.1-dev

---
 aider/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aider/__init__.py b/aider/__init__.py
index 11ac8e1a9..37123159a 100644
--- a/aider/__init__.py
+++ b/aider/__init__.py
@@ -1 +1 @@
-__version__ = "0.19.0"
+__version__ = "0.19.1-dev"

From 67f8d873476ccb9ec6b0243bdf8c1a51ed18f5f9 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 20:20:27 -0600
Subject: [PATCH 16/22] copy

---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 1f7910e8f..bddf622ca 100644
--- a/README.md
+++ b/README.md
@@ -31,20 +31,19 @@ Aider is unique in that it lets you ask for changes to [pre-existing, larger cod
 - [FAQ](https://aider.chat/docs/faq.html)
 - [Discord](https://discord.gg/Tv2uQnR88V)
 
-## New GPT-4 Turbo with 128k context window
+## GPT-4 Turbo with 128k context and unified diffs
 
 Aider supports OpenAI's new GPT-4 model that has the massive 128k context window.
-Early benchmark results
-indicate that it is
-[very fast](https://aider.chat/docs/benchmarks-speed-1106.html)
-and a bit
-[better at coding](https://aider.chat/docs/benchmarks-1106.html)
-than previous GPT-4 models.
+Benchmark results indicate that it is
+[very fast](https://aider.chat/docs/benchmarks-speed-1106.html),
+and a bit [better at coding](https://aider.chat/docs/benchmarks-1106.html) than previous GPT-4 models.
+
+Aider now supports a [unified diffs editing format, which reduces GPT-4 Turbo's "lazy" coding](https://aider.chat/docs/unified-diffs.html)
 
 To use it, run aider like this:
 
 ```
-aider --model gpt-4-1106-preview
+aider --4-turbo
 ```
 
 ## Getting started

From dd389a42d1141fb88d26e4f96bb90e83d8cdb8b5 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 20:22:46 -0600
Subject: [PATCH 17/22] copy

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bddf622ca..6605f81b1 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ Benchmark results indicate that it is
 [very fast](https://aider.chat/docs/benchmarks-speed-1106.html),
 and a bit [better at coding](https://aider.chat/docs/benchmarks-1106.html) than previous GPT-4 models.
 
-Aider now supports a [unified diffs editing format, which reduces GPT-4 Turbo's "lazy" coding](https://aider.chat/docs/unified-diffs.html)
+Aider now supports a [unified diff editing format, which reduces GPT-4 Turbo's "lazy" coding](https://aider.chat/docs/unified-diffs.html).
 
 To use it, run aider like this:
 

From c5aba3bead798352d6726c082456a48e76049960 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 20:42:04 -0600
Subject: [PATCH 18/22] copy

---
 docs/unified-diffs.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index cfc1e5020..e5d2ca147 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -385,8 +385,9 @@ fine tuning models on
 aider's simple, high level style of unified diffs.
 Dropping line numbers from the hunk headers and focusing on diffs of
 semantically coherent chunks of code
-seems to be an important part of successful GPT code editing.
+seems to be an important part of successful GPT code editing
+(besides the relentless focus on flexibly applying edits).
 Most LLMs will have already seen plenty of unified diffs
 in their normal training data, and so should be
-very amenable to fining tuning towards this
+amenable to fine tuning towards this
 particular diff style.

From 8a3bde342f50f843e1cf445f89408d1495aca86b Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Tue, 19 Dec 2023 21:54:24 -0600
Subject: [PATCH 19/22] copy

---
 docs/unified-diffs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index e5d2ca147..3bd56835d 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -15,7 +15,7 @@ code with comments
 like "...add logic here...".
 
 Aider's new "laziness" benchmark suite 
-designed to both provoke and quantify lazy coding.
+is designed to both provoke and quantify lazy coding.
 It consists of
 89 python refactoring tasks
 which tend to make GPT-4 Turbo lazy

From 97fd65c6ceb48a90d829bd9b41ab84c10b42c504 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Wed, 20 Dec 2023 13:13:44 -0400
Subject: [PATCH 20/22] copy

---
 docs/unified-diffs.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index 3bd56835d..a6ab10235 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -28,9 +28,9 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev
 - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
 - **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.**
 
-These widely circulated "emotional appeal" folk remedies 
+Widely circulated "emotional appeal" folk remedies 
 produced worse benchmark scores.
-Adding *all* of these claims to the system prompt
+Adding *all* of the various emotional statements to the system prompt
 resulted in worse benchmark scores
 for the baseline SEARCH/REPLACE and new unified diff editing formats.
 These prompts did somewhat reduce the amount of laziness when used

From 7453624945e0b9b1a4e3eee9526279dceeba5f03 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Wed, 20 Dec 2023 14:29:57 -0400
Subject: [PATCH 21/22] copy

---
 docs/unified-diffs.md | 48 +++++++++++++++----------------------------
 1 file changed, 17 insertions(+), 31 deletions(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index a6ab10235..b8c6f78ff 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -26,20 +26,11 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev
 
 - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks.
 - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
-- **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.**
-
-Widely circulated "emotional appeal" folk remedies 
+- **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** Widely circulated "emotional appeal" folk remedies 
 produced worse benchmark scores.
 Adding *all* of the various emotional statements to the system prompt
 resulted in worse benchmark scores
 for the baseline SEARCH/REPLACE and new unified diff editing formats.
-These prompts did somewhat reduce the amount of laziness when used
-with the SEARCH/REPLACE edit format,
-from 12 to 8 lazy tasks.
-They slightly increased the lazy tasks from 4 to 5 when added to the unified diff prompt,
-which means they had roughly no effect on this format.
-But again, they seem to harm the overall ability of GPT-4 Turbo to complete
-the benchmark's refactoring coding tasks.
 
 The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs:
 
@@ -49,7 +40,20 @@ The older `gpt-4-0613` also did better on the laziness benchmark using unified d
 28% of them are too large to fit in June GPT-4's 8k context window.
 This puts a hard ceiling of 72% on how well the June model could possibly score.
 
-Before settling on unified diffs,
+With unified diffs, GPT acts more like it's writing textual data intended to be read by a program,
+not talking to a person.
+They are
+usually
+consumed by the
+[patch](https://www.gnu.org/software/diffutils/manual/html_node/Merging-with-patch.html)
+program, which is fairly rigid.
+This seems to encourage rigor, making
+GPT less likely to
+leave informal editing instructions in comments
+or be lazy about writing all the needed code.
+
+Aider's new unified diff editing format
+outperforms other solutions I evaluated by a wide margin.
 I explored many other approaches including:
 prompts about being tireless and diligent,
 OpenAI's function/tool calling capabilities,
@@ -59,8 +63,6 @@ and other diff-like formats.
 The results shared here reflect
 an extensive investigation and benchmark evaluations of many approaches.
 
-Aider's new unified diff editing format
-outperforms other solutions by a wide margin.
 The rest of this article will describe
 aider's new editing format and refactoring benchmark.
 It will highlight some key design decisions,
@@ -82,7 +84,8 @@ A helpful shortcut here is to have empathy for GPT, and imagine you
 are the one being asked to specify code edits.
 Would you want to hand type a properly escaped json data structure
 to invoke surgical insert, delete, replace operations on specific code line numbers?
-How would you feel about any mistake causing all your work to be discarded?
+Do you want to use a brittle format, where any mistake
+causes an error and all your work to be discarded?
 
 GPT is quantitatively better at code editing when you reduce the
 burden of formatting edits by using a familiar, simple, high level
@@ -110,23 +113,6 @@ seen *many* examples in its training data.
 It's been trained to generate
 text that conforms to the unified diff syntax.
 
-Unified diffs are
-usually intended to be consumed by the
-[patch](https://www.gnu.org/software/diffutils/manual/html_node/Merging-with-patch.html)
-program.
-They need to *accurately* reflect the original and updated file contents,
-otherwise the patch command will fail.
-Having GPT specify changes in a format that is usually consumed by a
-rigid program like patch
-seems to encourage rigor.
-GPT is less likely to
-leave informal editing instructions in comments
-or be lazy about writing all the needed code.
-
-With unified diffs, GPT acts more like it's writing textual data intended to be read by a program,
-not talking to a person.
-
-
 ### Use a simple editing format
 
 Aider's [previous benchmark results](https://aider.chat/docs/benchmarks.html) made

From 208f9ef24a66532b30cc7e54fa43e74ec838a73f Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Wed, 20 Dec 2023 14:31:04 -0400
Subject: [PATCH 22/22] copy

---
 docs/unified-diffs.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md
index b8c6f78ff..be5b5b42e 100644
--- a/docs/unified-diffs.md
+++ b/docs/unified-diffs.md
@@ -27,10 +27,8 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev
 - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks.
 - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
 - **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** Widely circulated "emotional appeal" folk remedies 
-produced worse benchmark scores.
-Adding *all* of the various emotional statements to the system prompt
-resulted in worse benchmark scores
-for the baseline SEARCH/REPLACE and new unified diff editing formats.
+produced worse benchmark scores
+for both the baseline SEARCH/REPLACE and new unified diff editing formats.
 
 The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs: