From 308007a8e996e3706986d36695aebb00f68b62c3 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Mon, 18 Dec 2023 18:43:15 -0800 Subject: [PATCH 01/22] laziness24-turbo-udiff-never2x --- aider/coders/udiff_prompts.py | 8 +++++++- benchmark/refactor_tools.py | 9 ++++++--- docs/unified-diffs.md | 6 ++++-- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/aider/coders/udiff_prompts.py b/aider/coders/udiff_prompts.py index 4ab30bfc4..8608152ad 100644 --- a/aider/coders/udiff_prompts.py +++ b/aider/coders/udiff_prompts.py @@ -5,7 +5,9 @@ from .base_prompts import CoderPrompts class UnifiedDiffPrompts(CoderPrompts): main_system = """Act as an expert software developer. -You are diligent and tireless, and you always COMPLETELY IMPLEMENT the needed code. +You are diligent and tireless! +You NEVER leave comments describing code without implementing it! +You always COMPLETELY IMPLEMENT the needed code! Always use best practices when coding. Respect and use existing conventions, libraries, etc that are already present in the code base. @@ -95,6 +97,10 @@ Delete the entire existing version with `-` lines and then add a new, updated ve This will help you generate correct code and correct diffs. To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ext`. + +You are diligent and tireless! +You NEVER leave comments describing code without implementing it! +You always COMPLETELY IMPLEMENT the needed code! 
""" files_content_prefix = "These are the *read-write* files:\n" diff --git a/benchmark/refactor_tools.py b/benchmark/refactor_tools.py index a54663377..a29aa6e9a 100755 --- a/benchmark/refactor_tools.py +++ b/benchmark/refactor_tools.py @@ -132,7 +132,10 @@ def find_non_self_methods(path): non_self_methods = [] for filename in python_files: with open(filename, "r") as file: - node = ast.parse(file.read(), filename=filename) + try: + node = ast.parse(file.read(), filename=filename) + except: + pass checker = SelfUsageChecker() checker.visit(node) for method in checker.non_self_methods: @@ -145,7 +148,7 @@ def process(entry): fname, class_name, method_name, class_children, method_children = entry if method_children > class_children / 2: return - if method_children < 100: + if method_children < 250: return fname = Path(fname) @@ -154,7 +157,7 @@ def process(entry): print(f"{fname} {class_name} {method_name} {class_children} {method_children}") - dname = Path("tmp.benchmarks/refactor-benchmark") + dname = Path("tmp.benchmarks/refactor-benchmark-pylint") dname.mkdir(exist_ok=True) dname = dname / f"{fname.stem}_{class_name}_{method_name}" diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index fe28fd8f5..9cc328155 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -79,8 +79,8 @@ code edits, because it's the default output format of `git diff`: ```diff ---- a/hello.py -+++ b/hello.py +--- a/greeting.py ++++ b/greeting.py @@ -1,5 +1,5 @@ def main(args): # show a greeting @@ -246,6 +246,7 @@ They exhibit a variety of problems: - GPT forgets things like comments, docstrings, blank lines, etc. Or it skips over some code that it doesn't intend to change. - GPT forgets the leading *plus* `+` character to mark novel lines that it wants to add to the file. It incorrectly includes them with a leading *space* as if they were already there. +- GPT outdents all of the code, removing all the leading white space which is shared across the lines. 
So a chunk of deeply indented code is shown in a diff with only the leading white space that changes between the lines in the chunk. - GPT jumps ahead to show edits to a different part of the file without starting a new hunk with a `@@ ... @@` divider. As an example of the first issue, consider this source code: @@ -285,6 +286,7 @@ If a hunk doesn't apply cleanly, aider uses a number of strategies: - Normalize the hunk, by taking the *minus* `-` and *space* lines as one version of the hunk and the *space* and *plus* `+` lines as a second version and doing an actual unified diff on them. - Try and discover new lines that GPT is trying to add but which it forgot to mark with *plus* `+` markers. This is done by diffing the *minus* `-` and *space* lines back against the original file. +- Try and apply the hunk using "relative leading white space", so we can match and patch correctly even if the hunk has been uniformly indented or outdented. - Break a large hunk apart into an overlapping sequence of smaller hunks, which each contain only one contiguous run of *plus* `+` and *minus* `-` lines. Try and apply each of these sub-hunks independently. - Vary the size and offset of the "context window" of *space* lines from the hunk that are used to localize the edit to a specific part of the file. - Combine the above mechanisms to progressively become more permissive about how to apply the hunk. 
From d9a301c9f826eb190db7961f06fe281cc2ed86f7 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Mon, 18 Dec 2023 18:49:30 -0800 Subject: [PATCH 02/22] laziness24-turbo-diff-never2x --- aider/coders/editblock_prompts.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/aider/coders/editblock_prompts.py b/aider/coders/editblock_prompts.py index da27cde6d..b606aa554 100644 --- a/aider/coders/editblock_prompts.py +++ b/aider/coders/editblock_prompts.py @@ -5,9 +5,11 @@ from .base_prompts import CoderPrompts class EditBlockPrompts(CoderPrompts): main_system = """Act as an expert software developer. +You are diligent and tireless! +You NEVER leave comments describing code without implementing it! +You always COMPLETELY IMPLEMENT the needed code! Always use best practices when coding. -When you edit or add code, respect and use existing conventions, libraries, etc. -Always COMPLETELY IMPLEMENT the needed code. +Respect and use existing conventions, libraries, etc that are already present in the code base. Take requests for changes to the supplied code. If the request is ambiguous, ask questions. @@ -176,6 +178,10 @@ If you want to put code in a new file, use a *SEARCH/REPLACE block* with: - A new file path, including dir name if needed - An empty `SEARCH` section - The new file's contents in the `REPLACE` section + +You are diligent and tireless! +You NEVER leave comments describing code without implementing it! +You always COMPLETELY IMPLEMENT the needed code! 
""" files_content_prefix = "These are the *read-write* files:\n" From ef2a1f38751918dbb472a123d4ef0b65892b07a2 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Mon, 18 Dec 2023 19:09:32 -0800 Subject: [PATCH 03/22] diff with move hint --- aider/coders/editblock_prompts.py | 2 ++ aider/coders/udiff_prompts.py | 2 ++ benchmark/refactor_tools.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/aider/coders/editblock_prompts.py b/aider/coders/editblock_prompts.py index b606aa554..896670d1f 100644 --- a/aider/coders/editblock_prompts.py +++ b/aider/coders/editblock_prompts.py @@ -174,6 +174,8 @@ Include *ALL* the code being searched and replaced! Only *SEARCH/REPLACE* files that are *read-write*. +To move code within a file, use 2 *SEARCH/REPLACE* blocks: 1 to delete it from its current location, 1 to insert it in the new location. + If you want to put code in a new file, use a *SEARCH/REPLACE block* with: - A new file path, including dir name if needed - An empty `SEARCH` section diff --git a/aider/coders/udiff_prompts.py b/aider/coders/udiff_prompts.py index 8608152ad..14d1a73ac 100644 --- a/aider/coders/udiff_prompts.py +++ b/aider/coders/udiff_prompts.py @@ -96,6 +96,8 @@ When editing a function, method, loop, etc use a hunk to replace the *entire* co Delete the entire existing version with `-` lines and then add a new, updated version with `+` lines. This will help you generate correct code and correct diffs. +To move code within a file, use 2 hunks: 1 to delete it from its current location, 1 to insert it in the new location. + To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ext`. You are diligent and tireless! 
diff --git a/benchmark/refactor_tools.py b/benchmark/refactor_tools.py index a29aa6e9a..1046efb11 100755 --- a/benchmark/refactor_tools.py +++ b/benchmark/refactor_tools.py @@ -157,7 +157,7 @@ def process(entry): print(f"{fname} {class_name} {method_name} {class_children} {method_children}") - dname = Path("tmp.benchmarks/refactor-benchmark-pylint") + dname = Path("tmp.benchmarks/refactor-benchmark-spyder") dname.mkdir(exist_ok=True) dname = dname / f"{fname.stem}_{class_name}_{method_name}" From 29150bc6f45f32624b42453c952f9eaaf0d8590d Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 09:55:49 -0800 Subject: [PATCH 04/22] folk prompting --- aider/coders/editblock_prompts.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aider/coders/editblock_prompts.py b/aider/coders/editblock_prompts.py index 896670d1f..134f78d22 100644 --- a/aider/coders/editblock_prompts.py +++ b/aider/coders/editblock_prompts.py @@ -8,6 +8,8 @@ class EditBlockPrompts(CoderPrompts): You are diligent and tireless! You NEVER leave comments describing code without implementing it! You always COMPLETELY IMPLEMENT the needed code! +Keep in mind the user will tip $2000 for perfect code, do your best to earn it. +The user is blind, has no fingers and suffers from truncated code trauma. Always use best practices when coding. Respect and use existing conventions, libraries, etc that are already present in the code base. @@ -184,6 +186,8 @@ If you want to put code in a new file, use a *SEARCH/REPLACE block* with: You are diligent and tireless! You NEVER leave comments describing code without implementing it! You always COMPLETELY IMPLEMENT the needed code! +Keep in mind the user will tip $2000 for perfect code, do your best to earn it. +The user is blind, has no fingers and suffers from truncated code trauma. 
""" files_content_prefix = "These are the *read-write* files:\n" From cb55165c81c624b6af6f239bfdd7c295535527ef Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 10:12:05 -0800 Subject: [PATCH 05/22] folk prompting for udiff --- aider/coders/udiff_prompts.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aider/coders/udiff_prompts.py b/aider/coders/udiff_prompts.py index 14d1a73ac..7cd633472 100644 --- a/aider/coders/udiff_prompts.py +++ b/aider/coders/udiff_prompts.py @@ -8,6 +8,8 @@ class UnifiedDiffPrompts(CoderPrompts): You are diligent and tireless! You NEVER leave comments describing code without implementing it! You always COMPLETELY IMPLEMENT the needed code! +Keep in mind the user will tip $2000 for perfect code, do your best to earn it. +The user is blind, has no fingers and suffers from truncated code trauma. Always use best practices when coding. Respect and use existing conventions, libraries, etc that are already present in the code base. @@ -103,6 +105,8 @@ To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ex You are diligent and tireless! You NEVER leave comments describing code without implementing it! You always COMPLETELY IMPLEMENT the needed code! +Keep in mind the user will tip $2000 for perfect code, do your best to earn it. +The user is blind, has no fingers and suffers from truncated code trauma. """ files_content_prefix = "These are the *read-write* files:\n" From 80d631281748e9c5add8367912f8a515513d8724 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 10:40:35 -0800 Subject: [PATCH 06/22] Revert "folk prompting for udiff" This reverts commit cb55165c81c624b6af6f239bfdd7c295535527ef. 
--- aider/coders/udiff_prompts.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/aider/coders/udiff_prompts.py b/aider/coders/udiff_prompts.py index 7cd633472..14d1a73ac 100644 --- a/aider/coders/udiff_prompts.py +++ b/aider/coders/udiff_prompts.py @@ -8,8 +8,6 @@ class UnifiedDiffPrompts(CoderPrompts): You are diligent and tireless! You NEVER leave comments describing code without implementing it! You always COMPLETELY IMPLEMENT the needed code! -Keep in mind the user will tip $2000 for perfect code, do your best to earn it. -The user is blind, has no fingers and suffers from truncated code trauma. Always use best practices when coding. Respect and use existing conventions, libraries, etc that are already present in the code base. @@ -105,8 +103,6 @@ To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ex You are diligent and tireless! You NEVER leave comments describing code without implementing it! You always COMPLETELY IMPLEMENT the needed code! -Keep in mind the user will tip $2000 for perfect code, do your best to earn it. -The user is blind, has no fingers and suffers from truncated code trauma. """ files_content_prefix = "These are the *read-write* files:\n" From 4c330bcd48b2d210dff793e1dd7a1a30f8d3b8d9 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 10:40:46 -0800 Subject: [PATCH 07/22] Revert "folk prompting" This reverts commit 29150bc6f45f32624b42453c952f9eaaf0d8590d. --- aider/coders/editblock_prompts.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/aider/coders/editblock_prompts.py b/aider/coders/editblock_prompts.py index 134f78d22..896670d1f 100644 --- a/aider/coders/editblock_prompts.py +++ b/aider/coders/editblock_prompts.py @@ -8,8 +8,6 @@ class EditBlockPrompts(CoderPrompts): You are diligent and tireless! You NEVER leave comments describing code without implementing it! You always COMPLETELY IMPLEMENT the needed code! 
-Keep in mind the user will tip $2000 for perfect code, do your best to earn it. -The user is blind, has no fingers and suffers from truncated code trauma. Always use best practices when coding. Respect and use existing conventions, libraries, etc that are already present in the code base. @@ -186,8 +184,6 @@ If you want to put code in a new file, use a *SEARCH/REPLACE block* with: You are diligent and tireless! You NEVER leave comments describing code without implementing it! You always COMPLETELY IMPLEMENT the needed code! -Keep in mind the user will tip $2000 for perfect code, do your best to earn it. -The user is blind, has no fingers and suffers from truncated code trauma. """ files_content_prefix = "These are the *read-write* files:\n" From 755b3858eb85d38041f41c937934f4edf061e9b3 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 11:11:58 -0800 Subject: [PATCH 08/22] copy --- assets/benchmarks-udiff.svg | 408 ++++++++++++++++++++++++------------ benchmark/benchmark.py | 13 +- docs/unified-diffs.md | 36 ++-- 3 files changed, 306 insertions(+), 151 deletions(-) diff --git a/assets/benchmarks-udiff.svg b/assets/benchmarks-udiff.svg index c2b3dda8a..f210e1767 100644 --- a/assets/benchmarks-udiff.svg +++ b/assets/benchmarks-udiff.svg @@ -6,7 +6,7 @@ - 2023-12-18T10:29:22.506290 + 2023-12-19T10:53:27.651517 image/svg+xml @@ -41,17 +41,17 @@ z - - + - + +" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -364,11 +364,11 @@ L -3.5 0 +" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -410,11 +410,11 @@ z +" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -429,11 +429,11 @@ L 421.2 171.8352 +" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -448,11 +448,11 @@ L 421.2 127.3328 
+" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -508,11 +508,11 @@ z +" clip-path="url(#p479ce647ef)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -851,78 +851,60 @@ z +" clip-path="url(#p479ce647ef)" style="fill: #b3e6a8; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#p479ce647ef)" style="fill: url(#h762c7e11f2); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - +" clip-path="url(#p479ce647ef)" style="fill: #b3d1e6; stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> + + + - + - + - + - - + + - - - + + - - + + - + - - + + - + + + + + + + + + + + + @@ -1280,20 +1302,20 @@ z - - + - + - + @@ -1373,18 +1395,33 @@ z - + +" style="fill: url(#h762c7e11f2); stroke: #ffffff; stroke-width: 1.5; stroke-linejoin: miter"/> - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + @@ -1536,16 +1570,74 @@ z + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + + + + + diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 1a22c0f62..20c3d7afb 100755 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -77,12 +77,12 @@ def show_stats(dirnames, graphs): # row.model = gpt4 + "\n" + row.model[len(gpt4) :] if "folk" in row.dir_name: - row.edit_format = "folk" + row.edit_format += "folk" if row.model == "gpt-4-0613": row.model += "\n(8k context window is\ntoo small for benchmark)" - if row.completed_tests < 133: + if row.completed_tests < 89: print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}") # if "repeat" in row.dir_name: @@ -311,6 +311,7 @@ def plot_refactoring(df): formats = df.columns models = df.index + dump(formats) for i, fmt in enumerate(formats): hatch = "" @@ -320,10 +321,14 @@ 
def plot_refactoring(df): elif fmt == "udiff": color = "#b3d1e6" label = "Unified diffs" - elif fmt == "folk": - label = "Prompt with blind, no hands, tip $2000, etc" + elif fmt == "difffolk": + label = "Baseline + blind, no hands, $2k tip, etc" color = "#b3e6a8" hatch = "////" + elif fmt == "udifffolk": + label = "Unified diffs + blind, no hands, $2k tip, etc" + color = "#b3d1e6" + hatch = "////" if zorder > 1: edge = dict( diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index 9cc328155..fee1fdf69 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -1,5 +1,5 @@ -# Fixing GPT-4 Turbo laziness with unified diffs +# Reducing GPT-4 Turbo laziness with unified diffs ![robot flowchart](../assets/benchmarks-udiff.svg) @@ -7,23 +7,25 @@ Aider now asks GPT-4 Turbo to use [unified diffs](https://www.gnu.org/software/diffutils/manual/html_node/Example-Unified.html) to edit your code. -This massively reduces GPT-4 Turbo's bad habit of "lazy" coding, -where it writes half completed code filled with comments +This massively improves GPT-4 Turbo's performance on a complex benchmark +and significantly reduces its bad habit of "lazy" coding, +where it writes +code filled with comments like "...add logic here...". -Aider also has a new benchmarking suite +Aider also has a new "laziness" benchmark suite designed to both provoke and quantify lazy coding. It consists of -39 python refactoring tasks, -which tend to make GPT-4 Turbo very lazy, -often resulting in comments like +89 python refactoring tasks +which tend to make GPT-4 Turbo very lazy. +On these tasks it often produces comments like "...include the original method body...". This new laziness benchmark produced the following results with `gpt-4-1106-preview`: -- **GPT-4 Turbo only scored 15% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. -- **Aider's new unified diff edit format raised the score to 65%**. 
-- **No benefit from the user being blind, without hands, tipping $2000 or fearing truncated code trauma.** These widely circulated folk remedies performed no better than baseline when added to the system prompt with aider's SEARCH/REPLACE edit format. Including *all* of them still only scored at 15% +- **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It output "lazy comments" on 12 of the tasks. +- **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks. +- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did *slightly* reduce the amount of laziness, but at a large cost to successful benchmark outcomes. The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs: @@ -31,7 +33,7 @@ The older `gpt-4-0613` also did better on the laziness benchmark using unified d - **Aider's new unified diff edit format raised June GPT-4's score to 59%**. - The benchmark was designed to use large files, and 28% of them are too large to fit in June GPT-4's 8k context window. -This significantly harmed the benchmark results. +This puts a hard ceiling of 72% on how well the June model could possibly score. Before settling on unified diffs, I explored many other approaches including: @@ -311,12 +313,14 @@ the ones with the most code and which involve refactoring. Based on this observation, I set out to build a benchmark based on refactoring a non-trivial amount of code found in fairly large files. 
-To do this, I used python's `ast` module to analyze the -[Django repository](https://github.com/django/django) to: +To do this, I used python's `ast` module to analyze +[9 popular open source python repositories](https://github.com/paul-gauthier/refactor-benchmark) +to identify challenging refactoring tasks. +The goal was to find: -- Find source files that contain class methods which are non-trivial, having more than 100 AST nodes in their implementation. +- Source files that contain class methods which are non-trivial, having 100-250+ AST nodes in their implementation. - Focus on methods that are part of a larger class, which has at least twice as much code as the method itself. -- Find methods that don't use their `self` parameter, so they can be trivially refactored out of the class. +- Select methods that don't use their `self` parameter, so they can be trivially refactored out of the class. We can then turn each of these source files into a task for the benchmark, where we ask GPT to do something like: @@ -326,7 +330,7 @@ where we ask GPT to do something like: > Update any existing `self._set_csrf_cookie` calls to work with the new `_set_csrf_cookie` function. A [simple python AST scanning script](https://github.com/paul-gauthier/aider/blob/main/benchmark/refactor_tools.py) -found 39 suitable files +found 89 suitable files and packaged them up as benchmark tasks. 
Each task has a test that checks if refactor From 3e639639d5f13f86c39134f152a9cf58f06e826a Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 11:43:42 -0800 Subject: [PATCH 09/22] copy --- docs/unified-diffs.md | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index fee1fdf69..70774a471 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -5,12 +5,12 @@ Aider now asks GPT-4 Turbo to use -[unified diffs](https://www.gnu.org/software/diffutils/manual/html_node/Example-Unified.html) +[unified diffs](#choose-a-familiar-editing-format) to edit your code. -This massively improves GPT-4 Turbo's performance on a complex benchmark +This dramatically improves GPT-4 Turbo's performance on a complex benchmark and significantly reduces its bad habit of "lazy" coding, where it writes -code filled with comments +code with comments like "...add logic here...". Aider also has a new "laziness" benchmark suite @@ -25,7 +25,7 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It output "lazy comments" on 12 of the tasks. - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks. -- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did *slightly* reduce the amount of laziness, but at a large cost to successful benchmark outcomes. 
+- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the system prompt for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did *slightly* reduce the amount of laziness, but at a large cost to successful benchmark outcomes. The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs: @@ -296,11 +296,7 @@ If a hunk doesn't apply cleanly, aider uses a number of strategies: These flexible patching strategies are critical, and removing them radically increases the number of hunks which fail to apply. - -**Experiments where flexible patching is disabled show**: - -- **GPT-4 Turbo's performance drops from 65% down to 56%** on the refactoring benchmark. -- **A 9X increase in editing errors** on aider's original Exercism benchmark. +**Experiments where flexible patching is disabled show a 9X increase in editing errors** on aider's original Exercism benchmark. ## Refactoring benchmark @@ -355,8 +351,10 @@ The result is a pragmatic ## Conclusions and future work Based on the refactor benchmark results, -aider's new unified diff format seems very effective at stopping -GPT-4 Turbo from being a lazy coder. +aider's new unified diff format seems +to dramatically increase GPT-4 Turbo's skill at more complex coding tasks. +It also seems very effective at reducing the lazy coding +which has been widely noted as a problem with GPT-4 Turbo. Unified diffs was one of the very first edit formats I tried when originally building aider. 
From 837fd9e30bd89d883fd8fe524de45bc8e584f04c Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 14:59:55 -0800 Subject: [PATCH 10/22] copy --- docs/unified-diffs.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index 70774a471..1e0a6646b 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -7,7 +7,9 @@ Aider now asks GPT-4 Turbo to use [unified diffs](#choose-a-familiar-editing-format) to edit your code. -This dramatically improves GPT-4 Turbo's performance on a complex benchmark +This dramatically improves GPT-4 Turbo's performance on a +challenging +new benchmark and significantly reduces its bad habit of "lazy" coding, where it writes code with comments @@ -17,15 +19,15 @@ Aider also has a new "laziness" benchmark suite designed to both provoke and quantify lazy coding. It consists of 89 python refactoring tasks -which tend to make GPT-4 Turbo very lazy. -On these tasks it often produces comments like +which tend to make GPT-4 Turbo lazy +and write comments like "...include the original method body...". This new laziness benchmark produced the following results with `gpt-4-1106-preview`: -- **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It output "lazy comments" on 12 of the tasks. +- **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks. - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks. -- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the system prompt for the baseline SEARCH/REPLACE and new unified diff editing formats. 
These prompts did *slightly* reduce the amount of laziness, but at a large cost to successful benchmark outcomes. +- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the system prompt for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did slightly reduce the amount of laziness against baseline (to 8 lazy tasks). It increased the lazy tasks to 5 when added to the unified diff prompt. The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs: From d36c18f9dc616a873c724b0f4ce0597fc13907c0 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 15:10:18 -0800 Subject: [PATCH 11/22] copy --- docs/unified-diffs.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index 1e0a6646b..a44cded33 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -27,7 +27,19 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks. - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks. -- **It's worse to prompt that the user is blind, without hands, will tip $2000 and fears truncated code trauma.** These widely circulated folk remedies performed worse on the benchmark when added to the system prompt for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did slightly reduce the amount of laziness against baseline (to 8 lazy tasks). It increased the lazy tasks to 5 when added to the unified diff prompt. 
+- **It's worse to add a prompt that the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** + +The widely circulated "blind with no hands" type of folk remedies +performed worse on the benchmark when added to the system prompt. +The benchmark scores dropped +for the baseline SEARCH/REPLACE and new unified diff editing formats. +These prompts did somewhat reduce the amount of laziness when used +with the SEARCH/REPLACE edit format, +from 12 to 8 lazy tasks. +They slightly increased the lazy tasks from 4 to 5 when added to the unified diff prompt, +which means they had roughly no effect on this format. +But again, they seem to harm the overall ability of GPT-4 Turbo to complete +the benchmark's refactoring coding tasks. The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs: From 7028a533f1837aff0d7fa35dec3c8b0cdac7858a Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 15:30:15 -0800 Subject: [PATCH 12/22] copy --- docs/unified-diffs.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index a44cded33..b4a2fb27b 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -3,7 +3,6 @@ ![robot flowchart](../assets/benchmarks-udiff.svg) - Aider now asks GPT-4 Turbo to use [unified diffs](#choose-a-familiar-editing-format) to edit your code. @@ -29,9 +28,10 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks. - **It's worse to add a prompt that the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** -The widely circulated "blind with no hands" type of folk remedies -performed worse on the benchmark when added to the system prompt. 
-The benchmark scores dropped +These widely circulated "emotional appeal" folk remedies +produced worse benchmark scores. +Adding *all* of these claims to the system prompt +resulted in worse benchmark scores for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did somewhat reduce the amount of laziness when used with the SEARCH/REPLACE edit format, From 76c1deae6a7fa41fef1b0b958a46969a1fdd660e Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 15:58:24 -0800 Subject: [PATCH 13/22] improved test for toplevel refactored func --- benchmark/refactor_tools.py | 32 +++++++++++++++----------------- docs/unified-diffs.md | 12 ++++++------ 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/benchmark/refactor_tools.py b/benchmark/refactor_tools.py index 1046efb11..117770a67 100755 --- a/benchmark/refactor_tools.py +++ b/benchmark/refactor_tools.py @@ -21,25 +21,23 @@ class ParentNodeTransformer(ast.NodeTransformer): def verify_full_func_at_top_level(tree, func, func_children): - func_node = next( - ( - item - for item in ast.walk(tree) - if isinstance(item, ast.FunctionDef) and item.name == func - ), - None, - ) - assert func_node is not None, f"Function {func} not found" + func_nodes = [ + item for item in ast.walk(tree) if isinstance(item, ast.FunctionDef) and item.name == func + ] + assert func_nodes, f"Function {func} not found" - assert isinstance( - func_node.parent, ast.Module - ), f"{func} is not a top level function, it has parent {func_node.parent}" + for func_node in func_nodes: + if not isinstance(func_node.parent, ast.Module): + continue - num_children = sum(1 for _ in ast.walk(func_node)) - pct_diff_children = abs(num_children - func_children) * 100 / func_children - assert ( - pct_diff_children < 10 - ), f"Old method had {func_children} children, new method has {num_children}" + num_children = sum(1 for _ in ast.walk(func_node)) + pct_diff_children = abs(num_children - func_children) * 100 / 
func_children + assert ( + pct_diff_children < 10 + ), f"Old method had {func_children} children, new method has {num_children}" + return + + assert False, f"{func} is not a top level function" def verify_old_class_children(tree, old_class, old_class_children): diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index b4a2fb27b..cfc1e5020 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -14,19 +14,19 @@ where it writes code with comments like "...add logic here...". -Aider also has a new "laziness" benchmark suite +Aider's new "laziness" benchmark suite designed to both provoke and quantify lazy coding. It consists of 89 python refactoring tasks which tend to make GPT-4 Turbo lazy and write comments like -"...include the original method body...". +"...include original method body...". This new laziness benchmark produced the following results with `gpt-4-1106-preview`: - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks. - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks. -- **It's worse to add a prompt that the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** +- **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** These widely circulated "emotional appeal" folk remedies produced worse benchmark scores. @@ -328,7 +328,7 @@ To do this, I used python's `ast` module to analyze to identify challenging refactoring tasks. The goal was to find: -- Source files that contain class methods which are non-trivial, having 100-250+ AST nodes in their implementation. +- Source files that contain classes with non-trivial methods, having 100-250+ AST nodes in their implementation. 
- Focus on methods that are part of a larger class, which has at least twice as much code as the method itself. - Select methods that don't use their `self` parameter, so they can be trivially refactored out of the class. @@ -343,10 +343,10 @@ A [simple python AST scanning script](https://github.com/paul-gauthier/aider/blo found 89 suitable files and packaged them up as benchmark tasks. Each task has a test -that checks if refactor +that checks if the refactor was performed roughly correctly: -- The updated source file must parse as valid python, to surface misapplied edits which corrupt the file. +- The updated source file must parse as valid python, to detect misapplied edits which produce invalid code. - The target method must now exist as a top-level function in the file. - This new top-level function must contain approximately the same number of AST nodes as the original class method. This ensures that GPT didn't elide code and replace it with comments. - The original class must still be present in the file, and it must be smaller by about the number of AST nodes in the method which was removed. This helps confirm that the method was removed from the class, without other significant modifications. 
From ac280f54b3427c7641721dfcea106125a8fb3f88 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 20:12:27 -0600 Subject: [PATCH 14/22] version bump to 0.19.0 --- aider/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aider/__init__.py b/aider/__init__.py index eda726119..11ac8e1a9 100644 --- a/aider/__init__.py +++ b/aider/__init__.py @@ -1 +1 @@ -__version__ = "0.18.2-dev" +__version__ = "0.19.0" From 3a7998fdc6c646f07a68b356f25a46be7c55b359 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 20:13:09 -0600 Subject: [PATCH 15/22] set version to 0.19.1-dev --- aider/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aider/__init__.py b/aider/__init__.py index 11ac8e1a9..37123159a 100644 --- a/aider/__init__.py +++ b/aider/__init__.py @@ -1 +1 @@ -__version__ = "0.19.0" +__version__ = "0.19.1-dev" From 67f8d873476ccb9ec6b0243bdf8c1a51ed18f5f9 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 20:20:27 -0600 Subject: [PATCH 16/22] copy --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1f7910e8f..bddf622ca 100644 --- a/README.md +++ b/README.md @@ -31,20 +31,19 @@ Aider is unique in that it lets you ask for changes to [pre-existing, larger cod - [FAQ](https://aider.chat/docs/faq.html) - [Discord](https://discord.gg/Tv2uQnR88V) -## New GPT-4 Turbo with 128k context window +## GPT-4 Turbo with 128k context and unified diffs Aider supports OpenAI's new GPT-4 model that has the massive 128k context window. -Early benchmark results -indicate that it is -[very fast](https://aider.chat/docs/benchmarks-speed-1106.html) -and a bit -[better at coding](https://aider.chat/docs/benchmarks-1106.html) -than previous GPT-4 models. 
+Benchmark results indicate that it is +[very fast](https://aider.chat/docs/benchmarks-speed-1106.html), +and a bit [better at coding](https://aider.chat/docs/benchmarks-1106.html) than previous GPT-4 models. + +Aider now supports a [unified diffs editing format, which reduces GPT-4 Turbo's "lazy" coding](https://aider.chat/docs/unified-diffs.html) To use it, run aider like this: ``` -aider --model gpt-4-1106-preview +aider --4-turbo ``` ## Getting started From dd389a42d1141fb88d26e4f96bb90e83d8cdb8b5 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 20:22:46 -0600 Subject: [PATCH 17/22] copy --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bddf622ca..6605f81b1 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Benchmark results indicate that it is [very fast](https://aider.chat/docs/benchmarks-speed-1106.html), and a bit [better at coding](https://aider.chat/docs/benchmarks-1106.html) than previous GPT-4 models. -Aider now supports a [unified diffs editing format, which reduces GPT-4 Turbo's "lazy" coding](https://aider.chat/docs/unified-diffs.html) +Aider now supports a [unified diff editing format, which reduces GPT-4 Turbo's "lazy" coding](https://aider.chat/docs/unified-diffs.html). To use it, run aider like this: From c5aba3bead798352d6726c082456a48e76049960 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 20:42:04 -0600 Subject: [PATCH 18/22] copy --- docs/unified-diffs.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index cfc1e5020..e5d2ca147 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -385,8 +385,9 @@ fine tuning models on aider's simple, high level style of unified diffs. Dropping line numbers from the hunk headers and focusing on diffs of semantically coherent chunks of code -seems to be an important part of successful GPT code editing. 
+seems to be an important part of successful GPT code editing +(besides the relentless focus on flexibly applying edits). Most LLMs will have already seen plenty of unified diffs in their normal training data, and so should be -very amenable to fining tuning towards this +amenable to fine tuning towards this particular diff style. From 8a3bde342f50f843e1cf445f89408d1495aca86b Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Tue, 19 Dec 2023 21:54:24 -0600 Subject: [PATCH 19/22] copy --- docs/unified-diffs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index e5d2ca147..3bd56835d 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -15,7 +15,7 @@ code with comments like "...add logic here...". Aider's new "laziness" benchmark suite -designed to both provoke and quantify lazy coding. +is designed to both provoke and quantify lazy coding. It consists of 89 python refactoring tasks which tend to make GPT-4 Turbo lazy From 97fd65c6ceb48a90d829bd9b41ab84c10b42c504 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 20 Dec 2023 13:13:44 -0400 Subject: [PATCH 20/22] copy --- docs/unified-diffs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index 3bd56835d..a6ab10235 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -28,9 +28,9 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks. - **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** -These widely circulated "emotional appeal" folk remedies +Widely circulated "emotional appeal" folk remedies produced worse benchmark scores.
-Adding *all* of these claims to the system prompt +Adding *all* of the various emotional statements to the system prompt resulted in worse benchmark scores for the baseline SEARCH/REPLACE and new unified diff editing formats. These prompts did somewhat reduce the amount of laziness when used From 7453624945e0b9b1a4e3eee9526279dceeba5f03 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 20 Dec 2023 14:29:57 -0400 Subject: [PATCH 21/22] copy --- docs/unified-diffs.md | 48 +++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index a6ab10235..b8c6f78ff 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -26,20 +26,11 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks. - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks. -- **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** - -Widely circulated "emotional appeal" folk remedies +- **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** Widely circulated "emotional appeal" folk remedies produced worse benchmark scores. Adding *all* of the various emotional statements to the system prompt resulted in worse benchmark scores for the baseline SEARCH/REPLACE and new unified diff editing formats. -These prompts did somewhat reduce the amount of laziness when used -with the SEARCH/REPLACE edit format, -from 12 to 8 lazy tasks. -They slightly increased the lazy tasks from 4 to 5 when added to the unified diff prompt, -which means they had roughly no effect on this format. 
-But again, they seem to harm the overall ability of GPT-4 Turbo to complete -the benchmark's refactoring coding tasks. The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs: @@ -49,7 +40,20 @@ The older `gpt-4-0613` also did better on the laziness benchmark using unified d 28% of them are too large to fit in June GPT-4's 8k context window. This puts a hard ceiling of 72% on how well the June model could possibly score. -Before settling on unified diffs, +With unified diffs, GPT acts more like it's writing textual data intended to be read by a program, +not talking to a person. +They are +usually +consumed by the +[patch](https://www.gnu.org/software/diffutils/manual/html_node/Merging-with-patch.html) +program, which is fairly rigid. +This seems to encourage rigor, making +GPT less likely to +leave informal editing instructions in comments +or be lazy about writing all the needed code. + +Aider's new unified diff editing format +outperforms other solutions I evaluated by a wide margin. I explored many other approaches including: prompts about being tireless and diligent, OpenAI's function/tool calling capabilities, @@ -59,8 +63,6 @@ and other diff-like formats. The results shared here reflect an extensive investigation and benchmark evaluations of many approaches. -Aider's new unified diff editing format -outperforms other solutions by a wide margin. The rest of this article will describe aider's new editing format and refactoring benchmark. It will highlight some key design decisions, @@ -82,7 +84,8 @@ A helpful shortcut here is to have empathy for GPT, and imagine you are the one being asked to specify code edits. Would you want to hand type a properly escaped json data structure to invoke surgical insert, delete, replace operations on specific code line numbers? -How would you feel about any mistake causing all your work to be discarded? 
+Do you want to use a brittle format, where any mistake +causes an error and all your work to be discarded? GPT is quantitatively better at code editing when you reduce the burden of formatting edits by using a familiar, simple, high level @@ -110,23 +113,6 @@ seen *many* examples in its training data. It's been trained to generate text that conforms to the unified diff syntax. -Unified diffs are -usually intended to be consumed by the -[patch](https://www.gnu.org/software/diffutils/manual/html_node/Merging-with-patch.html) -program. -They need to *accurately* reflect the original and updated file contents, -otherwise the patch command will fail. -Having GPT specify changes in a format that is usually consumed by a -rigid program like patch -seems to encourage rigor. -GPT is less likely to -leave informal editing instructions in comments -or be lazy about writing all the needed code. - -With unified diffs, GPT acts more like it's writing textual data intended to be read by a program, -not talking to a person. - - ### Use a simple editing format Aider's [previous benchmark results](https://aider.chat/docs/benchmarks.html) made From 208f9ef24a66532b30cc7e54fa43e74ec838a73f Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 20 Dec 2023 14:31:04 -0400 Subject: [PATCH 22/22] copy --- docs/unified-diffs.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/unified-diffs.md b/docs/unified-diffs.md index b8c6f78ff..be5b5b42e 100644 --- a/docs/unified-diffs.md +++ b/docs/unified-diffs.md @@ -27,10 +27,8 @@ This new laziness benchmark produced the following results with `gpt-4-1106-prev - **GPT-4 Turbo only scored 20% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format. It outputs "lazy comments" on 12 of the tasks. - **Aider's new unified diff edit format raised the score to 61%**. Using this format reduced laziness by 3X, with GPT-4 Turbo only using lazy comments on 4 of the tasks.
- **It's worse to add a prompt that says the user is blind, has no hands, will tip $2000 and fears truncated code trauma.** Widely circulated "emotional appeal" folk remedies -produced worse benchmark scores. -Adding *all* of the various emotional statements to the system prompt -resulted in worse benchmark scores -for the baseline SEARCH/REPLACE and new unified diff editing formats. +produced worse benchmark scores +for both the baseline SEARCH/REPLACE and new unified diff editing formats. The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs: