Mirror of https://github.com/Aider-AI/aider.git (synced 2025-05-24 22:34:59 +00:00)

Commit 179b648864: merging from upstream main
29 changed files with 3810 additions and 114 deletions
HISTORY.md:

@@ -1,5 +1,9 @@
 # Release history
 
+### v0.18.1
+
+- Upgraded to new openai python client v1.3.7.
+
 ### v0.18.0
 
 - Improved prompting for both GPT-4 and GPT-4 Turbo.
aider/__init__.py:

@@ -1 +1 @@
-__version__ = "0.18.1-dev"
+__version__ = "0.18.2-dev"
aider/coders/__init__.py:

@@ -2,6 +2,7 @@ from .base_coder import Coder
 from .editblock_coder import EditBlockCoder
 from .editblock_func_coder import EditBlockFunctionCoder
 from .single_wholefile_func_coder import SingleWholeFileFunctionCoder
+from .udiff_coder import UnifiedDiffCoder
 from .wholefile_coder import WholeFileCoder
 from .wholefile_func_coder import WholeFileFunctionCoder

@@ -12,4 +13,5 @@ __all__ = [
     WholeFileFunctionCoder,
     EditBlockFunctionCoder,
     SingleWholeFileFunctionCoder,
+    UnifiedDiffCoder,
 ]
aider/coders/base_coder.py:

@@ -50,7 +50,10 @@ class Coder:
     functions = None
     total_cost = 0.0
     num_exhausted_context_windows = 0
+    num_malformed_responses = 0
     last_keyboard_interrupt = None
+    max_apply_update_errors = 3
+    edit_format = None
 
     @classmethod
     def create(
@@ -62,7 +65,7 @@ class Coder:
         skip_model_availabily_check=False,
         **kwargs,
     ):
-        from . import EditBlockCoder, WholeFileCoder
+        from . import EditBlockCoder, UnifiedDiffCoder, WholeFileCoder
 
         if not main_model:
             main_model = models.GPT4
@@ -84,6 +87,8 @@ class Coder:
             return EditBlockCoder(client, main_model, io, **kwargs)
         elif edit_format == "whole":
             return WholeFileCoder(client, main_model, io, **kwargs)
+        elif edit_format == "udiff":
+            return UnifiedDiffCoder(client, main_model, io, **kwargs)
         else:
             raise ValueError(f"Unknown edit format {edit_format}")
 
@@ -146,7 +151,7 @@ class Coder:
 
         self.main_model = main_model
 
-        self.io.tool_output(f"Model: {main_model.name}")
+        self.io.tool_output(f"Model: {main_model.name} using {self.edit_format} edit format")
 
         self.show_diffs = show_diffs
 
@@ -175,7 +180,8 @@ class Coder:
 
         if self.repo:
             rel_repo_dir = self.repo.get_rel_repo_dir()
-            self.io.tool_output(f"Git repo: {rel_repo_dir}")
+            num_files = len(self.repo.get_tracked_files())
+            self.io.tool_output(f"Git repo: {rel_repo_dir} with {num_files} files")
         else:
             self.io.tool_output("Git repo: none")
         self.find_common_root()
@@ -298,7 +304,13 @@ class Coder:
             prompt += "\n"
             prompt += relative_fname
             prompt += f"\n{self.fence[0]}\n"
+
             prompt += content
+
+            # lines = content.splitlines(keepends=True)
+            # lines = [f"{i+1:03}:{line}" for i, line in enumerate(lines)]
+            # prompt += "".join(lines)
+
             prompt += f"{self.fence[1]}\n"
 
         return prompt
@@ -376,7 +388,7 @@ class Coder:
                     new_user_message = self.send_new_user_message(new_user_message)
 
                 if with_message:
-                    return
+                    return self.partial_response_content
 
             except KeyboardInterrupt:
                 self.keyboard_interrupt()
@@ -488,12 +500,12 @@ class Coder:
             # add the reminder anyway
             total_tokens = 0
 
+        messages += self.cur_messages
+
         # Add the reminder prompt if we still have room to include it.
         if total_tokens < self.main_model.max_context_tokens:
             messages += reminder_message
 
-        messages += self.cur_messages
-
         return messages
 
     def send_new_user_message(self, inp):
@@ -882,19 +894,19 @@ class Coder:
         return set(edit[0] for edit in edits)
 
     def apply_updates(self):
-        max_apply_update_errors = 3
-
         try:
             edited = self.update_files()
         except ValueError as err:
+            self.num_malformed_responses += 1
             err = err.args[0]
             self.apply_update_errors += 1
-            if self.apply_update_errors < max_apply_update_errors:
+            if self.apply_update_errors < self.max_apply_update_errors:
                 self.io.tool_error(f"Malformed response #{self.apply_update_errors}, retrying...")
                 self.io.tool_error(str(err))
                 return None, err
             else:
                 self.io.tool_error(f"Malformed response #{self.apply_update_errors}, aborting.")
+                self.io.tool_error(str(err))
                 return False, None
 
         except Exception as err:
@@ -902,11 +914,13 @@ class Coder:
             print()
             traceback.print_exc()
             self.apply_update_errors += 1
-            if self.apply_update_errors < max_apply_update_errors:
+            if self.apply_update_errors < self.max_apply_update_errors:
                 self.io.tool_error(f"Update exception #{self.apply_update_errors}, retrying...")
+                self.io.tool_error(str(err))
                 return None, str(err)
             else:
                 self.io.tool_error(f"Update exception #{self.apply_update_errors}, aborting")
+                self.io.tool_error(str(err))
                 return False, None
 
         self.apply_update_errors = 0
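Note: the new `edit_format` class attribute is what Coder.create() dispatches on
above. A minimal sketch of that dispatch pattern, with stand-in classes (the real
constructors also take client, main_model and io):

    class Coder:
        edit_format = None

    class EditBlockCoder(Coder):
        edit_format = "diff"

    class UnifiedDiffCoder(Coder):
        edit_format = "udiff"

    def create_coder(edit_format):
        if edit_format == "diff":
            return EditBlockCoder()
        elif edit_format == "udiff":
            return UnifiedDiffCoder()
        raise ValueError(f"Unknown edit format {edit_format}")

    assert create_coder("udiff").edit_format == "udiff"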
aider/coders/editblock_coder.py:

@@ -9,6 +9,8 @@ from .editblock_prompts import EditBlockPrompts
 
 
 class EditBlockCoder(Coder):
+    edit_format = "diff"
+
     def __init__(self, *args, **kwargs):
         self.gpt_prompts = EditBlockPrompts()
         super().__init__(*args, **kwargs)
aider/coders/search_replace.py — new executable file (+769 lines):

#!/usr/bin/env python

import sys
from pathlib import Path

import git
from diff_match_patch import diff_match_patch
from tqdm import tqdm

from aider.dump import dump
from aider.utils import GitTemporaryDirectory


class RelativeIndenter:
    """Rewrites text files to have relative indentation, which involves
    reformatting the leading white space on lines. This format makes
    it easier to search and apply edits to pairs of code blocks which
    may differ significantly in their overall level of indentation.

    It removes leading white space which is shared with the preceding
    line.

    Original:
    ```
            Foo # indented 8
                Bar # indented 4 more than the previous line
                Baz # same indent as the previous line
                Fob # same indent as the previous line
    ```

    Becomes:
    ```
            Foo # indented 8
        Bar # indented 4 more than the previous line
    Baz # same indent as the previous line
    Fob # same indent as the previous line
    ```

    If the current line is *less* indented than the previous line,
    uses a unicode character to indicate outdenting.

    Original
    ```
            Foo
                Bar
                Baz
            Fob # indented 4 less than the previous line
    ```

    Becomes:
    ```
            Foo
        Bar
    Baz
    ←←←←Fob # indented 4 less than the previous line
    ```

    This is a similar original to the last one, but every line has
    been uniformly outdented:
    ```
        Foo
            Bar
            Baz
        Fob # indented 4 less than the previous line
    ```

    It becomes this result, which is very similar to the previous
    result. Only the white space on the first line differs. From the
    word Foo onwards, it is identical to the previous result.
    ```
        Foo
        Bar
    Baz
    ←←←←Fob # indented 4 less than the previous line
    ```

    """

    def __init__(self, texts):
        """
        Based on the texts, choose a unicode character that isn't in any of them.
        """

        chars = set()
        for text in texts:
            chars.update(text)

        ARROW = "←"
        if ARROW not in chars:
            self.marker = ARROW
        else:
            self.marker = self.select_unique_marker(chars)

    def select_unique_marker(self, chars):
        for codepoint in range(0x10FFFF, 0x10000, -1):
            marker = chr(codepoint)
            if marker not in chars:
                return marker

        raise ValueError("Could not find a unique marker")

    def make_relative(self, text):
        """
        Transform text to use relative indents.
        """

        if self.marker in text:
            raise ValueError("Text already contains the outdent marker: {self.marker}")

        lines = text.splitlines(keepends=True)

        output = []
        prev_indent = ""
        for line in lines:
            line_without_end = line.rstrip("\n\r")

            len_indent = len(line_without_end) - len(line_without_end.lstrip())
            indent = line[:len_indent]
            change = len_indent - len(prev_indent)
            if change > 0:
                cur_indent = indent[-change:]
            elif change < 0:
                cur_indent = self.marker * -change
            else:
                cur_indent = ""

            out_line = cur_indent + "\n" + line[len_indent:]
            # dump(len_indent, change, out_line)
            # print(out_line)
            output.append(out_line)
            prev_indent = indent

        res = "".join(output)
        return res

    def make_absolute(self, text):
        """
        Transform text from relative back to absolute indents.
        """
        lines = text.splitlines(keepends=True)

        output = []
        prev_indent = ""
        for i in range(0, len(lines), 2):
            dent = lines[i].rstrip("\r\n")
            non_indent = lines[i + 1]

            if dent.startswith(self.marker):
                len_outdent = len(dent)
                cur_indent = prev_indent[:-len_outdent]
            else:
                cur_indent = prev_indent + dent

            if not non_indent.rstrip("\r\n"):
                out_line = non_indent  # don't indent a blank line
            else:
                out_line = cur_indent + non_indent

            output.append(out_line)
            prev_indent = cur_indent

        res = "".join(output)
        if self.marker in res:
            # dump(res)
            raise ValueError("Error transforming text back to absolute indents")

        return res


# The patches are created to change S->R.
# So all the patch offsets are relative to S.
# But O has a lot more content. So all the offsets are very wrong.
#
# But patch_apply() seems to imply that once patch N is located,
# then it adjusts the offset of the next patch.
#
# This is great, because once we sync up after a big gap the nearby
# patches are close to being located right.
# Except when indentation has been changed by GPT.
#
# It would help to use the diff trick to build map_S_offset_to_O_offset().
# Then update all the S offsets in the S->R patches to be O offsets.
# Do we also need to update the R offsets?
#
# What if this gets funky/wrong?
#


def map_patches(texts, patches, debug):
    search_text, replace_text, original_text = texts

    dmp = diff_match_patch()
    dmp.Diff_Timeout = 5

    diff_s_o = dmp.diff_main(search_text, original_text)
    # diff_r_s = dmp.diff_main(replace_text, search_text)

    # dmp.diff_cleanupSemantic(diff_s_o)
    # dmp.diff_cleanupEfficiency(diff_s_o)

    if debug:
        html = dmp.diff_prettyHtml(diff_s_o)
        Path("tmp.html").write_text(html)

        dump(len(search_text))
        dump(len(original_text))

    for patch in patches:
        start1 = patch.start1
        start2 = patch.start2

        patch.start1 = dmp.diff_xIndex(diff_s_o, start1)
        patch.start2 = dmp.diff_xIndex(diff_s_o, start2)

        if debug:
            print()
            print(start1, repr(search_text[start1 : start1 + 50]))
            print(patch.start1, repr(original_text[patch.start1 : patch.start1 + 50]))
            print(patch.diffs)
            print()

    return patches


example = """Left
Left
    4 in
    4 in
        8 in
    4 in
Left
"""

"""
ri = RelativeIndenter([example])
dump(example)

rel_example = ri.make_relative(example)
dump(repr(rel_example))

abs_example = ri.make_absolute(rel_example)
dump(abs_example)


sys.exit()
"""


def relative_indent(texts):
    ri = RelativeIndenter(texts)
    texts = list(map(ri.make_relative, texts))

    return ri, texts


line_padding = 100


def line_pad(text):
    padding = "\n" * line_padding
    return padding + text + padding


def line_unpad(text):
    if set(text[:line_padding] + text[-line_padding:]) != set("\n"):
        return
    return text[line_padding:-line_padding]


def dmp_apply(texts, remap=True):
    debug = False
    # debug = True

    search_text, replace_text, original_text = texts

    dmp = diff_match_patch()
    dmp.Diff_Timeout = 5
    # dmp.Diff_EditCost = 16

    if remap:
        dmp.Match_Threshold = 0.95
        dmp.Match_Distance = 500
        dmp.Match_MaxBits = 128
        dmp.Patch_Margin = 32
    else:
        dmp.Match_Threshold = 0.5
        dmp.Match_Distance = 100_000
        dmp.Match_MaxBits = 32
        dmp.Patch_Margin = 8

    diff = dmp.diff_main(search_text, replace_text, None)
    dmp.diff_cleanupSemantic(diff)
    dmp.diff_cleanupEfficiency(diff)

    patches = dmp.patch_make(search_text, diff)

    if debug:
        html = dmp.diff_prettyHtml(diff)
        Path("tmp.search_replace_diff.html").write_text(html)

        for d in diff:
            print(d[0], repr(d[1]))

        for patch in patches:
            start1 = patch.start1
            print()
            print(start1, repr(search_text[start1 : start1 + 10]))
            print(start1, repr(replace_text[start1 : start1 + 10]))
            print(patch.diffs)

        # dump(original_text)
        # dump(search_text)

    if remap:
        patches = map_patches(texts, patches, debug)

    patches_text = dmp.patch_toText(patches)

    new_text, success = dmp.patch_apply(patches, original_text)

    all_success = False not in success

    if debug:
        # dump(new_text)
        print(patches_text)

        # print(new_text)
        dump(success)
        dump(all_success)

        # print(new_text)

    if not all_success:
        return

    return new_text


def lines_to_chars(lines, mapping):
    new_text = []
    for char in lines:
        new_text.append(mapping[ord(char)])

    new_text = "".join(new_text)
    return new_text


def dmp_lines_apply(texts, remap=True):
    debug = False
    # debug = True

    for t in texts:
        assert t.endswith("\n"), t

    search_text, replace_text, original_text = texts

    dmp = diff_match_patch()
    dmp.Diff_Timeout = 5
    # dmp.Diff_EditCost = 16

    dmp.Match_Threshold = 0.1
    dmp.Match_Distance = 100_000
    dmp.Match_MaxBits = 32
    dmp.Patch_Margin = 1

    all_text = search_text + replace_text + original_text
    all_lines, _, mapping = dmp.diff_linesToChars(all_text, "")
    assert len(all_lines) == len(all_text.splitlines())

    search_num = len(search_text.splitlines())
    replace_num = len(replace_text.splitlines())
    original_num = len(original_text.splitlines())

    search_lines = all_lines[:search_num]
    replace_lines = all_lines[search_num : search_num + replace_num]
    original_lines = all_lines[search_num + replace_num :]

    assert len(search_lines) == search_num
    assert len(replace_lines) == replace_num
    assert len(original_lines) == original_num

    diff_lines = dmp.diff_main(search_lines, replace_lines, None)
    dmp.diff_cleanupSemantic(diff_lines)
    dmp.diff_cleanupEfficiency(diff_lines)

    patches = dmp.patch_make(search_lines, diff_lines)

    if debug:
        diff = list(diff_lines)
        dmp.diff_charsToLines(diff, mapping)
        dump(diff)
        html = dmp.diff_prettyHtml(diff)
        Path("tmp.search_replace_diff.html").write_text(html)

        for d in diff:
            print(d[0], repr(d[1]))

    new_lines, success = dmp.patch_apply(patches, original_lines)
    new_text = lines_to_chars(new_lines, mapping)

    all_success = False not in success

    if debug:
        # print(new_text)
        dump(success)
        dump(all_success)

        # print(new_text)

    if not all_success:
        return

    return new_text


def diff_lines(search_text, replace_text):
    dmp = diff_match_patch()
    dmp.Diff_Timeout = 5
    # dmp.Diff_EditCost = 16
    search_lines, replace_lines, mapping = dmp.diff_linesToChars(search_text, replace_text)

    diff_lines = dmp.diff_main(search_lines, replace_lines, None)
    dmp.diff_cleanupSemantic(diff_lines)
    dmp.diff_cleanupEfficiency(diff_lines)

    diff = list(diff_lines)
    dmp.diff_charsToLines(diff, mapping)
    dump(diff)

    udiff = []
    for d, lines in diff:
        if d < 0:
            d = "-"
        elif d > 0:
            d = "+"
        else:
            d = " "
        for line in lines.splitlines(keepends=True):
            udiff.append(d + line)

    return udiff


def search_and_replace(texts):
    search_text, replace_text, original_text = texts

    num = original_text.count(search_text)
    # if num > 1:
    #    raise SearchTextNotUnique()
    if num == 0:
        return

    new_text = original_text.replace(search_text, replace_text)

    return new_text


def git_cherry_pick_osr_onto_o(texts):
    search_text, replace_text, original_text = texts

    with GitTemporaryDirectory() as dname:
        repo = git.Repo(dname)

        fname = Path(dname) / "file.txt"

        # Make O->S->R
        fname.write_text(original_text)
        repo.git.add(str(fname))
        repo.git.commit("-m", "original")
        original_hash = repo.head.commit.hexsha

        fname.write_text(search_text)
        repo.git.add(str(fname))
        repo.git.commit("-m", "search")

        fname.write_text(replace_text)
        repo.git.add(str(fname))
        repo.git.commit("-m", "replace")
        replace_hash = repo.head.commit.hexsha

        # go back to O
        repo.git.checkout(original_hash)

        # cherry pick R onto original
        try:
            repo.git.cherry_pick(replace_hash, "--minimal")
        except git.exc.GitCommandError:
            # merge conflicts!
            return

        new_text = fname.read_text()
        return new_text


def git_cherry_pick_sr_onto_so(texts):
    search_text, replace_text, original_text = texts

    with GitTemporaryDirectory() as dname:
        repo = git.Repo(dname)

        fname = Path(dname) / "file.txt"

        fname.write_text(search_text)
        repo.git.add(str(fname))
        repo.git.commit("-m", "search")
        search_hash = repo.head.commit.hexsha

        # make search->replace
        fname.write_text(replace_text)
        repo.git.add(str(fname))
        repo.git.commit("-m", "replace")
        replace_hash = repo.head.commit.hexsha

        # go back to search,
        repo.git.checkout(search_hash)

        # make search->original
        fname.write_text(original_text)
        repo.git.add(str(fname))
        repo.git.commit("-m", "original")

        # cherry pick replace onto original
        try:
            repo.git.cherry_pick(replace_hash, "--minimal")
        except git.exc.GitCommandError:
            # merge conflicts!
            return

        new_text = fname.read_text()

    return new_text


class SearchTextNotUnique(ValueError):
    pass


all_preprocs = [
    # (strip_blank_lines, relative_indent, reverse_lines)
    (False, False, False),
    (True, False, False),
    (False, True, False),
    (True, True, False),
    # (False, False, True),
    # (True, False, True),
    # (False, True, True),
    # (True, True, True),
]

always_relative_indent = [
    (False, True, False),
    (True, True, False),
    # (False, True, True),
    # (True, True, True),
]

editblock_strategies = [
    (search_and_replace, all_preprocs),
    (git_cherry_pick_osr_onto_o, all_preprocs),
    (dmp_lines_apply, all_preprocs),
]

never_relative = [
    (False, False),
    (True, False),
]

udiff_strategies = [
    (search_and_replace, all_preprocs),
    (git_cherry_pick_osr_onto_o, all_preprocs),
    (dmp_lines_apply, all_preprocs),
]


def flexible_search_and_replace(texts, strategies):
    """Try a series of search/replace methods, starting from the most
    literal interpretation of search_text. If needed, progress to more
    flexible methods, which can accommodate divergence between
    search_text and original_text and yet still achieve the desired
    edits.
    """

    for strategy, preprocs in strategies:
        for preproc in preprocs:
            res = try_strategy(texts, strategy, preproc)
            if res:
                return res


def reverse_lines(text):
    lines = text.splitlines(keepends=True)
    lines.reverse()
    return "".join(lines)


def try_strategy(texts, strategy, preproc):
    preproc_strip_blank_lines, preproc_relative_indent, preproc_reverse = preproc
    ri = None

    if preproc_strip_blank_lines:
        texts = strip_blank_lines(texts)
    if preproc_relative_indent:
        ri, texts = relative_indent(texts)
    if preproc_reverse:
        texts = list(map(reverse_lines, texts))

    res = strategy(texts)

    if res and preproc_reverse:
        res = reverse_lines(res)

    if res and preproc_relative_indent:
        try:
            res = ri.make_absolute(res)
        except ValueError:
            return

    return res


def strip_blank_lines(texts):
    # strip leading and trailing blank lines
    texts = [text.strip("\n") + "\n" for text in texts]
    return texts


def read_text(fname):
    text = Path(fname).read_text()
    return text


def proc(dname):
    dname = Path(dname)

    try:
        search_text = read_text(dname / "search")
        replace_text = read_text(dname / "replace")
        original_text = read_text(dname / "original")
    except FileNotFoundError:
        return

    ####

    texts = search_text, replace_text, original_text

    strategies = [
        # (search_and_replace, all_preprocs),
        # (git_cherry_pick_osr_onto_o, all_preprocs),
        # (git_cherry_pick_sr_onto_so, all_preprocs),
        # (dmp_apply, all_preprocs),
        (dmp_lines_apply, all_preprocs),
    ]

    _strategies = editblock_strategies  # noqa: F841

    short_names = dict(
        search_and_replace="sr",
        git_cherry_pick_osr_onto_o="cp_o",
        git_cherry_pick_sr_onto_so="cp_so",
        dmp_apply="dmp",
        dmp_lines_apply="dmpl",
    )

    patched = dict()
    for strategy, preprocs in strategies:
        for preproc in preprocs:
            method = strategy.__name__
            method = short_names[method]

            strip_blank, rel_indent, rev_lines = preproc
            if strip_blank or rel_indent:
                method += "_"
            if strip_blank:
                method += "s"
            if rel_indent:
                method += "i"
            if rev_lines:
                method += "r"

            res = try_strategy(texts, strategy, preproc)
            patched[method] = res

    results = []
    for method, res in patched.items():
        out_fname = dname / f"original.{method}"
        if out_fname.exists():
            out_fname.unlink()

        if res:
            out_fname.write_text(res)

            correct = (dname / "correct").read_text()
            if res == correct:
                res = "pass"
            else:
                res = "WRONG"
        else:
            res = "fail"

        results.append((method, res))

    return results


def colorize_result(result):
    colors = {
        "pass": "\033[102;30mpass\033[0m",  # Green background, black text
        "WRONG": "\033[101;30mWRONG\033[0m",  # Red background, black text
        "fail": "\033[103;30mfail\033[0m",  # Yellow background, black text
    }
    return colors.get(result, result)  # Default to original result if not found


def main(dnames):
    all_results = []
    for dname in tqdm(dnames):
        dname = Path(dname)
        results = proc(dname)
        for method, res in results:
            all_results.append((dname, method, res))
            # print(dname, method, colorize_result(res))

    # Create a 2D table with directories along the right and methods along the top
    # Collect all unique methods and directories
    methods = []
    for _, method, _ in all_results:
        if method not in methods:
            methods.append(method)

    directories = dnames

    # Sort directories by decreasing number of 'pass' results
    pass_counts = {
        dname: sum(
            res == "pass" for dname_result, _, res in all_results if str(dname) == str(dname_result)
        )
        for dname in directories
    }
    directories.sort(key=lambda dname: pass_counts[dname], reverse=True)

    # Create a results matrix
    results_matrix = {dname: {method: "" for method in methods} for dname in directories}

    # Populate the results matrix
    for dname, method, res in all_results:
        results_matrix[str(dname)][method] = res

    # Print the 2D table
    # Print the header
    print("{:<20}".format("Directory"), end="")
    for method in methods:
        print("{:<9}".format(method), end="")
    print()

    # Print the rows with colorized results
    for dname in directories:
        print("{:<20}".format(Path(dname).name), end="")
        for method in methods:
            res = results_matrix[dname][method]
            colorized_res = colorize_result(res)
            res_l = 9 + len(colorized_res) - len(res)
            fmt = "{:<" + str(res_l) + "}"
            print(fmt.format(colorized_res), end="")
        print()


if __name__ == "__main__":
    status = main(sys.argv[1:])
    sys.exit(status)
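Note: a minimal usage sketch of the round trip and strategy cascade defined in
search_replace.py above (import path and sample texts are illustrative):

    from aider.coders.search_replace import (
        RelativeIndenter,
        editblock_strategies,
        flexible_search_and_replace,
    )

    text = "def f():\n    if x:\n        y()\n    z()\n"

    ri = RelativeIndenter([text])
    rel = ri.make_relative(text)          # alternating indent-delta / content lines
    assert ri.make_absolute(rel) == text  # lossless round trip

    # Apply a (search, replace, original) triple, escalating from exact string
    # replacement through git cherry-pick to diff-match-patch as needed:
    res = flexible_search_and_replace(
        ["    z()\n", "    z(1)\n", text], editblock_strategies
    )
    assert res == "def f():\n    if x:\n        y()\n    z(1)\n"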
aider/coders/udiff_coder.py — new file (+395 lines):

import difflib
from itertools import groupby
from pathlib import Path

from ..dump import dump  # noqa: F401
from .base_coder import Coder
from .search_replace import (
    SearchTextNotUnique,
    all_preprocs,
    diff_lines,
    flexible_search_and_replace,
    search_and_replace,
)
from .udiff_prompts import UnifiedDiffPrompts

no_match_error = """UnifiedDiffNoMatch: hunk failed to apply!

{path} does not contain lines that match the diff you provided!
Try again.
DO NOT skip blank lines, comments, docstrings, etc!
The diff needs to apply cleanly to the lines in {path}!

{path} does not contain these {num_lines} exact lines in a row:
```
{original}```
"""


not_unique_error = """UnifiedDiffNotUnique: hunk failed to apply!

{path} contains multiple sets of lines that match the diff you provided!
Try again.
Use additional ` ` lines to provide context that uniquely indicates which code needs to be changed.
The diff needs to apply to a unique set of lines in {path}!

{path} contains multiple copies of these {num_lines} lines:
```
{original}```
"""


class UnifiedDiffCoder(Coder):
    edit_format = "udiff"

    def __init__(self, *args, **kwargs):
        self.gpt_prompts = UnifiedDiffPrompts()
        super().__init__(*args, **kwargs)

    def get_edits(self):
        content = self.partial_response_content

        # might raise ValueError for malformed ORIG/UPD blocks
        raw_edits = list(find_diffs(content))

        last_path = None
        edits = []
        for path, hunk in raw_edits:
            if path:
                last_path = path
            else:
                path = last_path
            edits.append((path, hunk))

        return edits

    def apply_edits(self, edits):
        seen = set()
        uniq = []
        for path, hunk in edits:
            hunk = normalize_hunk(hunk)
            if not hunk:
                continue

            this = [path + "\n"] + hunk
            this = "".join(this)

            if this in seen:
                continue
            seen.add(this)

            uniq.append((path, hunk))

        errors = []
        for path, hunk in uniq:
            full_path = self.abs_root_path(path)
            content = self.io.read_text(full_path)

            original, _ = hunk_to_before_after(hunk)

            try:
                content = do_replace(full_path, content, hunk)
            except SearchTextNotUnique:
                errors.append(
                    not_unique_error.format(
                        path=path, original=original, num_lines=len(original.splitlines())
                    )
                )
                continue

            if not content:
                errors.append(
                    no_match_error.format(
                        path=path, original=original, num_lines=len(original.splitlines())
                    )
                )
                continue

            # SUCCESS!
            self.io.write_text(full_path, content)

        if errors:
            errors = "\n\n".join(errors)
            raise ValueError(errors)


def do_replace(fname, content, hunk):
    fname = Path(fname)

    before_text, after_text = hunk_to_before_after(hunk)

    # does it want to make a new file?
    if not fname.exists() and not before_text.strip():
        fname.touch()
        content = ""

    if content is None:
        return

    # TODO: handle inserting into new file
    if not before_text.strip():
        # append to existing file, or start a new file
        new_content = content + after_text
        return new_content

    new_content = None

    new_content = apply_hunk(content, hunk)
    if new_content:
        return new_content


def collapse_repeats(s):
    return "".join(k for k, g in groupby(s))


def apply_hunk(content, hunk):
    before_text, after_text = hunk_to_before_after(hunk)

    res = directly_apply_hunk(content, hunk)
    if res:
        return res

    hunk = make_new_lines_explicit(content, hunk)

    # just consider space vs not-space
    ops = "".join([line[0] for line in hunk])
    ops = ops.replace("-", "x")
    ops = ops.replace("+", "x")
    ops = ops.replace("\n", " ")

    cur_op = " "
    section = []
    sections = []

    for i in range(len(ops)):
        op = ops[i]
        if op != cur_op:
            sections.append(section)
            section = []
            cur_op = op
        section.append(hunk[i])

    sections.append(section)
    if cur_op != " ":
        sections.append([])

    all_done = True
    for i in range(2, len(sections), 2):
        preceding_context = sections[i - 2]
        changes = sections[i - 1]
        following_context = sections[i]

        res = apply_partial_hunk(content, preceding_context, changes, following_context)
        if res:
            content = res
        else:
            all_done = False
            # FAILED!
            # this_hunk = preceding_context + changes + following_context
            break

    if all_done:
        return content


def flexi_just_search_and_replace(texts):
    strategies = [
        (search_and_replace, all_preprocs),
    ]

    return flexible_search_and_replace(texts, strategies)


def make_new_lines_explicit(content, hunk):
    before, after = hunk_to_before_after(hunk)

    diff = diff_lines(before, content)

    back_diff = []
    for line in diff:
        if line[0] == "+":
            continue
        # if line[0] == "-":
        #     line = "+" + line[1:]

        back_diff.append(line)

    new_before = directly_apply_hunk(before, back_diff)
    if not new_before:
        return hunk

    if len(new_before.strip()) < 10:
        return hunk

    before = before.splitlines(keepends=True)
    new_before = new_before.splitlines(keepends=True)
    after = after.splitlines(keepends=True)

    if len(new_before) < len(before) * 0.66:
        return hunk

    new_hunk = difflib.unified_diff(new_before, after, n=max(len(new_before), len(after)))
    new_hunk = list(new_hunk)[3:]

    return new_hunk


def cleanup_pure_whitespace_lines(lines):
    res = [
        line if line.strip() else line[-(len(line) - len(line.rstrip("\r\n"))) :] for line in lines
    ]
    return res


def normalize_hunk(hunk):
    before, after = hunk_to_before_after(hunk, lines=True)

    before = cleanup_pure_whitespace_lines(before)
    after = cleanup_pure_whitespace_lines(after)

    diff = difflib.unified_diff(before, after, n=max(len(before), len(after)))
    diff = list(diff)[3:]
    return diff


def directly_apply_hunk(content, hunk):
    before, after = hunk_to_before_after(hunk)

    before_lines, _ = hunk_to_before_after(hunk, lines=True)
    before_lines = "".join([line.strip() for line in before_lines])

    # Refuse to do a repeated search and replace on a tiny bit of non-whitespace context
    if len(before_lines) < 10 and content.count(before) > 1:
        return

    try:
        new_content = flexi_just_search_and_replace([before, after, content])
    except SearchTextNotUnique:
        new_content = None

    return new_content


def apply_partial_hunk(content, preceding_context, changes, following_context):
    len_prec = len(preceding_context)
    len_foll = len(following_context)

    use_all = len_prec + len_foll

    for drop in range(use_all):
        use = use_all - drop

        for use_prec in range(len_prec, -1, -1):
            if use_prec > use:
                continue

            use_foll = use - use_prec
            if use_foll > len_foll:
                continue

            if use_prec:
                this_prec = preceding_context[-use_prec:]
            else:
                this_prec = []

            this_foll = following_context[:use_foll]

            res = directly_apply_hunk(content, this_prec + changes + this_foll)
            if res:
                return res


def find_diffs(content):
    # We can always use triple-quotes, because all the udiff content
    # is prefixed with +/-/space.

    if not content.endswith("\n"):
        content = content + "\n"

    lines = content.splitlines(keepends=True)
    line_num = 0
    edits = []
    while line_num < len(lines):
        while line_num < len(lines):
            line = lines[line_num]
            if line.startswith("```diff"):
                line_num, these_edits = process_fenced_block(lines, line_num + 1)
                edits += these_edits
                break
            line_num += 1

    # For now, just take 1!
    # edits = edits[:1]

    return edits


def process_fenced_block(lines, start_line_num):
    for line_num in range(start_line_num, len(lines)):
        line = lines[line_num]
        if line.startswith("```"):
            break

    block = lines[start_line_num:line_num]
    block.append("@@ @@")

    if block[1].startswith("+++ "):
        fname = block[1].split()[1]
        block = block[2:]
    else:
        fname = None

    edits = []

    keeper = False
    hunk = []
    op = " "
    for line in block:
        hunk.append(line)
        if len(line) < 2:
            continue
        op = line[0]
        if op in "-+":
            keeper = True
            continue
        if op != "@":
            continue
        if not keeper:
            hunk = []
            continue

        hunk = hunk[:-1]
        edits.append((fname, hunk))
        hunk = []

    return line_num + 1, edits


def hunk_to_before_after(hunk, lines=False):
    before = []
    after = []
    op = " "
    for line in hunk:
        if len(line) < 2:
            op = " "
            line = line
        else:
            op = line[0]
            line = line[1:]

        if op == " ":
            before.append(line)
            after.append(line)
        elif op == "-":
            before.append(line)
        elif op == "+":
            after.append(line)

    if lines:
        return before, after

    before = "".join(before)
    after = "".join(after)

    return before, after
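Note: hunk_to_before_after() above is the workhorse that turns a hunk into the
(search, replace) pair the flexible strategies consume. An illustrative check
(import path assumed):

    from aider.coders.udiff_coder import hunk_to_before_after

    hunk = [
        " def hello():\n",
        "-    print('hi')\n",
        "+    print('hello')\n",
    ]
    before, after = hunk_to_before_after(hunk)
    assert before == "def hello():\n    print('hi')\n"
    assert after == "def hello():\n    print('hello')\n"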
aider/coders/udiff_prompts.py — new file (+107 lines):

# flake8: noqa: E501

from .base_prompts import CoderPrompts


class UnifiedDiffPrompts(CoderPrompts):
    main_system = """Act as an expert software developer.
You are diligent and tireless, and you always COMPLETELY IMPLEMENT the needed code.
Always use best practices when coding.
Respect and use existing conventions, libraries, etc that are already present in the code base.

Take requests for changes to the supplied code.
If the request is ambiguous, ask questions.

For each file that needs to be changed, write out the changes similar to a unified diff like `diff -U0` would produce. For example:

# Example conversation 1

## USER: Replace is_prime with a call to sympy.

## ASSISTANT: Ok, I will:

1. Add an imports of sympy.
2. Remove the is_prime() function.
3. Replace the existing call to is_prime() with a call to sympy.isprime().

Here are the diffs for those changes:

```diff
--- mathweb/flask/app.py
+++ mathweb/flask/app.py
@@ ... @@
-class MathWeb:
+import sympy
+
+class MathWeb:
@@ ... @@
-def is_prime(x):
-    if x < 2:
-        return False
-    for i in range(2, int(math.sqrt(x)) + 1):
-        if x % i == 0:
-            return False
-    return True
@@ ... @@
-@app.route('/prime/<int:n>')
-def nth_prime(n):
-    count = 0
-    num = 1
-    while count < n:
-        num += 1
-        if is_prime(num):
-            count += 1
-    return str(num)
+@app.route('/prime/<int:n>')
+def nth_prime(n):
+    count = 0
+    num = 1
+    while count < n:
+        num += 1
+        if sympy.isprime(num):
+            count += 1
+    return str(num)
```
"""

    system_reminder = """# File editing rules:

Return edits similar to unified diffs that `diff -U0` would produce.

Make sure you include the first 2 lines with the file paths.
Don't include timestamps with the file paths.

Start each hunk of changes with a `@@ ... @@` line.
Don't include line numbers like `diff -U0` does.
The user's patch tool doesn't need them.

The user's patch tool needs CORRECT patches that apply cleanly against the current contents of the file!
Think carefully and make sure you include and mark all lines that need to be removed or changed as `-` lines.
Make sure you mark all new or modified lines with `+`.
Don't leave out any lines or the diff patch won't apply correctly.

Indentation matters in the diffs!

Start a new hunk for each section of the file that needs changes.

Only output hunks that specify changes with `+` or `-` lines.
Skip any hunks that are entirely unchanging ` ` lines.

Output hunks in whatever order makes the most sense.
Hunks don't need to be in any particular order.

When editing a function, method, loop, etc use a hunk to replace the *entire* code block.
Delete the entire existing version with `-` lines and then add a new, updated version with `+` lines.
This will help you generate correct code and correct diffs.

To make a new file, show a diff from `--- /dev/null` to `+++ path/to/new/file.ext`.
"""

    files_content_prefix = "These are the *read-write* files:\n"

    files_no_full_files = "I am not sharing any *read-write* files yet."

    repo_content_prefix = """Below here are summaries of other files present in this git repository.
Do not propose changes to these files, they are *read-only*.
To make a file *read-write*, ask the user to *add it to the chat*.
"""
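Note: a sketch of how a reply in this prompt's format is parsed back into edits
by find_diffs() from udiff_coder.py above (import path assumed; the reply text
is illustrative):

    from aider.coders.udiff_coder import find_diffs

    reply = (
        "Here is the change:\n"
        "\n"
        "```diff\n"
        "--- greet.py\n"
        "+++ greet.py\n"
        "@@ ... @@\n"
        '-print("hi")\n'
        '+print("hello")\n'
        "```\n"
    )
    assert find_diffs(reply) == [("greet.py", ['-print("hi")\n', '+print("hello")\n'])]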
aider/coders/wholefile_coder.py:

@@ -8,6 +8,8 @@ from .wholefile_prompts import WholeFilePrompts
 
 
 class WholeFileCoder(Coder):
+    edit_format = "whole"
+
     def __init__(self, *args, **kwargs):
         self.gpt_prompts = WholeFilePrompts()
         super().__init__(*args, **kwargs)
aider/main.py:

@@ -148,7 +148,7 @@ def main(argv=None, input=None, output=None, force_git_root=None):
     core_group.add_argument(
         "--model",
         metavar="MODEL",
-        default=models.GPT4.name,
+        default=models.GPT4_0613.name,
         help=f"Specify the model to use for the main chat (default: {models.GPT4.name})",
     )
     core_group.add_argument(
@@ -157,6 +157,14 @@ def main(argv=None, input=None, output=None, force_git_root=None):
         default=False,
         help="Override to skip model availability check (default: False)",
     )
+    default_4_turbo_model = models.GPT4_1106_PREVIEW
+    core_group.add_argument(
+        "--4-turbo",
+        action="store_const",
+        dest="model",
+        const=default_4_turbo_model.name,
+        help=f"Use {default_4_turbo_model.name} model for the main chat (gpt-4 is better)",
+    )
     default_3_model = models.GPT35_1106
     core_group.add_argument(
         "-3",
@@ -380,7 +388,10 @@ def main(argv=None, input=None, output=None, force_git_root=None):
         "--message-file",
         "-f",
         metavar="MESSAGE_FILE",
-        help="Specify a file containing the message to send GPT, process reply, then exit (disables chat mode)",
+        help=(
+            "Specify a file containing the message to send GPT, process reply, then exit (disables"
+            " chat mode)"
+        ),
     )
     other_group.add_argument(
         "--encoding",
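Note: a minimal sketch of the new --4-turbo switch added above; it is a
store_const alias that writes into the same dest as --model (argparse only,
model names taken from the diff):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="gpt-4-0613")
    parser.add_argument(
        "--4-turbo", action="store_const", dest="model", const="gpt-4-1106-preview"
    )

    assert parser.parse_args(["--4-turbo"]).model == "gpt-4-1106-preview"
    assert parser.parse_args([]).model == "gpt-4-0613"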
aider/models/__init__.py:

@@ -3,6 +3,8 @@ from .openai import OpenAIModel
 from .openrouter import OpenRouterModel
 
 GPT4 = Model.create("gpt-4")
+GPT4_0613 = Model.create("gpt-4-0613")
+GPT4_1106_PREVIEW = Model.create("gpt-4-1106-preview")
 GPT35 = Model.create("gpt-3.5-turbo")
 GPT35_1106 = Model.create("gpt-3.5-turbo-1106")
 GPT35_16k = Model.create("gpt-3.5-turbo-16k")
aider/models/openai.py:

@@ -33,7 +33,11 @@ class OpenAIModel(Model):
         self.tokenizer = tiktoken.encoding_for_model(name)
 
         if self.is_gpt4():
-            self.edit_format = "diff"
+            if name == "gpt-4-1106-preview":
+                self.edit_format = "udiff"
+            else:
+                self.edit_format = "diff"
+
             self.use_repo_map = True
             self.send_undo_reply = True
 
@@ -44,11 +48,11 @@ class OpenAIModel(Model):
         elif tokens == 32:
             self.prompt_price = 0.06
             self.completion_price = 0.12
-            self.max_chat_history_tokens = 3 * 1024
+            self.max_chat_history_tokens = 2 * 1024
         elif tokens == 128:
             self.prompt_price = 0.01
             self.completion_price = 0.03
-            self.max_chat_history_tokens = 4 * 1024
+            self.max_chat_history_tokens = 2 * 1024
 
         return
 
@@ -60,7 +64,7 @@ class OpenAIModel(Model):
             if self.name == "gpt-3.5-turbo-1106":
                 self.prompt_price = 0.001
                 self.completion_price = 0.002
-                self.max_chat_history_tokens = 3 * 1024
+                self.max_chat_history_tokens = 2 * 1024
             elif tokens == 4:
                 self.prompt_price = 0.0015
                 self.completion_price = 0.002
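Note: the net effect of the model changes above, sketched as assertions
(Model.create usage as in aider/models/__init__.py):

    from aider import models

    assert models.Model.create("gpt-4-1106-preview").edit_format == "udiff"
    assert models.Model.create("gpt-4-0613").edit_format == "diff"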
aider/utils.py:

@@ -1,9 +1,69 @@
+import os
+import tempfile
 from pathlib import Path
 
+import git
+
-# Set of image file extensions
 IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp'}
 
-from .dump import dump  # noqa: F401
+from aider.dump import dump  # noqa: F401
+
+
+class IgnorantTemporaryDirectory:
+    def __init__(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+
+    def __enter__(self):
+        return self.temp_dir.__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        try:
+            self.temp_dir.__exit__(exc_type, exc_val, exc_tb)
+        except (OSError, PermissionError):
+            pass  # Ignore errors (Windows)
+
+
+class ChdirTemporaryDirectory(IgnorantTemporaryDirectory):
+    def __init__(self):
+        try:
+            self.cwd = os.getcwd()
+        except FileNotFoundError:
+            self.cwd = None
+
+        super().__init__()
+
+    def __enter__(self):
+        res = super().__enter__()
+        os.chdir(self.temp_dir.name)
+        return res
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.cwd:
+            try:
+                os.chdir(self.cwd)
+            except FileNotFoundError:
+                pass
+        super().__exit__(exc_type, exc_val, exc_tb)
+
+
+class GitTemporaryDirectory(ChdirTemporaryDirectory):
+    def __enter__(self):
+        dname = super().__enter__()
+        self.repo = make_repo(dname)
+        return dname
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        del self.repo
+        super().__exit__(exc_type, exc_val, exc_tb)
+
+
+def make_repo(path=None):
+    if not path:
+        path = "."
+    repo = git.Repo.init(path)
+    repo.config_writer().set_value("user", "name", "Test User").release()
+    repo.config_writer().set_value("user", "email", "testuser@example.com").release()
+
+    return repo
+
+
 def is_image_file(file_name):
     """
assets/benchmarks-udiff.svg — new file, 1603 lines (40 KiB; diff too large to display)
assets/udiffs.jpg — new binary file (84 KiB; not shown)
@@ -32,7 +32,7 @@ from aider.io import InputOutput

 BENCHMARK_DNAME = Path(os.environ.get("AIDER_BENCHMARK_DIR", "tmp.benchmarks"))

-ORIGINAL_DNAME = BENCHMARK_DNAME / "exercism-python"
+EXERCISES_DIR_DEFAULT = "exercism-python"

 app = typer.Typer(add_completion=False, pretty_exceptions_enable=False)

@@ -69,23 +69,25 @@ def show_stats(dirnames, graphs):
         # remember this row, so we can update it with the repeat_avg
         repeat_row = len(rows)

-        gpt35 = "gpt-3.5-turbo"
-        gpt4 = "gpt-4"
+        # gpt35 = "gpt-3.5-turbo"
+        # gpt4 = "gpt-4"
+        # if row.model.startswith(gpt35):
+        #     row.model = gpt35 + "\n" + row.model[len(gpt35) :]
+        # elif row.model.startswith(gpt4):
+        #     row.model = gpt4 + "\n" + row.model[len(gpt4) :]

-        if row.model.startswith(gpt35):
-            row.model = gpt35 + "\n" + row.model[len(gpt35) :]
-        elif row.model.startswith(gpt4):
-            row.model = gpt4 + "\n" + row.model[len(gpt4) :]
+        if "folk" in row.dir_name:
+            row.edit_format = "folk"

-        # if row.model == "gpt-4\n-1106-preview":
-        #     row.model += "\n(preliminary)"
+        if row.model == "gpt-4-0613":
+            row.model += "\n(8k context window is\ntoo small for benchmark)"

         if row.completed_tests < 133:
             print(f"Warning: {row.dir_name} is incomplete: {row.completed_tests}")

-        if "repeat" in row.dir_name:
-            repeats.append(vars(row))
-            continue
+        # if "repeat" in row.dir_name:
+        #     repeats.append(vars(row))
+        #     continue

         kind = (row.model, row.edit_format)
         if kind in seen:

@@ -97,6 +99,7 @@ def show_stats(dirnames, graphs):
         rows.append(vars(row))

     if repeats:
+        dump(repeats)
         extra = rows[repeat_row]
         dump(extra)
         repeats.append(extra)

@@ -115,15 +118,16 @@ def show_stats(dirnames, graphs):
         # use the average in the main bar
         rows[repeat_row]["pass_rate_2"] = repeat_avg
     else:
-        repeat_hi = repeat_lo = repeat_avg = None
+        repeat_hi = repeat_lo = repeat_avg = None  # noqa: F841

     df = pd.DataFrame.from_records(rows)
     df.sort_values(by=["model", "edit_format"], inplace=True)

     # dump(df)
     if graphs:
-        plot_timing(df)
-        plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
+        # plot_timing(df)
+        # plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg)
+        plot_refactoring(df)


 def plot_timing(df):

@@ -282,6 +286,88 @@ def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
     # df.to_csv("tmp.benchmarks.csv")


+def plot_refactoring(df):
+    tries = [df.groupby(["model", "edit_format"])["pass_rate_1"].mean()]
+
+    plt.rcParams["hatch.linewidth"] = 0.5
+    plt.rcParams["hatch.color"] = "#444444"
+
+    from matplotlib import rc
+
+    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.grid(axis="y", zorder=0, lw=0.2)
+
+    zorder = 1
+    for grouped in tries:
+        zorder += 1
+        df = grouped.unstack()
+        num_models, num_formats = df.shape
+
+        pos = np.array(range(num_models))
+        width = 0.8 / num_formats
+
+        formats = df.columns
+        models = df.index
+
+        for i, fmt in enumerate(formats):
+            hatch = ""
+
+            if fmt == "diff":
+                color = "#b3e6a8"
+                label = "Baseline (search/replace blocks)"
+            elif fmt == "udiff":
+                color = "#b3d1e6"
+                label = "Unified diffs"
+            elif fmt == "folk":
+                label = "Prompt with blind, no hands, tip $2000, etc"
+                color = "#b3e6a8"
+                hatch = "////"
+
+            if zorder > 1:
+                edge = dict(
+                    edgecolor="#ffffff",
+                    linewidth=1.5,
+                )
+            else:
+                edge = dict()
+            if zorder == 2:
+                edge["label"] = label
+
+            rects = ax.bar(
+                pos + i * width,
+                df[fmt],
+                width * 0.95,
+                color=color,
+                hatch=hatch,
+                zorder=zorder,
+                **edge,
+            )
+
+            if zorder == 2:
+                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)
+
+    ax.set_xticks([p + 1.0 * width for p in pos])
+    ax.set_xticklabels(models)
+
+    ax.set_ylabel("Percent of exercises completed successfully")
+    # ax.set_xlabel("Model")
+    ax.set_title('Refactoring "Laziness" Benchmark\n(percent coding tasks correct)')
+    ax.legend(
+        # title="Edit Format",
+        loc="upper left",
+        # bbox_to_anchor=(0.95, 0.95),
+    )
+    ax.set_ylim(top=100)
+
+    plt.tight_layout()
+    plt.savefig("tmp.svg")
+    imgcat(fig)
+
+    # df.to_csv("tmp.benchmarks.csv")
+
+
 def resolve_dirname(dirname, use_single_prior, make_new):
     if len(dirname.parts) > 1:
         return dirname

@@ -313,6 +399,16 @@ def main(
     graphs: bool = typer.Option(False, "--graphs", help="Generate graphs"),
     model: str = typer.Option("gpt-3.5-turbo", "--model", "-m", help="Model name"),
     edit_format: str = typer.Option(None, "--edit-format", "-e", help="Edit format"),
+    replay: str = typer.Option(
+        None,
+        "--replay",
+        help="Replay previous .aider.chat.history.md responses from previous benchmark run",
+    ),
+    max_apply_update_errors: int = typer.Option(
+        3,
+        "--max-apply-update-errors",
+        help="Maximum number of apply update errors before stopping the test",
+    ),
     keywords: str = typer.Option(
         None, "--keywords", "-k", help="Only run tests that contain keywords (comma sep)"
     ),

@@ -331,6 +427,9 @@ def main(
     tries: int = typer.Option(2, "--tries", "-r", help="Number of tries for running tests"),
     threads: int = typer.Option(1, "--threads", "-t", help="Number of threads to run in parallel"),
     num_tests: int = typer.Option(-1, "--num-tests", "-n", help="Number of tests to run"),
+    exercises_dir: str = typer.Option(
+        EXERCISES_DIR_DEFAULT, "--exercises-dir", help="Directory with exercise files"
+    ),
 ):
     repo = git.Repo(search_parent_directories=True)
     commit_hash = repo.head.object.hexsha[:7]

@@ -363,12 +462,13 @@ def main(
         return

     assert BENCHMARK_DNAME.exists() and BENCHMARK_DNAME.is_dir(), BENCHMARK_DNAME
-    assert ORIGINAL_DNAME.exists() and ORIGINAL_DNAME.is_dir(), ORIGINAL_DNAME
+    original_dname = BENCHMARK_DNAME / exercises_dir
+    assert original_dname.exists() and original_dname.is_dir(), original_dname

     if clean and dirname.exists():
         print("Cleaning up and replacing", dirname)
         dir_files = set(fn.name for fn in dirname.glob("*"))
-        original_files = set(fn.name for fn in ORIGINAL_DNAME.glob("*"))
+        original_files = set(fn.name for fn in original_dname.glob("*"))
         if dir_files != original_files:
             print("ERROR: will not delete dir that does not look like original tests", dirname)
             return

@@ -381,8 +481,8 @@ def main(
         dirname.rename(dest)

     if not dirname.exists():
-        print(f"Copying {ORIGINAL_DNAME} -> {dirname} ...")
-        shutil.copytree(ORIGINAL_DNAME, dirname)
+        print(f"Copying {original_dname} -> {dirname} ...")
+        shutil.copytree(original_dname, dirname)
         print("...done")

     test_dnames = sorted(os.listdir(dirname))

@@ -399,6 +499,7 @@ def main(
         all_results = []
         for testname in test_dnames:
             results = run_test(
+                original_dname,
                 dirname / testname,
                 model,
                 edit_format,

@@ -407,6 +508,8 @@ def main(
                 no_aider,
                 verbose,
                 commit_hash,
+                replay,
+                max_apply_update_errors,
             )

             all_results.append(results)

@@ -415,6 +518,7 @@ def main(
         run_test_threaded = lox.thread(threads)(run_test)
         for testname in test_dnames:
             run_test_threaded.scatter(
+                original_dname,
                 dirname / testname,
                 model,
                 edit_format,

@@ -423,6 +527,8 @@ def main(
                 no_aider,
                 verbose,
                 commit_hash,
+                replay,
+                max_apply_update_errors,
             )
         all_results = run_test_threaded.gather(tqdm=True)

@@ -467,6 +573,7 @@ def show_diffs(dirnames):
     changed = set(testcases) - unchanged
     print()
     print("changed:", len(changed), ",".join(sorted(changed)))
+    print()
     print("unchanged:", len(unchanged), ",".join(sorted(unchanged)))

@@ -498,6 +605,10 @@ def summarize_results(dirname):
     res.user_asks = 0
     res.test_timeouts = 0
     res.exhausted_context_windows = 0
+    res.num_malformed_responses = 0
+    res.syntax_errors = 0
+    res.indentation_errors = 0
+    res.lazy_comments = 0

     variants = defaultdict(set)

@@ -518,6 +629,11 @@ def summarize_results(dirname):
         res.error_outputs += results.get("num_error_outputs", 0)
         res.user_asks += results.get("num_user_asks", 0)
         res.exhausted_context_windows += results.get("num_exhausted_context_windows", 0)
+        res.num_malformed_responses += results.get("num_malformed_responses", 0)
+        res.lazy_comments += results.get("lazy_comments", 0)
+
+        res.syntax_errors += results.get("syntax_errors", 0)
+        res.indentation_errors += results.get("indentation_errors", 0)

         for key in "model edit_format commit_hash".split():
             val = results.get(key)

@@ -526,6 +642,9 @@ def summarize_results(dirname):
     if not res.completed_tests:
         return

+    # if res.completed_tests < 133:
+    #     return
+
     console = Console(highlight=False)
     console.rule(title=str(dirname))

@@ -538,14 +657,22 @@ def summarize_results(dirname):
             val = ", ".join(map(str, val))
         setattr(res, key, val)
         console.print(f"{key}: {val}", style=style)
-    print("num_error_outputs:", res.error_outputs)
-    print("num_user_asks:", res.user_asks)

-    style = "red" if res.exhausted_context_windows else None
-    console.print("num_exhausted_context_windows", res.exhausted_context_windows, style=style)
+    def show(stat):
+        val = getattr(res, stat)
+        style = "red" if val else None
+        console.print(f"{stat}: {val}", style=style)

-    style = "red" if res.test_timeouts else None
-    console.print("test_timeouts:", res.test_timeouts, style=style)
+    console.print()
+    show("error_outputs")
+    show("user_asks")
+    show("lazy_comments")
+    show("num_malformed_responses")
+    show("syntax_errors")
+    show("indentation_errors")
+
+    console.print()
+    show("exhausted_context_windows")
+    show("test_timeouts")

     console.print()
     for i in range(tries):

@@ -573,8 +700,35 @@ def summarize_results(dirname):
     return res


+def get_replayed_content(replay_dname, test_dname):
+    replay_dname = Path(replay_dname)
+    test_dname = Path(test_dname)
+    dump(replay_dname, test_dname)
+
+    test_name = test_dname.name
+    replay_fname = replay_dname / test_name / ".aider.chat.history.md"
+    dump(replay_fname)
+
+    res = replay_fname.read_text()
+    return res
+
+    res = res.splitlines(keepends=True)
+    res = [line for line in res if not line.startswith("> ") and not line.startswith("#### ")]
+    return "".join(res)
+
+
 def run_test(
-    testdir, model_name, edit_format, tries, no_unit_tests, no_aider, verbose, commit_hash
+    original_dname,
+    testdir,
+    model_name,
+    edit_format,
+    tries,
+    no_unit_tests,
+    no_aider,
+    verbose,
+    commit_hash,
+    replay,
+    max_apply_update_errors,
 ):
     if not os.path.isdir(testdir):
         print("Not a dir:", testdir)

@@ -595,12 +749,17 @@ def run_test(
     fnames = []
     for fname in testdir.glob("*"):
-        if "test" not in fname.name and fname.is_file() and fname.name[0] != ".":
+        if (
+            "test" not in fname.name
+            and fname.is_file()
+            and fname.name[0] != "."
+            and fname.suffix == ".py"
+        ):
             fnames.append(fname)

             # restore the original file, in case we interrupted a prev run
             # after it had saved changes
-            original_fname = ORIGINAL_DNAME / testdir.name / fname.name
+            original_fname = original_dname / testdir.name / fname.name
             shutil.copy(original_fname, fname)

     file_list = " ".join(fname.name for fname in fnames)

@@ -644,17 +803,40 @@ def run_test(
         pretty=False,
         verbose=verbose,
     )
+    coder.max_apply_update_errors = max_apply_update_errors

     timeouts = 0

+    syntax_errors = 0
+    indentation_errors = 0
+    lazy_comments = 0
+
     dur = 0
     test_outcomes = []
     for i in range(tries):
         start = time.time()
-        if not no_aider:
-            coder.run(with_message=instructions)
+        if no_aider:
+            pass
+        elif replay:
+            response = get_replayed_content(replay, testdir)
+            coder.partial_response_content = response
+
+            show = response.splitlines(keepends=True)
+            show = [">> " + line for line in show]
+            io.append_chat_history("".join(show))
+
+            coder.apply_updates()
+        else:
+            response = coder.run(with_message=instructions)
         dur += time.time() - start

+        if not no_aider:
+            pat = r"^[+]? *[#].* [.][.][.] "
+            # Count the number of lines that match pat in response
+            dump(response)
+            lazy_comments += len(re.findall(pat, response, re.MULTILINE))
+            dump(lazy_comments)
+
         if coder.last_keyboard_interrupt:
             raise KeyboardInterrupt

@@ -673,7 +855,14 @@ def run_test(
             test_outcomes.append(True)
             break

+        if replay:
+            io.append_chat_history(errors)
+
         errors = errors.splitlines()
+
+        syntax_errors += sum(1 for line in errors if line.startswith("SyntaxError"))
+        indentation_errors += sum(1 for line in errors if line.startswith("IndentationError"))
+
         print(errors[-1])
         errors = errors[:50]
         errors = "\n".join(errors)

@@ -693,6 +882,10 @@ def run_test(
         num_error_outputs=io.num_error_outputs,
         num_user_asks=io.num_user_asks,
         num_exhausted_context_windows=coder.num_exhausted_context_windows,
+        num_malformed_responses=coder.num_malformed_responses,
+        syntax_errors=syntax_errors,
+        indentation_errors=indentation_errors,
+        lazy_comments=lazy_comments,  # Add the count of pattern matches to the results
         chat_hashes=list(
             zip(
                 coder.chat_completion_call_hashes,
@@ -2,9 +2,9 @@ instructions_addendum = """
 ####

 Use the above instructions to modify the supplied files: {file_list}
-Keep and implement the existing function or class stubs, they will be called from unit tests.
+Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc.
 Only use standard python libraries, don't suggest installing any packages.
-"""
+"""  # noqa: E501


 test_failures = """
benchmark/refactor_tools.py — new executable file (208 lines):
@@ -0,0 +1,208 @@
#!/usr/bin/env python

import ast
import os
import shutil
import sys
from pathlib import Path

from aider.dump import dump  # noqa: F401


class ParentNodeTransformer(ast.NodeTransformer):
    """
    This transformer sets the 'parent' attribute on each node.
    """

    def generic_visit(self, node):
        for child in ast.iter_child_nodes(node):
            child.parent = node
        return super(ParentNodeTransformer, self).generic_visit(node)


def verify_full_func_at_top_level(tree, func, func_children):
    func_node = next(
        (
            item
            for item in ast.walk(tree)
            if isinstance(item, ast.FunctionDef) and item.name == func
        ),
        None,
    )
    assert func_node is not None, f"Function {func} not found"

    assert isinstance(
        func_node.parent, ast.Module
    ), f"{func} is not a top level function, it has parent {func_node.parent}"

    num_children = sum(1 for _ in ast.walk(func_node))
    pct_diff_children = abs(num_children - func_children) * 100 / func_children
    assert (
        pct_diff_children < 10
    ), f"Old method had {func_children} children, new method has {num_children}"


def verify_old_class_children(tree, old_class, old_class_children):
    node = next(
        (
            item
            for item in ast.walk(tree)
            if isinstance(item, ast.ClassDef) and item.name == old_class
        ),
        None,
    )
    assert node is not None, f"Old class {old_class} not found"

    num_children = sum(1 for _ in ast.walk(node))

    pct_diff_children = abs(num_children - old_class_children) * 100 / old_class_children
    assert (
        pct_diff_children < 10
    ), f"Old class had {old_class_children} children, new class has {num_children}"


def verify_refactor(fname, func, func_children, old_class, old_class_children):
    with open(fname, "r") as file:
        file_contents = file.read()
    tree = ast.parse(file_contents)
    ParentNodeTransformer().visit(tree)  # Set parent attribute for all nodes

    verify_full_func_at_top_level(tree, func, func_children)

    verify_old_class_children(tree, old_class, old_class_children - func_children)


############################


class SelfUsageChecker(ast.NodeVisitor):
    def __init__(self):
        self.non_self_methods = []
        self.parent_class_name = None
        self.num_class_children = 0

    def visit_FunctionDef(self, node):
        # Check if the first argument is 'self' and if it's not used
        if node.args.args and node.args.args[0].arg == "self":
            self_used = any(
                isinstance(expr, ast.Name) and expr.id == "self"
                for stmt in node.body
                for expr in ast.walk(stmt)
            )
            super_used = any(
                isinstance(expr, ast.Name) and expr.id == "super"
                for stmt in node.body
                for expr in ast.walk(stmt)
            )
            if not self_used and not super_used:
                # Calculate the number of child nodes in the function
                num_child_nodes = sum(1 for _ in ast.walk(node))
                res = (
                    self.parent_class_name,
                    node.name,
                    self.num_class_children,
                    num_child_nodes,
                )
                self.non_self_methods.append(res)
        self.generic_visit(node)

    def visit_ClassDef(self, node):
        self.parent_class_name = node.name
        self.num_class_children = sum(1 for _ in ast.walk(node))
        self.generic_visit(node)


def find_python_files(path):
    if os.path.isfile(path) and path.endswith(".py"):
        return [path]
    elif os.path.isdir(path):
        py_files = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith(".py"):
                    full_path = os.path.join(root, file)
                    py_files.append(full_path)
        return py_files
    else:
        return []


def find_non_self_methods(path):
    python_files = find_python_files(path)
    non_self_methods = []
    for filename in python_files:
        with open(filename, "r") as file:
            node = ast.parse(file.read(), filename=filename)
        checker = SelfUsageChecker()
        checker.visit(node)
        for method in checker.non_self_methods:
            non_self_methods.append([filename] + list(method))

    return non_self_methods


def process(entry):
    fname, class_name, method_name, class_children, method_children = entry
    if method_children > class_children / 2:
        return
    if method_children < 100:
        return

    fname = Path(fname)
    if "test" in fname.stem:
        return

    print(f"{fname} {class_name} {method_name} {class_children} {method_children}")

    dname = Path("tmp.benchmarks/refactor-benchmark")
    dname.mkdir(exist_ok=True)

    dname = dname / f"{fname.stem}_{class_name}_{method_name}"
    dname.mkdir(exist_ok=True)

    shutil.copy(fname, dname / fname.name)

    docs_dname = dname / ".docs"
    docs_dname.mkdir(exist_ok=True)

    ins_fname = docs_dname / "instructions.md"
    ins_fname.write_text(f"""# Refactor {class_name}.{method_name}

Refactor the `{method_name}` method in the `{class_name}` class to be a stand alone, top level function.
Name the new function `{method_name}`, exactly the same name as the existing method.
Update any existing `self.{method_name}` calls to work with the new `{method_name}` function.
""")  # noqa: E501

    test_fname = dname / f"{fname.stem}_test.py"
    test_fname.write_text(f"""
import unittest
from benchmark.refactor_tools import verify_refactor
from pathlib import Path

class TheTest(unittest.TestCase):
    def test_{method_name}(self):
        fname = Path(__file__).parent / "{fname.name}"
        method = "{method_name}"
        method_children = {method_children}

        class_name = "{class_name}"
        class_children = {class_children}

        verify_refactor(fname, method, method_children, class_name, class_children)

if __name__ == "__main__":
    unittest.main()
""")


def main(paths):
    for path in paths:
        methods = find_non_self_methods(path)
        # methods = sorted(methods, key=lambda x: x[4])

        for method in methods:
            process(method)


if __name__ == "__main__":
    main(sys.argv[1:])
@@ -149,7 +149,6 @@ urllib3==2.1.0
 virtualenv==20.25.0
     # via pre-commit
 wheel==0.42.0
-    # via pip-tools

 # The following packages are considered to be unsafe in a requirements file:
 # pip
docs/unified-diffs.md — new file (374 lines):
@@ -0,0 +1,374 @@
# Fixing GPT-4 Turbo laziness with unified diffs

![robot flowchart](../assets/benchmarks-udiff.svg)


Aider now asks GPT-4 Turbo to use
[unified diffs](https://www.gnu.org/software/diffutils/manual/html_node/Example-Unified.html)
to edit your code.
This massively reduces GPT-4 Turbo's bad habit of "lazy" coding,
where it writes half completed code filled with comments
like "...add logic here...".

Aider also has a new benchmarking suite
designed to both provoke and quantify lazy coding.
It consists of
39 python refactoring tasks,
which tend to make GPT-4 Turbo very lazy,
often resulting in comments like
"...include the original method body...".

This new laziness benchmark produced the following results with `gpt-4-1106-preview`:

- **GPT-4 Turbo only scored 15% as a baseline** using aider's existing "SEARCH/REPLACE block" edit format.
- **Aider's new unified diff edit format raised the score to 62%**.
- **No benefit from the user being blind, without hands, tipping $2000 or fearing truncated code trauma.** These widely circulated folk remedies performed no better than baseline when added to the system prompt with aider's SEARCH/REPLACE edit format. Including *all* of them still only scored at 15%.

The older `gpt-4-0613` also did better on the laziness benchmark using unified diffs:

- **The June GPT-4's baseline was 26%** using aider's existing "SEARCH/REPLACE block" edit format.
- **Aider's new unified diff edit format raised June GPT-4's score to 59%**.
- The benchmark was designed to use large files, and
28% of them are too large to fit in June GPT-4's 8k context window.
This significantly harmed the benchmark results.

Before settling on unified diffs,
I explored many other approaches including:
prompts about being tireless and diligent,
OpenAI's function/tool calling capabilities,
numerous variations on aider's existing editing formats,
line number based formats
and other diff-like formats.
The results shared here reflect
an extensive investigation and benchmark evaluations of many approaches.

Aider's new unified diff editing format
outperforms other solutions by a wide margin.
The rest of this article will describe
aider's new editing format and refactoring benchmark.
It will highlight some key design decisions,
and evaluate their significance using ablation experiments.


## Unified diff editing format

The design and implementation of aider's new unified diff editing format
helped clarify some general principles
for GPT-4 code editing:

- FAMILIAR - Choose an edit format that GPT is already familiar with.
- SIMPLE - Choose a simple format that avoids escaping, syntactic overhead and brittle specifiers like line numbers or line counts.
- HIGH LEVEL - Encourage GPT to structure edits as new versions of substantive code blocks (functions, methods, etc), not as a series of surgical/minimal changes to individual lines of code.
- FLEXIBLE - Strive to be maximally flexible when interpreting GPT's edit instructions.

A helpful shortcut here is to have empathy for GPT, and imagine you
are the one being asked to specify code edits.
Would you want to hand type a properly escaped json data structure
to invoke surgical insert, delete, replace operations on specific code line numbers?
How would you feel about any mistake causing all your work to be discarded?

GPT is quantitatively better at code editing when you reduce the
burden of formatting edits by using a familiar, simple, high level
and flexible editing format.

### Choose a familiar editing format

Unified diffs are perhaps the most common way to show
code edits, because they're the
default output format of `git diff`:

```diff
--- a/hello.py
+++ b/hello.py
@@ -1,5 +1,5 @@
 def main(args):
     # show a greeting
-    print("Hello!")
+    print("Goodbye!")
     return
```

Choosing such a popular format means that GPT has
seen *many* examples in its training data.
It's been trained to generate
text that conforms to the unified diff syntax.

Unified diffs are
usually intended to be consumed by the
[patch](https://www.gnu.org/software/diffutils/manual/html_node/Merging-with-patch.html)
program.
They need to *accurately* reflect the original and updated file contents,
otherwise the patch command will fail.
Having GPT specify changes in a format that is usually consumed by a
rigid program like patch
seems to encourage rigor.
GPT is less likely to
leave informal editing instructions in comments
or be lazy about writing all the needed code.

With unified diffs, GPT acts more like it's writing textual data intended to be read by a program,
not talking to a person.


### Use a simple editing format

Aider's [previous benchmark results](https://aider.chat/docs/benchmarks.html) made
it clear that simple editing formats
work best.
Even though OpenAI provides extensive support for
structured formats like json and function calls,
GPT is worse at editing code if you use them.
I repeated these and other similar benchmarks against GPT-4 Turbo,
and again reached these same conclusions.

Informally, this is probably because stuffing *source code* into JSON is complicated
and error prone.
Wrapping the python code
`print("On Windows use \"C:\\\"")`
as valid json is pretty painful and error prone.
Due to escaping issues GPT's code is often syntactically incorrect when it's
unpacked from JSON,
or the JSON decode just fails entirely.
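
To make the escaping burden concrete, here's what that one line of code
looks like after a round trip through the standard `json` module (this is
just an illustration of the problem, not part of aider):

```python
import json

# The line of code from above, as a raw string so it stays verbatim.
code = r'print("On Windows use \"C:\\\"")'

print(json.dumps(code))
# "print(\"On Windows use \\\"C:\\\\\\\"\")"
```

Every quote and backslash picks up another layer of escaping, and GPT has
to produce all of it perfectly by hand.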

On the other hand, the core of the unified diff format is very simple.
You include a hunk of the file that needs to be changed,
with every line prefixed by a character
to indicate unchanged, new or deleted lines.
A unified diff looks pretty much like the code it is modifying.

The one complicated piece is the line numbers found at the start
of each hunk. They look something like this: `@@ -2,4 +3,5 @@`.
GPT is terrible at working with source code line numbers.
This is a general observation about *any* use of line
numbers in editing formats,
backed up by many quantitative benchmark experiments.

You've probably ignored the line numbers in every diff you've seen,
because the diffs usually still make sense without them.
Aider tells GPT not to include line numbers,
and just interprets each hunk from the unified diffs
as a search and replace operation:

This diff:

```diff
@@ ... @@
 def main(args):
     # show a greeting
-    print("Hello!")
+    print("Goodbye!")
     return
```

Means we need to search the file for the
*space* and *minus* `-` lines:

```python
def main(args):
    # show a greeting
    print("Hello!")
    return
```

And replace them with the *space* and *plus* `+` lines:

```python
def main(args):
    # show a greeting
    print("Goodbye!")
    return
```

Simple, right?
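
As a rough sketch of that interpretation, the hunk can be split into its
"before" and "after" versions and applied as a plain string replacement.
This is an illustration of the idea, not aider's actual implementation,
and the helper name is made up for the example:

```python
def hunk_to_search_replace(hunk_lines):
    """Split a line-numberless hunk into (search, replace) text."""
    search, replace = [], []
    for line in hunk_lines:
        marker, body = line[:1], line[1:]
        if marker == "-":
            search.append(body)  # present only in the old version
        elif marker == "+":
            replace.append(body)  # present only in the new version
        else:
            search.append(body)  # context lines appear in both versions
            replace.append(body)
    return "".join(search), "".join(replace)


hunk = [
    " def main(args):\n",
    "     # show a greeting\n",
    '-    print("Hello!")\n',
    '+    print("Goodbye!")\n',
    "     return\n",
]
search, replace = hunk_to_search_replace(hunk)

# Apply the edit to the hello.py from the example above.
content = open("hello.py").read()
open("hello.py", "w").write(content.replace(search, replace))
```

The strictness of this naive version is exactly what the flexible
matching described below is meant to relax.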

### Encourage high level edits

The example unified diffs we've seen so far have all been single line changes,
which makes them pretty easy to read and understand.
Consider this slightly more complex change, which renames the variable `n` to
`number`:

```diff
@@ ... @@
-def factorial(n):
+def factorial(number):
-    if n == 0:
+    if number == 0:
         return 1
     else:
-        return n * factorial(n-1)
+        return number * factorial(number-1)
```

The following "high level diff" of the same
change is not as succinct as the minimal diff above,
but it is much easier to see two different coherent versions of the
`factorial()` function.

```diff
@@ ... @@
-def factorial(n):
-    if n == 0:
-        return 1
-    else:
-        return n * factorial(n-1)
+def factorial(number):
+    if number == 0:
+        return 1
+    else:
+        return number * factorial(number-1)
```

Aider's system prompt encourages
GPT to produce these high level diffs.
This makes GPT better at producing correct diffs, which can be successfully
applied to the original file.

**Experiments without "high level diff" prompting
produce a 30-50% increase in editing errors,**
where diffs fail to apply or apply incorrectly and
produce invalid code.
When a patch fails, aider needs to ask GPT for a corrected version of the diff.
This takes time, costs tokens and sometimes fails to produce a successful edit
even after multiple retries.

There are probably a couple of reasons why high level diffs
help:

- It's easier to produce diffs that both correctly match the original code and correctly produce the intended new code. There is less risk of GPT getting confused, compared to generating a series of surgical edits mixed into existing code.
- High level hunks often contain more lines than a surgical hunk, so they are less likely to accidentally match unrelated parts of code. This is helpful because GPT can't reliably give us line numbers to specify exactly where in the file to make changes.

### Be flexible when applying edits

GPT frequently makes imperfect diffs that won't apply cleanly.
They exhibit a variety of problems:

- GPT forgets things like comments, docstrings, blank lines, etc. Or it skips over some code that it doesn't intend to change.
- GPT forgets the leading *plus* `+` character to mark novel lines that it wants to add to the file. It incorrectly includes them with a leading *space* as if they were already there.
- GPT jumps ahead to show edits to a different part of the file without starting a new hunk with a `@@ ... @@` divider.

As an example of the first issue, consider this source code:

```python
import sys

def main(args):
    # show a greeting
    print("Hello!")
    return

main(sys.argv[1:])
```

**The diff below is missing the "show a greeting" comment line**,
and represents a common type of mistake GPT might make.
When we search for the *minus* `-` lines, we won't find them
in the original file
because of the missing comment.

```diff
@@ ... @@
-def main(args):
-    print("Hello!")
-    return
+def main(args):
+    print("Goodbye!")
+    return
```

Aider tries to be very flexible when applying diffs,
in order to handle defects.
If a hunk doesn't apply cleanly, aider uses a number of strategies:

- Normalize the hunk, by taking the *minus* `-` and *space* lines as one version of the hunk and the *space* and *plus* `+` lines as a second version and doing an actual unified diff on them.
- Try and discover new lines that GPT is trying to add but which it forgot to mark with *plus* `+` markers. This is done by diffing the *minus* `-` and *space* lines back against the original file.
- Break a large hunk apart into an overlapping sequence of smaller hunks, which each contain only one contiguous run of *plus* `+` and *minus* `-` lines. Try and apply each of these sub-hunks independently.
- Vary the size and offset of the "context window" of *space* lines from the hunk that are used to localize the edit to a specific part of the file.
- Combine the above mechanisms to progressively become more permissive about how to apply the hunk.

These flexible patching strategies are critical, and
removing them
radically increases the number of hunks which fail to apply.
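
As a sketch of the first strategy, the hunk's two implied versions can be
recovered and re-diffed with Python's standard `difflib`. Again, this is
an illustration of the idea rather than aider's exact code:

```python
import difflib


def normalize_hunk(hunk_lines):
    """Rebuild a hunk by re-diffing its before and after versions.

    This recovers a well-formed set of +/- markers even when some
    lines were mislabeled, e.g. a new line missing its leading +.
    """
    before = [line[1:] for line in hunk_lines if line[:1] in (" ", "-")]
    after = [line[1:] for line in hunk_lines if line[:1] in (" ", "+")]

    # A large context size keeps the result as one single hunk.
    n = max(len(before), len(after))
    diff = list(difflib.unified_diff(before, after, n=n))

    # Drop the ---/+++ file headers and the @@ line; keep the hunk body.
    return diff[3:]
```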

**Experiments where flexible patching is disabled show**:

- **GPT-4 Turbo's performance drops from 65% down to 56%** on the refactoring benchmark.
- **A 9X increase in editing errors** on aider's original Exercism benchmark.

## Refactoring benchmark

Aider has long used a
[benchmark suite based on 133 Exercism python exercises]().
But these are mostly small coding problems,
usually requiring only a few dozen lines of code.
GPT-4 Turbo is typically only lazy on 2-3 of these exercises:
the ones with the most code and which involve refactoring.

Based on this observation, I set out to build a benchmark based on refactoring
a non-trivial amount of code found in fairly large files.
To do this, I used python's `ast` module to analyze the
[Django repository](https://github.com/django/django) to:

- Find source files that contain class methods which are non-trivial, having more than 100 AST nodes in their implementation.
- Focus on methods that are part of a larger class, which has at least twice as much code as the method itself.
- Find methods that don't use their `self` parameter, so they can be trivially refactored out of the class.

We can then turn each of these source files into a task for the benchmark,
where we ask GPT to do something like:

> Refactor the `_set_csrf_cookie` method in the `CsrfViewMiddleware` class to be a stand alone, top level function.
> Name the new function `_set_csrf_cookie`, exactly the same name as the existing method.
> Update any existing `self._set_csrf_cookie` calls to work with the new `_set_csrf_cookie` function.

A [simple python AST scanning script](https://github.com/paul-gauthier/aider/blob/main/benchmark/refactor_tools.py)
found 39 suitable files
and packaged them up as benchmark tasks.
Each task has a test
that checks if the refactor
was performed roughly correctly:

- The updated source file must parse as valid python, to surface misapplied edits which corrupt the file.
- The target method must now exist as a top-level function in the file.
- This new top-level function must contain approximately the same number of AST nodes as the original class method. This ensures that GPT didn't elide code and replace it with comments.
- The original class must still be present in the file, and it must be smaller by about the number of AST nodes in the method which was removed. This helps confirm that the method was removed from the class, without other significant modifications.

To be clear, this is not a rigorous test that the refactor was performed correctly.
But it does serve as a basic sanity check that the refactor was essentially done as a cut & paste, without eliding any code as comments.
And it correlates well with other laziness metrics
gathered during benchmarking like the
introduction of new comments that contain "...".

The result is a pragmatic
[benchmark suite that provokes, detects and quantifies GPT coding laziness](https://github.com/paul-gauthier/refactor-benchmark).


## Conclusions and future work

Based on the refactor benchmark results,
aider's new unified diff format seems very effective at stopping
GPT-4 Turbo from being a lazy coder.

Unified diffs were one of the very first edit formats I tried
when originally building aider.
I think a lot of other AI coding assistant projects have also
tried going down this path.
It seems like any naive or direct use of structured diff formats
is pretty much doomed to failure.
But the techniques described here and
incorporated into aider provide
a highly effective way to harness GPT's knowledge of unified diffs.

There could be significant benefits to
fine tuning models on
aider's simple, high level style of unified diffs.
Dropping line numbers from the hunk headers and focusing on diffs of
semantically coherent chunks of code
seems to be an important part of successful GPT code editing.
Most LLMs will have already seen plenty of unified diffs
in their normal training data, and so should be
very amenable to fine tuning towards this
particular diff style.
@@ -20,3 +20,4 @@ sounddevice
 soundfile
 PyYAML
 Pillow
+diff-match-patch
@@ -29,6 +29,8 @@ charset-normalizer==3.3.2
     # via requests
 configargparse==1.7
     # via -r requirements.in
+diff-match-patch==20230430
+    # via -r requirements.in
 diskcache==5.6.3
     # via -r requirements.in
 distro==1.8.0
|
@ -10,7 +10,7 @@ from aider import models
|
||||||
from aider.coders import Coder
|
from aider.coders import Coder
|
||||||
from aider.dump import dump # noqa: F401
|
from aider.dump import dump # noqa: F401
|
||||||
from aider.io import InputOutput
|
from aider.io import InputOutput
|
||||||
from tests.utils import ChdirTemporaryDirectory, GitTemporaryDirectory
|
from aider.utils import ChdirTemporaryDirectory, GitTemporaryDirectory
|
||||||
|
|
||||||
|
|
||||||
class TestCoder(unittest.TestCase):
|
class TestCoder(unittest.TestCase):
|
||||||
|
|
|
@ -14,7 +14,7 @@ from aider.coders import Coder
|
||||||
from aider.commands import Commands
|
from aider.commands import Commands
|
||||||
from aider.dump import dump # noqa: F401
|
from aider.dump import dump # noqa: F401
|
||||||
from aider.io import InputOutput
|
from aider.io import InputOutput
|
||||||
from tests.utils import ChdirTemporaryDirectory, GitTemporaryDirectory, make_repo
|
from aider.utils import ChdirTemporaryDirectory, GitTemporaryDirectory, make_repo
|
||||||
|
|
||||||
|
|
||||||
class TestCommands(TestCase):
|
class TestCommands(TestCase):
|
||||||
|
|
|
@ -4,7 +4,7 @@ from pathlib import Path
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from aider.io import AutoCompleter, InputOutput
|
from aider.io import AutoCompleter, InputOutput
|
||||||
from tests.utils import ChdirTemporaryDirectory
|
from aider.utils import ChdirTemporaryDirectory
|
||||||
|
|
||||||
|
|
||||||
class TestInputOutput(unittest.TestCase):
|
class TestInputOutput(unittest.TestCase):
|
||||||
|
|
|
@ -13,7 +13,7 @@ from prompt_toolkit.output import DummyOutput
|
||||||
from aider.dump import dump # noqa: F401
|
from aider.dump import dump # noqa: F401
|
||||||
from aider.io import InputOutput
|
from aider.io import InputOutput
|
||||||
from aider.main import check_gitignore, main, setup_git
|
from aider.main import check_gitignore, main, setup_git
|
||||||
from tests.utils import GitTemporaryDirectory, make_repo
|
from aider.utils import GitTemporaryDirectory, make_repo
|
||||||
|
|
||||||
|
|
||||||
class TestMain(TestCase):
|
class TestMain(TestCase):
|
||||||
|
|
|
@ -9,7 +9,7 @@ import git
|
||||||
from aider.dump import dump # noqa: F401
|
from aider.dump import dump # noqa: F401
|
||||||
from aider.io import InputOutput
|
from aider.io import InputOutput
|
||||||
from aider.repo import GitRepo
|
from aider.repo import GitRepo
|
||||||
from tests.utils import GitTemporaryDirectory
|
from aider.utils import GitTemporaryDirectory
|
||||||
|
|
||||||
|
|
||||||
class TestRepo(unittest.TestCase):
|
class TestRepo(unittest.TestCase):
|
||||||
|
|
|
@ -4,7 +4,7 @@ import unittest
|
||||||
from aider.dump import dump # noqa: F401
|
from aider.dump import dump # noqa: F401
|
||||||
from aider.io import InputOutput
|
from aider.io import InputOutput
|
||||||
from aider.repomap import RepoMap
|
from aider.repomap import RepoMap
|
||||||
from tests.utils import IgnorantTemporaryDirectory
|
from aider.utils import IgnorantTemporaryDirectory
|
||||||
|
|
||||||
|
|
||||||
class TestRepoMap(unittest.TestCase):
|
class TestRepoMap(unittest.TestCase):
|
||||||
|
|
|
@@ -1,56 +0,0 @@
-import os
-import tempfile
-
-import git
-
-from aider.dump import dump  # noqa: F401
-
-
-class IgnorantTemporaryDirectory:
-    def __init__(self):
-        self.temp_dir = tempfile.TemporaryDirectory()
-
-    def __enter__(self):
-        return self.temp_dir.__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        try:
-            self.temp_dir.__exit__(exc_type, exc_val, exc_tb)
-        except (OSError, PermissionError):
-            pass  # Ignore errors (Windows)
-
-
-class ChdirTemporaryDirectory(IgnorantTemporaryDirectory):
-    def __init__(self):
-        self.cwd = os.getcwd()
-        super().__init__()
-
-    def __enter__(self):
-        res = super().__enter__()
-        os.chdir(self.temp_dir.name)
-        return res
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        os.chdir(self.cwd)
-        super().__exit__(exc_type, exc_val, exc_tb)
-
-
-class GitTemporaryDirectory(ChdirTemporaryDirectory):
-    def __enter__(self):
-        res = super().__enter__()
-        self.repo = make_repo()
-        return res
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        del self.repo
-        super().__exit__(exc_type, exc_val, exc_tb)
-
-
-def make_repo(path=None):
-    if not path:
-        path = "."
-    repo = git.Repo.init(path)
-    repo.config_writer().set_value("user", "name", "Test User").release()
-    repo.config_writer().set_value("user", "email", "testuser@example.com").release()
-
-    return repo