import difflib import math import re import sys from difflib import SequenceMatcher from pathlib import Path from aider import utils from ..dump import dump # noqa: F401 from .base_coder import Coder from .editblock_prompts import EditBlockPrompts class EditBlockCoder(Coder): """A coder that uses search/replace blocks for code modifications.""" edit_format = "diff" gpt_prompts = EditBlockPrompts() def get_edits(self): content = self.partial_response_content # might raise ValueError for malformed ORIG/UPD blocks edits = list( find_original_update_blocks( content, self.fence, self.get_inchat_relative_files(), ) ) self.shell_commands += [edit[1] for edit in edits if edit[0] is None] edits = [edit for edit in edits if edit[0] is not None] return edits def apply_edits_dry_run(self, edits): return self.apply_edits(edits, dry_run=True) def apply_edits(self, edits, dry_run=False): failed = [] passed = [] updated_edits = [] for edit in edits: path, original, updated = edit full_path = self.abs_root_path(path) new_content = None if Path(full_path).exists(): content = self.io.read_text(full_path) new_content = do_replace(full_path, content, original, updated, self.fence) # If the edit failed, and # this is not a "create a new file" with an empty original... # https://github.com/Aider-AI/aider/issues/2258 if not new_content and original.strip(): # try patching any of the other files in the chat for full_path in self.abs_fnames: content = self.io.read_text(full_path) new_content = do_replace(full_path, content, original, updated, self.fence) if new_content: path = self.get_rel_fname(full_path) break updated_edits.append((path, original, updated)) if new_content: if not dry_run: self.io.write_text(full_path, new_content) passed.append(edit) else: failed.append(edit) if dry_run: return updated_edits if not failed: return blocks = "block" if len(failed) == 1 else "blocks" res = f"# {len(failed)} SEARCH/REPLACE {blocks} failed to match!\n" for edit in failed: path, original, updated = edit full_path = self.abs_root_path(path) content = self.io.read_text(full_path) res += f""" ## SearchReplaceNoExactMatch: This SEARCH block failed to exactly match lines in {path} <<<<<<< SEARCH {original}======= {updated}>>>>>>> REPLACE """ did_you_mean = find_similar_lines(original, content) if did_you_mean: res += f"""Did you mean to match some of these actual lines from {path}? {self.fence[0]} {did_you_mean} {self.fence[1]} """ if updated in content and updated: res += f"""Are you sure you need this SEARCH/REPLACE block? The REPLACE lines are already in {path}! """ res += ( "The SEARCH section must exactly match an existing block of lines including all white" " space, comments, indentation, docstrings, etc\n" ) if passed: pblocks = "block" if len(passed) == 1 else "blocks" res += f""" # The other {len(passed)} SEARCH/REPLACE {pblocks} were applied successfully. Don't re-send them. Just reply with fixed versions of the {blocks} above that failed to match. """ raise ValueError(res) def prep(content): if content and not content.endswith("\n"): content += "\n" lines = content.splitlines(keepends=True) return content, lines def perfect_or_whitespace(whole_lines, part_lines, replace_lines): # Try for a perfect match res = perfect_replace(whole_lines, part_lines, replace_lines) if res: return res # Try being flexible about leading whitespace res = replace_part_with_missing_leading_whitespace(whole_lines, part_lines, replace_lines) if res: return res def perfect_replace(whole_lines, part_lines, replace_lines): part_tup = tuple(part_lines) part_len = len(part_lines) for i in range(len(whole_lines) - part_len + 1): whole_tup = tuple(whole_lines[i : i + part_len]) if part_tup == whole_tup: res = whole_lines[:i] + replace_lines + whole_lines[i + part_len :] return "".join(res) def replace_most_similar_chunk(whole, part, replace): """Best efforts to find the `part` lines in `whole` and replace them with `replace`""" whole, whole_lines = prep(whole) part, part_lines = prep(part) replace, replace_lines = prep(replace) res = perfect_or_whitespace(whole_lines, part_lines, replace_lines) if res: return res # drop leading empty line, GPT sometimes adds them spuriously (issue #25) if len(part_lines) > 2 and not part_lines[0].strip(): skip_blank_line_part_lines = part_lines[1:] res = perfect_or_whitespace(whole_lines, skip_blank_line_part_lines, replace_lines) if res: return res # Try to handle when it elides code with ... try: res = try_dotdotdots(whole, part, replace) if res: return res except ValueError: pass return # Try fuzzy matching res = replace_closest_edit_distance(whole_lines, part, part_lines, replace_lines) if res: return res def try_dotdotdots(whole, part, replace): """ See if the edit block has ... lines. If not, return none. If yes, try and do a perfect edit with the ... chunks. If there's a mismatch or otherwise imperfect edit, raise ValueError. If perfect edit succeeds, return the updated whole. """ dots_re = re.compile(r"(^\s*\.\.\.\n)", re.MULTILINE | re.DOTALL) part_pieces = re.split(dots_re, part) replace_pieces = re.split(dots_re, replace) if len(part_pieces) != len(replace_pieces): raise ValueError("Unpaired ... in SEARCH/REPLACE block") if len(part_pieces) == 1: # no dots in this edit block, just return None return # Compare odd strings in part_pieces and replace_pieces all_dots_match = all(part_pieces[i] == replace_pieces[i] for i in range(1, len(part_pieces), 2)) if not all_dots_match: raise ValueError("Unmatched ... in SEARCH/REPLACE block") part_pieces = [part_pieces[i] for i in range(0, len(part_pieces), 2)] replace_pieces = [replace_pieces[i] for i in range(0, len(replace_pieces), 2)] pairs = zip(part_pieces, replace_pieces) for part, replace in pairs: if not part and not replace: continue if not part and replace: if not whole.endswith("\n"): whole += "\n" whole += replace continue if whole.count(part) == 0: raise ValueError if whole.count(part) > 1: raise ValueError whole = whole.replace(part, replace, 1) return whole def replace_part_with_missing_leading_whitespace(whole_lines, part_lines, replace_lines): # GPT often messes up leading whitespace. # It usually does it uniformly across the ORIG and UPD blocks. # Either omitting all leading whitespace, or including only some of it. # Outdent everything in part_lines and replace_lines by the max fixed amount possible leading = [len(p) - len(p.lstrip()) for p in part_lines if p.strip()] + [ len(p) - len(p.lstrip()) for p in replace_lines if p.strip() ] if leading and min(leading): num_leading = min(leading) part_lines = [p[num_leading:] if p.strip() else p for p in part_lines] replace_lines = [p[num_leading:] if p.strip() else p for p in replace_lines] # can we find an exact match not including the leading whitespace num_part_lines = len(part_lines) for i in range(len(whole_lines) - num_part_lines + 1): add_leading = match_but_for_leading_whitespace( whole_lines[i : i + num_part_lines], part_lines ) if add_leading is None: continue replace_lines = [add_leading + rline if rline.strip() else rline for rline in replace_lines] whole_lines = whole_lines[:i] + replace_lines + whole_lines[i + num_part_lines :] return "".join(whole_lines) return None def match_but_for_leading_whitespace(whole_lines, part_lines): num = len(whole_lines) # does the non-whitespace all agree? if not all(whole_lines[i].lstrip() == part_lines[i].lstrip() for i in range(num)): return # are they all offset the same? add = set( whole_lines[i][: len(whole_lines[i]) - len(part_lines[i])] for i in range(num) if whole_lines[i].strip() ) if len(add) != 1: return return add.pop() def replace_closest_edit_distance(whole_lines, part, part_lines, replace_lines): similarity_thresh = 0.8 max_similarity = 0 most_similar_chunk_start = -1 most_similar_chunk_end = -1 scale = 0.1 min_len = math.floor(len(part_lines) * (1 - scale)) max_len = math.ceil(len(part_lines) * (1 + scale)) for length in range(min_len, max_len): for i in range(len(whole_lines) - length + 1): chunk = whole_lines[i : i + length] chunk = "".join(chunk) similarity = SequenceMatcher(None, chunk, part).ratio() if similarity > max_similarity and similarity: max_similarity = similarity most_similar_chunk_start = i most_similar_chunk_end = i + length if max_similarity < similarity_thresh: return modified_whole = ( whole_lines[:most_similar_chunk_start] + replace_lines + whole_lines[most_similar_chunk_end:] ) modified_whole = "".join(modified_whole) return modified_whole DEFAULT_FENCE = ("`" * 3, "`" * 3) def strip_quoted_wrapping(res, fname=None, fence=DEFAULT_FENCE): """ Given an input string which may have extra "wrapping" around it, remove the wrapping. For example: filename.ext ``` We just want this content Not the filename and triple quotes ``` """ if not res: return res res = res.splitlines() if fname and res[0].strip().endswith(Path(fname).name): res = res[1:] if res[0].startswith(fence[0]) and res[-1].startswith(fence[1]): res = res[1:-1] res = "\n".join(res) if res and res[-1] != "\n": res += "\n" return res def do_replace(fname, content, before_text, after_text, fence=None): before_text = strip_quoted_wrapping(before_text, fname, fence) after_text = strip_quoted_wrapping(after_text, fname, fence) fname = Path(fname) # does it want to make a new file? if not fname.exists() and not before_text.strip(): fname.touch() content = "" if content is None: return if not before_text.strip(): # append to existing file, or start a new file new_content = content + after_text else: new_content = replace_most_similar_chunk(content, before_text, after_text) return new_content HEAD = r"^<{5,9} SEARCH\s*$" DIVIDER = r"^={5,9}\s*$" UPDATED = r"^>{5,9} REPLACE\s*$" HEAD_ERR = "<<<<<<< SEARCH" DIVIDER_ERR = "=======" UPDATED_ERR = ">>>>>>> REPLACE" separators = "|".join([HEAD, DIVIDER, UPDATED]) split_re = re.compile(r"^((?:" + separators + r")[ ]*\n)", re.MULTILINE | re.DOTALL) missing_filename_err = ( "Bad/missing filename. The filename must be alone on the line before the opening fence" " {fence[0]}" ) # Always be willing to treat triple-backticks as a fence when searching for filenames triple_backticks = "`" * 3 def strip_filename(filename, fence): filename = filename.strip() if filename == "...": return start_fence = fence[0] if filename.startswith(start_fence) or filename.startswith(triple_backticks): return filename = filename.rstrip(":") filename = filename.lstrip("#") filename = filename.strip() filename = filename.strip("`") filename = filename.strip("*") # https://github.com/Aider-AI/aider/issues/1158 # filename = filename.replace("\\_", "_") return filename def find_original_update_blocks(content, fence=DEFAULT_FENCE, valid_fnames=None): lines = content.splitlines(keepends=True) i = 0 current_filename = None head_pattern = re.compile(HEAD) divider_pattern = re.compile(DIVIDER) updated_pattern = re.compile(UPDATED) while i < len(lines): line = lines[i] # Check for shell code blocks shell_starts = [ "```bash", "```sh", "```shell", "```cmd", "```batch", "```powershell", "```ps1", "```zsh", "```fish", "```ksh", "```csh", "```tcsh", ] next_is_editblock = i + 1 < len(lines) and head_pattern.match(lines[i + 1].strip()) if any(line.strip().startswith(start) for start in shell_starts) and not next_is_editblock: shell_content = [] i += 1 while i < len(lines) and not lines[i].strip().startswith("```"): shell_content.append(lines[i]) i += 1 if i < len(lines) and lines[i].strip().startswith("```"): i += 1 # Skip the closing ``` yield None, "".join(shell_content) continue # Check for SEARCH/REPLACE blocks if head_pattern.match(line.strip()): try: # if next line after HEAD exists and is DIVIDER, it's a new file if i + 1 < len(lines) and divider_pattern.match(lines[i + 1].strip()): filename = find_filename(lines[max(0, i - 3) : i], fence, None) else: filename = find_filename(lines[max(0, i - 3) : i], fence, valid_fnames) if not filename: if current_filename: filename = current_filename else: raise ValueError(missing_filename_err.format(fence=fence)) current_filename = filename original_text = [] i += 1 while i < len(lines) and not divider_pattern.match(lines[i].strip()): original_text.append(lines[i]) i += 1 if i >= len(lines) or not divider_pattern.match(lines[i].strip()): raise ValueError(f"Expected `{DIVIDER_ERR}`") updated_text = [] i += 1 while i < len(lines) and not ( updated_pattern.match(lines[i].strip()) or divider_pattern.match(lines[i].strip()) ): updated_text.append(lines[i]) i += 1 if i >= len(lines) or not ( updated_pattern.match(lines[i].strip()) or divider_pattern.match(lines[i].strip()) ): raise ValueError(f"Expected `{UPDATED_ERR}` or `{DIVIDER_ERR}`") yield filename, "".join(original_text), "".join(updated_text) except ValueError as e: processed = "".join(lines[: i + 1]) err = e.args[0] raise ValueError(f"{processed}\n^^^ {err}") i += 1 def find_filename(lines, fence, valid_fnames): """ Deepseek Coder v2 has been doing this: ```python word_count.py ``` ```python <<<<<<< SEARCH ... This is a more flexible search back for filenames. """ if valid_fnames is None: valid_fnames = [] # Go back through the 3 preceding lines lines.reverse() lines = lines[:3] filenames = [] for line in lines: # If we find a filename, done filename = strip_filename(line, fence) if filename: filenames.append(filename) # Only continue as long as we keep seeing fences if not line.startswith(fence[0]) and not line.startswith(triple_backticks): break if not filenames: return # pick the *best* filename found # Check for exact match first for fname in filenames: if fname in valid_fnames: return fname # Check for partial match (basename match) for fname in filenames: for vfn in valid_fnames: if fname == Path(vfn).name: return vfn # Perform fuzzy matching with valid_fnames for fname in filenames: close_matches = difflib.get_close_matches(fname, valid_fnames, n=1, cutoff=0.8) if len(close_matches) == 1: return close_matches[0] # If no fuzzy match, look for a file w/extension for fname in filenames: if "." in fname: return fname if filenames: return filenames[0] def find_similar_lines(search_lines, content_lines, threshold=0.6): search_lines = search_lines.splitlines() content_lines = content_lines.splitlines() best_ratio = 0 best_match = None for i in range(len(content_lines) - len(search_lines) + 1): chunk = content_lines[i : i + len(search_lines)] ratio = SequenceMatcher(None, search_lines, chunk).ratio() if ratio > best_ratio: best_ratio = ratio best_match = chunk best_match_i = i if best_ratio < threshold: return "" if best_match[0] == search_lines[0] and best_match[-1] == search_lines[-1]: return "\n".join(best_match) N = 5 best_match_end = min(len(content_lines), best_match_i + len(search_lines) + N) best_match_i = max(0, best_match_i - N) best = content_lines[best_match_i:best_match_end] return "\n".join(best) def main(): history_md = Path(sys.argv[1]).read_text() if not history_md: return messages = utils.split_chat_history_markdown(history_md) for msg in messages: msg = msg["content"] edits = list(find_original_update_blocks(msg)) for fname, before, after in edits: # Compute diff diff = difflib.unified_diff( before.splitlines(keepends=True), after.splitlines(keepends=True), fromfile="before", tofile="after", ) diff = "".join(diff) dump(before) dump(after) dump(diff) if __name__ == "__main__": main()