Merge branch 'sitter-map'

This commit is contained in:
Paul Gauthier 2023-10-20 09:57:45 -07:00
commit 815fdaabc3
28 changed files with 1074 additions and 288 deletions

View file

@ -8,7 +8,7 @@ GPT to edit the code and your own editor to make changes yourself.
Aider makes sure edits from you and GPT are
[committed to git](https://aider.chat/docs/faq.html#how-does-aider-use-git)
with sensible commit messages.
Aider is unique in that it [works well with pre-existing, larger codebases](https://aider.chat/docs/ctags.html).
Aider is unique in that it [works well with pre-existing, larger codebases](https://aider.chat/docs/repomap.html).
<p align="center">
<img src="assets/screencast.svg" alt="aider screencast">
@ -61,7 +61,7 @@ Here are some example transcripts that show how you can chat with `aider` to wri
* [**Complex Multi-file Change with Debugging**](https://aider.chat/examples/complex-change.html): GPT makes a complex code change that is coordinated across multiple source files, and resolves bugs by reviewing error output and doc snippets.
* [**Create a Black Box Test Case**](https://aider.chat/examples/add-test.html): GPT creates a "black box" test case without access to the source of the method being tested, using only a
[high level map of the repository based on ctags](https://aider.chat/docs/ctags.html).
[high level map of the repository based on tree-sitter](https://aider.chat/docs/repomap.html).
You can find more chat transcripts on the [examples page](https://aider.chat/examples/).
@ -73,7 +73,7 @@ You can find more chat transcripts on the [examples page](https://aider.chat/exa
* Aider will apply the edits suggested by GPT directly to your source files.
* Aider will [automatically commit each changeset to your local git repo](https://aider.chat/docs/faq.html#how-does-aider-use-git) with a descriptive commit message. These frequent, automatic commits provide a safety net. It's easy to undo changes or use standard git workflows to manage longer sequences of changes.
* You can use aider with multiple source files at once, so GPT can make coordinated code changes across all of them in a single changeset/commit.
* Aider can [give *GPT-4* a map of your entire git repo](https://aider.chat/docs/ctags.html), which helps it understand and modify large codebases.
* Aider can [give *GPT-4* a map of your entire git repo](https://aider.chat/docs/repomap.html), which helps it understand and modify large codebases.
* You can also edit files by hand using your editor while chatting with aider. Aider will notice these out-of-band edits and keep GPT up to date with the latest versions of your files. This lets you bounce back and forth between the aider chat and your editor, to collaboratively code with GPT.
@ -134,7 +134,7 @@ Aider has some ability to help GPT figure out which files to edit all by itself,
* Use Meta-ENTER (Esc+ENTER in some environments) to enter multiline chat messages. Or enter `{` alone on the first line to start a multiline message and `}` alone on the last line to end it.
* If your code is throwing an error, share the error output with GPT using `/run` or by pasting it into the chat. Let GPT figure out and fix the bug.
* GPT knows about a lot of standard tools and libraries, but may get some of the fine details wrong about APIs and function arguments. You can paste doc snippets into the chat to resolve these issues.
* GPT can only see the content of the files you specifically "add to the chat". Aider also sends GPT-4 a [map of your entire git repo](https://aider.chat/docs/ctags.html). So GPT may ask to see additional files if it feels that's needed for your requests.
* GPT can only see the content of the files you specifically "add to the chat". Aider also sends GPT-4 a [map of your entire git repo](https://aider.chat/docs/repomap.html). So GPT may ask to see additional files if it feels that's needed for your requests.
* I also shared some general [GPT coding tips on Hacker News](https://news.ycombinator.com/item?id=36211879).

View file

@ -177,17 +177,10 @@ class Coder:
self.verbose,
)
if self.repo_map.use_ctags:
self.io.tool_output(f"Repo-map: universal-ctags using {map_tokens} tokens")
elif not self.repo_map.has_ctags and map_tokens > 0:
self.io.tool_output(
f"Repo-map: basic using {map_tokens} tokens"
f" ({self.repo_map.ctags_disabled_reason})"
)
else:
self.io.tool_output("Repo-map: disabled because map_tokens == 0")
if map_tokens > 0:
self.io.tool_output(f"Repo-map: using {map_tokens} tokens")
else:
self.io.tool_output("Repo-map: disabled")
self.io.tool_output("Repo-map: disabled because map_tokens == 0")
for fname in self.get_inchat_relative_files():
self.io.tool_output(f"Added {fname} to the chat.")

View file

@ -0,0 +1,46 @@
(class_declaration
name: (identifier) @name.definition.class
) @definition.class
(class_declaration
bases: (base_list (_) @name.reference.class)
) @reference.class
(interface_declaration
name: (identifier) @name.definition.interface
) @definition.interface
(interface_declaration
bases: (base_list (_) @name.reference.interface)
) @reference.interface
(method_declaration
name: (identifier) @name.definition.method
) @definition.method
(object_creation_expression
type: (identifier) @name.reference.class
) @reference.class
(type_parameter_constraints_clause
target: (identifier) @name.reference.class
) @reference.class
(type_constraint
type: (identifier) @name.reference.class
) @reference.class
(variable_declaration
type: (identifier) @name.reference.class
) @reference.class
(invocation_expression
function:
(member_access_expression
name: (identifier) @name.reference.send
)
) @reference.send
(namespace_declaration
name: (identifier) @name.definition.module
) @definition.module

View file

@ -0,0 +1,9 @@
; Tag query for C: every pattern below emits a definition tag; this file
; contains no reference patterns.

; A named struct that has a body is tagged as a class definition.
(struct_specifier name: (type_identifier) @name.definition.class body:(_)) @definition.class
; A named union used as the type of a declaration is tagged as a class definition.
(declaration type: (union_specifier name: (type_identifier) @name.definition.class)) @definition.class
; A function declarator whose declarator is a plain identifier is a function definition.
(function_declarator declarator: (identifier) @name.definition.function) @definition.function
; The declared name of a type_definition node is a type definition.
(type_definition declarator: (type_identifier) @name.definition.type) @definition.type
; A named enum is tagged as a type definition.
(enum_specifier name: (type_identifier) @name.definition.type) @definition.type

View file

@ -0,0 +1,15 @@
; Tag query for C++: every pattern below emits a definition tag; this file
; contains no reference patterns.

; A named struct that has a body is tagged as a class definition.
(struct_specifier name: (type_identifier) @name.definition.class body:(_)) @definition.class
; A named union used as the type of a declaration is tagged as a class definition.
(declaration type: (union_specifier name: (type_identifier) @name.definition.class)) @definition.class
; Function declarators named by a plain identifier or by a field identifier
; are both tagged as function definitions.
(function_declarator declarator: (identifier) @name.definition.function) @definition.function
(function_declarator declarator: (field_identifier) @name.definition.function) @definition.function
; A declarator with a namespace-qualified name is tagged as a method definition;
; the qualifying namespace identifier is captured separately as @scope.
(function_declarator declarator: (qualified_identifier scope: (namespace_identifier) @scope name: (identifier) @name.definition.method)) @definition.method
; The declared name of a type_definition node is a type definition.
(type_definition declarator: (type_identifier) @name.definition.type) @definition.type
; A named enum is tagged as a type definition.
(enum_specifier name: (type_identifier) @name.definition.type) @definition.type
; A named class specifier is tagged as a class definition.
(class_specifier name: (type_identifier) @name.definition.class) @definition.class

View file

@ -0,0 +1,5 @@
;; defun/defsubst
(function_definition name: (symbol) @name.definition.function) @definition.function
;; Treat macros as function definitions for the sake of TAGS.
(macro_definition name: (symbol) @name.definition.function) @definition.function

View file

@ -0,0 +1,54 @@
; Definitions
; * modules and protocols
(call
target: (identifier) @ignore
(arguments (alias) @name.definition.module)
(#match? @ignore "^(defmodule|defprotocol)$")) @definition.module
; * functions/macros
(call
target: (identifier) @ignore
(arguments
[
; zero-arity functions with no parentheses
(identifier) @name.definition.function
; regular function clause
(call target: (identifier) @name.definition.function)
; function clause with a guard clause
(binary_operator
left: (call target: (identifier) @name.definition.function)
operator: "when")
])
(#match? @ignore "^(def|defp|defdelegate|defguard|defguardp|defmacro|defmacrop|defn|defnp)$")) @definition.function
; References
; ignore calls to kernel/special-forms keywords
(call
target: (identifier) @ignore
(#match? @ignore "^(def|defp|defdelegate|defguard|defguardp|defmacro|defmacrop|defn|defnp|defmodule|defprotocol|defimpl|defstruct|defexception|defoverridable|alias|case|cond|else|for|if|import|quote|raise|receive|require|reraise|super|throw|try|unless|unquote|unquote_splicing|use|with)$"))
; ignore module attributes
(unary_operator
operator: "@"
operand: (call
target: (identifier) @ignore))
; * function call
(call
target: [
; local
(identifier) @name.reference.call
; remote
(dot
right: (identifier) @name.reference.call)
]) @reference.call
; * pipe into function call
(binary_operator
operator: "|>"
right: (identifier) @name.reference.call) @reference.call
; * modules
(alias) @name.reference.module @reference.module

View file

@ -0,0 +1,19 @@
; Tag query for Elm.
; Each pattern captures the identifier as @name.<kind>.<category> and the
; enclosing node as @<kind>.<category>.

; Function definitions
(value_declaration (function_declaration_left (lower_case_identifier) @name.definition.function)) @definition.function

; Function references: call expressions, exposed values and type annotations.
; (The exposed_value pattern previously had a surplus closing parenthesis,
; which left the query unbalanced.)
(function_call_expr (value_expr (value_qid) @name.reference.function)) @reference.function
(exposed_value (lower_case_identifier) @name.reference.function) @reference.function
(type_annotation ((lower_case_identifier) @name.reference.function) (colon)) @reference.function

; Type definitions and references.
; (The exposed_type pattern previously had a surplus closing parenthesis.)
(type_declaration ((upper_case_identifier) @name.definition.type) ) @definition.type
(type_ref (upper_case_qid (upper_case_identifier) @name.reference.type)) @reference.type
(exposed_type (upper_case_identifier) @name.reference.type) @reference.type

; Union variant definitions and references
(type_declaration (union_variant (upper_case_identifier) @name.definition.union)) @definition.union
(value_expr (upper_case_qid (upper_case_identifier) @name.reference.union)) @reference.union

; Module definitions: the whole qualified id is captured as the name
(module_declaration
  (upper_case_qid (upper_case_identifier)) @name.definition.module
) @definition.module

View file

@ -0,0 +1,30 @@
; Tag query for Go.

; Function definitions, with any immediately preceding comments captured as @doc.
; #strip! removes the leading "//" from the doc text; #select-adjacent! keeps
; only the comment nodes adjacent to the definition (the same directive the
; JavaScript and Ruby queries in this package use).
(
  (comment)* @doc
  .
  (function_declaration
    name: (identifier) @name.definition.function) @definition.function
  (#strip! @doc "^//\\s*")
  (#select-adjacent! @doc @definition.function)
)

; Method definitions, handled the same way as functions.
(
  (comment)* @doc
  .
  (method_declaration
    name: (field_identifier) @name.definition.method) @definition.method
  (#strip! @doc "^//\\s*")
  (#select-adjacent! @doc @definition.method)
)

; Call references: plain identifiers, selector expressions (x.f), and the
; parenthesized form of either.
(call_expression
  function: [
    (identifier) @name.reference.call
    (parenthesized_expression (identifier) @name.reference.call)
    (selector_expression field: (field_identifier) @name.reference.call)
    (parenthesized_expression (selector_expression field: (field_identifier) @name.reference.call))
  ]) @reference.call

; Type definitions
(type_spec
  name: (type_identifier) @name.definition.type) @definition.type

; Every type identifier is also recorded as a type reference
(type_identifier) @name.reference.type @reference.type

View file

@ -0,0 +1,20 @@
(class_declaration
name: (identifier) @name.definition.class) @definition.class
(method_declaration
name: (identifier) @name.definition.method) @definition.method
(method_invocation
name: (identifier) @name.reference.call
arguments: (argument_list) @reference.call)
(interface_declaration
name: (identifier) @name.definition.interface) @definition.interface
(type_list
(type_identifier) @name.reference.implementation) @reference.implementation
(object_creation_expression
type: (type_identifier) @name.reference.class) @reference.class
(superclass (type_identifier) @name.reference.class) @reference.class

View file

@ -0,0 +1,88 @@
(
(comment)* @doc
.
(method_definition
name: (property_identifier) @name.definition.method) @definition.method
(#not-eq? @name.definition.method "constructor")
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.method)
)
(
(comment)* @doc
.
[
(class
name: (_) @name.definition.class)
(class_declaration
name: (_) @name.definition.class)
] @definition.class
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.class)
)
(
(comment)* @doc
.
[
(function
name: (identifier) @name.definition.function)
(function_declaration
name: (identifier) @name.definition.function)
(generator_function
name: (identifier) @name.definition.function)
(generator_function_declaration
name: (identifier) @name.definition.function)
] @definition.function
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.function)
)
(
(comment)* @doc
.
(lexical_declaration
(variable_declarator
name: (identifier) @name.definition.function
value: [(arrow_function) (function)]) @definition.function)
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.function)
)
(
(comment)* @doc
.
(variable_declaration
(variable_declarator
name: (identifier) @name.definition.function
value: [(arrow_function) (function)]) @definition.function)
(#strip! @doc "^[\\s\\*/]+|^[\\s\\*/]$")
(#select-adjacent! @doc @definition.function)
)
(assignment_expression
left: [
(identifier) @name.definition.function
(member_expression
property: (property_identifier) @name.definition.function)
]
right: [(arrow_function) (function)]
) @definition.function
(pair
key: (property_identifier) @name.definition.function
value: [(arrow_function) (function)]) @definition.function
(
(call_expression
function: (identifier) @name.reference.call) @reference.call
(#not-match? @name.reference.call "^(require)$")
)
(call_expression
function: (member_expression
property: (property_identifier) @name.reference.call)
arguments: (_) @reference.call)
(new_expression
constructor: (_) @name.reference.class) @reference.class

View file

@ -0,0 +1,116 @@
; Modules
;--------
(
(comment)? @doc .
(module_definition (module_binding (module_name) @name.definition.module) @definition.module)
(#strip! @doc "^\\(\\*\\*?\\s*|\\s\\*\\)$")
)
(module_path (module_name) @name.reference.module) @reference.module
; Modules types
;--------------
(
(comment)? @doc .
(module_type_definition (module_type_name) @name.definition.interface) @definition.interface
(#strip! @doc "^\\(\\*\\*?\\s*|\\s\\*\\)$")
)
(module_type_path (module_type_name) @name.reference.implementation) @reference.implementation
; Functions
;----------
(
(comment)? @doc .
(value_definition
[
(let_binding
pattern: (value_name) @name.definition.function
(parameter))
(let_binding
pattern: (value_name) @name.definition.function
body: [(fun_expression) (function_expression)])
] @definition.function
)
(#strip! @doc "^\\(\\*\\*?\\s*|\\s\\*\\)$")
)
(
(comment)? @doc .
(external (value_name) @name.definition.function) @definition.function
(#strip! @doc "^\\(\\*\\*?\\s*|\\s\\*\\)$")
)
(application_expression
function: (value_path (value_name) @name.reference.call)) @reference.call
(infix_expression
left: (value_path (value_name) @name.reference.call)
(infix_operator) @reference.call
(#eq? @reference.call "@@"))
(infix_expression
(infix_operator) @reference.call
right: (value_path (value_name) @name.reference.call)
(#eq? @reference.call "|>"))
; Operator
;---------
(
(comment)? @doc .
(value_definition
(let_binding
pattern: (parenthesized_operator [
(prefix_operator)
(infix_operator)
(hash_operator)
(indexing_operator)
(let_operator)
(and_operator)
(match_operator)
] @name.definition.function)) @definition.function)
(#strip! @doc "^\\(\\*\\*?\\s*|\\s\\*\\)$")
)
[
(prefix_operator)
(sign_operator)
(infix_operator)
(hash_operator)
(indexing_operator)
(let_operator)
(and_operator)
(match_operator)
] @name.reference.call @reference.call
; Classes
;--------
(
(comment)? @doc .
[
(class_definition (class_binding (class_name) @name.definition.class) @definition.class)
(class_type_definition (class_type_binding (class_type_name) @name.definition.class) @definition.class)
]
(#strip! @doc "^\\(\\*\\*?\\s*|\\s\\*\\)$")
)
[
(class_path (class_name) @name.reference.class)
(class_type_path (class_type_name) @name.reference.class)
] @reference.class
; Methods
;--------
(
(comment)? @doc .
(method_definition (method_name) @name.definition.method) @definition.method
(#strip! @doc "^\\(\\*\\*?\\s*|\\s\\*\\)$")
)
(method_invocation (method_name) @name.reference.call) @reference.call

View file

@ -0,0 +1,26 @@
; Tag query for PHP.

; Class definitions
(class_declaration
  name: (name) @name.definition.class) @definition.class

; Free functions and methods are both tagged as function definitions
(function_definition
  name: (name) @name.definition.function) @definition.function
(method_declaration
  name: (name) @name.definition.function) @definition.function

; Object creation: class named by a qualified name or through a variable
(object_creation_expression
  [
    (qualified_name (name) @name.reference.class)
    (variable_name (name) @name.reference.class)
  ]) @reference.class

; Function calls. The name capture sits on the inner (name) node in both
; alternatives, matching the object_creation_expression pattern above;
; previously the second alternative captured the whole variable_name node.
(function_call_expression
  function: [
    (qualified_name (name) @name.reference.call)
    (variable_name (name) @name.reference.call)
  ]) @reference.call

; Scoped calls (scoped_call_expression) and member calls (member_call_expression)
(scoped_call_expression
  name: (name) @name.reference.call) @reference.call
(member_call_expression
  name: (name) @name.reference.call) @reference.call

View file

@ -0,0 +1,12 @@
; Tag query for Python.

; Class definitions: the class name identifier is the tag name.
(class_definition
name: (identifier) @name.definition.class) @definition.class
; Function definitions: the function name identifier is the tag name.
(function_definition
name: (identifier) @name.definition.function) @definition.function
; Call references: either a bare identifier being called, or the attribute
; name of an attribute access being called (the `f` in `obj.f(...)`).
(call
function: [
(identifier) @name.reference.call
(attribute
attribute: (identifier) @name.reference.call)
]) @reference.call

View file

@ -0,0 +1,26 @@
(classlessPredicate
name: (predicateName) @name.definition.function) @definition.function
(memberPredicate
name: (predicateName) @name.definition.method) @definition.method
(aritylessPredicateExpr
name: (literalId) @name.reference.call) @reference.call
(module
name: (moduleName) @name.definition.module) @definition.module
(dataclass
name: (className) @name.definition.class) @definition.class
(datatype
name: (className) @name.definition.class) @definition.class
(datatypeBranch
name: (className) @name.definition.class) @definition.class
(qualifiedRhs
name: (predicateName) @name.reference.call) @reference.call
(typeExpr
name: (className) @name.reference.type) @reference.type

View file

View file

@ -0,0 +1,64 @@
; Method definitions
(
(comment)* @doc
.
[
(method
name: (_) @name.definition.method) @definition.method
(singleton_method
name: (_) @name.definition.method) @definition.method
]
(#strip! @doc "^#\\s*")
(#select-adjacent! @doc @definition.method)
)
(alias
name: (_) @name.definition.method) @definition.method
(setter
(identifier) @ignore)
; Class definitions
(
(comment)* @doc
.
[
(class
name: [
(constant) @name.definition.class
(scope_resolution
name: (_) @name.definition.class)
]) @definition.class
(singleton_class
value: [
(constant) @name.definition.class
(scope_resolution
name: (_) @name.definition.class)
]) @definition.class
]
(#strip! @doc "^#\\s*")
(#select-adjacent! @doc @definition.class)
)
; Module definitions
(
(module
name: [
(constant) @name.definition.module
(scope_resolution
name: (_) @name.definition.module)
]) @definition.module
)
; Calls
(call method: (identifier) @name.reference.call) @reference.call
(
[(identifier) (constant)] @name.reference.call @reference.call
(#is-not? local)
(#not-match? @name.reference.call "^(lambda|load|require|require_relative|__FILE__|__LINE__)$")
)

View file

@ -0,0 +1,60 @@
; ADT definitions
(struct_item
name: (type_identifier) @name.definition.class) @definition.class
(enum_item
name: (type_identifier) @name.definition.class) @definition.class
(union_item
name: (type_identifier) @name.definition.class) @definition.class
; type aliases
(type_item
name: (type_identifier) @name.definition.class) @definition.class
; method definitions
(declaration_list
(function_item
name: (identifier) @name.definition.method)) @definition.method
; function definitions
(function_item
name: (identifier) @name.definition.function) @definition.function
; trait definitions
(trait_item
name: (type_identifier) @name.definition.interface) @definition.interface
; module definitions
(mod_item
name: (identifier) @name.definition.module) @definition.module
; macro definitions
(macro_definition
name: (identifier) @name.definition.macro) @definition.macro
; references
(call_expression
function: (identifier) @name.reference.call) @reference.call
(call_expression
function: (field_expression
field: (field_identifier) @name.reference.call)) @reference.call
(macro_invocation
macro: (identifier) @name.reference.call) @reference.call
; implementations
(impl_item
trait: (type_identifier) @name.reference.implementation) @reference.implementation
(impl_item
type: (type_identifier) @name.reference.implementation
!trait) @reference.implementation

View file

@ -0,0 +1,23 @@
(function_signature
name: (identifier) @name.definition.function) @definition.function
(method_signature
name: (property_identifier) @name.definition.method) @definition.method
(abstract_method_signature
name: (property_identifier) @name.definition.method) @definition.method
(abstract_class_declaration
name: (type_identifier) @name.definition.class) @definition.class
(module
name: (identifier) @name.definition.module) @definition.module
(interface_declaration
name: (type_identifier) @name.definition.interface) @definition.interface
(type_annotation
(type_identifier) @name.reference.type) @reference.type
(new_expression
constructor: (identifier) @name.reference.class) @reference.class

View file

@ -1,79 +1,31 @@
import colorsys
import json
import os
import random
import subprocess
import sys
import tempfile
from collections import Counter, defaultdict
from collections import Counter, defaultdict, namedtuple
from pathlib import Path
import networkx as nx
import pkg_resources
from diskcache import Cache
from grep_ast import TreeContext, filename_to_lang
from pygments.lexers import guess_lexer_for_filename
from pygments.token import Token
from pygments.util import ClassNotFound
from tqdm import tqdm
from tree_sitter_languages import get_language, get_parser
from aider import models
from .dump import dump # noqa: F402
def to_tree(tags):
if not tags:
return ""
tags = sorted(tags)
output = ""
last = [None] * len(tags[0])
tab = "\t"
for tag in tags:
tag = list(tag)
for i in range(len(last) + 1):
if i == len(last):
break
if last[i] != tag[i]:
break
num_common = i
indent = tab * num_common
rest = tag[num_common:]
for item in rest:
output += indent + item + "\n"
indent += tab
last = tag
return output
def fname_to_components(fname, with_colon):
path_components = fname.split(os.sep)
res = [pc + os.sep for pc in path_components[:-1]]
if with_colon:
res.append(path_components[-1] + ":")
else:
res.append(path_components[-1])
return res
Tag = namedtuple("Tag", "rel_fname fname line name kind".split())
class RepoMap:
CACHE_VERSION = 1
ctags_cmd = [
"ctags",
"--fields=+S",
"--extras=-F",
"--output-format=json",
"--output-encoding=utf-8",
]
IDENT_CACHE_DIR = f".aider.ident.cache.v{CACHE_VERSION}"
CACHE_VERSION = 3
TAGS_CACHE_DIR = f".aider.tags.cache.v{CACHE_VERSION}"
ctags_disabled_reason = "ctags not initialized"
cache_missing = False
warned_files = set()
@ -94,26 +46,27 @@ class RepoMap:
root = os.getcwd()
self.root = root
self.load_ident_cache()
self.load_tags_cache()
self.max_map_tokens = map_tokens
self.has_ctags = self.check_for_ctags()
if map_tokens > 0 and self.has_ctags:
self.use_ctags = True
else:
self.use_ctags = False
self.tokenizer = main_model.tokenizer
self.repo_content_prefix = repo_content_prefix
def get_repo_map(self, chat_files, other_files):
res = self.choose_files_listing(chat_files, other_files)
if not res:
if self.max_map_tokens <= 0:
return
files_listing, ctags_msg = res
if not other_files:
return
files_listing = self.get_ranked_tags_map(chat_files, other_files)
if not files_listing:
return
num_tokens = self.token_count(files_listing)
if self.verbose:
self.io.tool_output(f"Repo-map: {num_tokens/1024:.1f} k-tokens")
if chat_files:
other = "other "
@ -121,10 +74,7 @@ class RepoMap:
other = ""
if self.repo_content_prefix:
repo_content = self.repo_content_prefix.format(
other=other,
ctags_msg=ctags_msg,
)
repo_content = self.repo_content_prefix.format(other=other)
else:
repo_content = ""
@ -132,39 +82,6 @@ class RepoMap:
return repo_content
def choose_files_listing(self, chat_files, other_files):
if self.max_map_tokens <= 0:
return
if not other_files:
return
if self.use_ctags:
files_listing = self.get_ranked_tags_map(chat_files, other_files)
if files_listing:
num_tokens = self.token_count(files_listing)
if self.verbose:
self.io.tool_output(f"ctags map: {num_tokens/1024:.1f} k-tokens")
ctags_msg = " with selected ctags info"
return files_listing, ctags_msg
files_listing = self.get_simple_files_map(other_files)
ctags_msg = ""
num_tokens = self.token_count(files_listing)
if self.verbose:
self.io.tool_output(f"simple map: {num_tokens/1024:.1f} k-tokens")
if num_tokens < self.max_map_tokens:
return files_listing, ctags_msg
def get_simple_files_map(self, other_files):
fnames = []
for fname in other_files:
fname = self.get_rel_fname(fname)
fname = fname_to_components(fname, False)
fnames.append(fname)
return to_tree(fnames)
def token_count(self, string):
return len(self.tokenizer.encode(string))
@ -175,66 +92,6 @@ class RepoMap:
path = os.path.relpath(path, self.root)
return [path + ":"]
def run_ctags(self, filename):
# Check if the file is in the cache and if the modification time has not changed
file_mtime = self.get_mtime(filename)
if file_mtime is None:
return []
cache_key = filename
if cache_key in self.TAGS_CACHE and self.TAGS_CACHE[cache_key]["mtime"] == file_mtime:
return self.TAGS_CACHE[cache_key]["data"]
cmd = self.ctags_cmd + [
f"--input-encoding={self.io.encoding}",
filename,
]
output = subprocess.check_output(cmd, stderr=subprocess.PIPE).decode("utf-8")
output_lines = output.splitlines()
data = []
for line in output_lines:
try:
data.append(json.loads(line))
except json.decoder.JSONDecodeError as err:
self.io.tool_error(f"Error parsing ctags output: {err}")
self.io.tool_error(repr(line))
# Update the cache
self.TAGS_CACHE[cache_key] = {"mtime": file_mtime, "data": data}
self.save_tags_cache()
return data
def check_for_ctags(self):
try:
executable = self.ctags_cmd[0]
cmd = [executable, "--version"]
output = subprocess.check_output(cmd, stderr=subprocess.PIPE).decode("utf-8")
output = output.lower()
cmd = " ".join(cmd)
if "universal ctags" not in output:
self.ctags_disabled_reason = f"{cmd} does not claim to be universal ctags"
return
if "+json" not in output:
self.ctags_disabled_reason = f"{cmd} does not list +json support"
return
with tempfile.TemporaryDirectory() as tempdir:
hello_py = os.path.join(tempdir, "hello.py")
with open(hello_py, "w", encoding="utf-8") as f:
f.write("def hello():\n print('Hello, world!')\n")
self.run_ctags(hello_py)
except FileNotFoundError:
self.ctags_disabled_reason = f"{executable} executable not found"
return
except Exception as err:
self.ctags_disabled_reason = f"error running universal-ctags: {err}"
return
return True
def load_tags_cache(self):
path = Path(self.root) / self.TAGS_CACHE_DIR
if not path.exists():
@ -244,52 +101,103 @@ class RepoMap:
def save_tags_cache(self):
pass
def load_ident_cache(self):
path = Path(self.root) / self.IDENT_CACHE_DIR
if not path.exists():
self.cache_missing = True
self.IDENT_CACHE = Cache(path)
def save_ident_cache(self):
pass
def get_mtime(self, fname):
try:
return os.path.getmtime(fname)
except FileNotFoundError:
self.io.tool_error(f"File not found error: {fname}")
def get_name_identifiers(self, fname, uniq=True):
def get_tags(self, fname, rel_fname):
# Check if the file is in the cache and if the modification time has not changed
file_mtime = self.get_mtime(fname)
if file_mtime is None:
return set()
return []
cache_key = fname
if cache_key in self.IDENT_CACHE and self.IDENT_CACHE[cache_key]["mtime"] == file_mtime:
idents = self.IDENT_CACHE[cache_key]["data"]
else:
idents = self.get_name_identifiers_uncached(fname)
self.IDENT_CACHE[cache_key] = {"mtime": file_mtime, "data": idents}
self.save_ident_cache()
if cache_key in self.TAGS_CACHE and self.TAGS_CACHE[cache_key]["mtime"] == file_mtime:
return self.TAGS_CACHE[cache_key]["data"]
if uniq:
idents = set(idents)
return idents
# miss!
def get_name_identifiers_uncached(self, fname):
content = self.io.read_text(fname)
if content is None:
return list()
data = list(self.get_tags_raw(fname, rel_fname))
# Update the cache
self.TAGS_CACHE[cache_key] = {"mtime": file_mtime, "data": data}
self.save_tags_cache()
return data
def get_tags_raw(self, fname, rel_fname):
lang = filename_to_lang(fname)
if not lang:
return
language = get_language(lang)
parser = get_parser(lang)
# Load the tags queries
scm_fname = pkg_resources.resource_filename(
__name__, os.path.join("queries", f"tree-sitter-{lang}-tags.scm")
)
query_scm = Path(scm_fname)
if not query_scm.exists():
return
query_scm = query_scm.read_text()
code = Path(fname).read_text(encoding=self.io.encoding)
tree = parser.parse(bytes(code, "utf-8"))
# Run the tags queries
query = language.query(query_scm)
captures = query.captures(tree.root_node)
captures = list(captures)
saw = set()
for node, tag in captures:
if tag.startswith("name.definition."):
kind = "def"
elif tag.startswith("name.reference."):
kind = "ref"
else:
continue
saw.add(kind)
result = Tag(
rel_fname=rel_fname,
fname=fname,
name=node.text.decode("utf-8"),
kind=kind,
line=node.start_point[0],
)
yield result
if "ref" in saw:
return
if "def" not in saw:
return
# We saw defs, without any refs
# Some tags files only provide defs (cpp, for example)
# Use pygments to backfill refs
try:
lexer = guess_lexer_for_filename(fname, content)
lexer = guess_lexer_for_filename(fname, code)
except ClassNotFound:
return list()
return
# lexer.get_tokens_unprocessed() returns (char position in file, token type, token string)
tokens = list(lexer.get_tokens_unprocessed(content))
res = [token[2] for token in tokens if token[1] in Token.Name]
return res
tokens = list(lexer.get_tokens(code))
tokens = [token[1] for token in tokens if token[0] in Token.Name]
for token in tokens:
yield Tag(
rel_fname=rel_fname,
fname=fname,
name=token,
kind="ref",
line=-1,
)
def get_ranked_tags(self, chat_fnames, other_fnames):
defines = defaultdict(set)
@ -327,34 +235,25 @@ class RepoMap:
personalization[rel_fname] = 1.0
chat_rel_fnames.add(rel_fname)
data = self.run_ctags(fname)
tags = list(self.get_tags(fname, rel_fname))
if tags is None:
continue
for tag in data:
ident = tag["name"]
defines[ident].add(rel_fname)
for tag in tags:
if tag.kind == "def":
defines[tag.name].add(rel_fname)
key = (rel_fname, tag.name)
definitions[key].add(tag)
scope = tag.get("scope")
kind = tag.get("kind")
name = tag.get("name")
signature = tag.get("signature")
if tag.kind == "ref":
references[tag.name].append(rel_fname)
last = name
if signature:
last += " " + signature
##
# dump(defines)
# dump(references)
res = [rel_fname]
if scope:
res.append(scope)
res += [kind, last]
key = (rel_fname, ident)
definitions[key].add(tuple(res))
# definitions[key].add((rel_fname,))
idents = self.get_name_identifiers(fname, uniq=False)
for ident in idents:
# dump("ref", fname, ident)
references[ident].append(rel_fname)
if not references:
references = dict((k, list(v)) for k, v in defines.items())
idents = set(defines.keys()).intersection(set(references.keys()))
@ -364,10 +263,13 @@ class RepoMap:
definers = defines[ident]
for referencer, num_refs in Counter(references[ident]).items():
for definer in definers:
if referencer == definer:
continue
# if referencer == definer:
# continue
G.add_edge(referencer, definer, weight=num_refs, ident=ident)
if not references:
pass
if personalization:
pers_args = dict(personalization=personalization, dangling=personalization)
else:
@ -391,6 +293,9 @@ class RepoMap:
ranked_tags = []
ranked_definitions = sorted(ranked_definitions.items(), reverse=True, key=lambda x: x[1])
# dump(ranked_definitions)
for (fname, ident), rank in ranked_definitions:
# print(f"{rank:.03f} {fname} {ident}")
if fname in chat_rel_fnames:
@ -428,9 +333,8 @@ class RepoMap:
while lower_bound <= upper_bound:
middle = (lower_bound + upper_bound) // 2
tree = to_tree(ranked_tags[:middle])
tree = self.to_tree(ranked_tags[:middle])
num_tokens = self.token_count(tree)
# dump(middle, num_tokens)
if num_tokens < self.max_map_tokens:
best_tree = tree
@ -440,17 +344,63 @@ class RepoMap:
return best_tree
def to_tree(self, tags):
if not tags:
return ""
def find_py_files(directory):
tags = sorted(tags)
cur_fname = None
context = None
output = ""
# add a bogus tag at the end so we trip the this_fname != cur_fname...
dummy_tag = (None,)
for tag in tags + [dummy_tag]:
this_fname = tag[0]
# ... here ... to output the final real entry in the list
if this_fname != cur_fname:
if context:
context.add_context()
output += "\n"
output += cur_fname + ":\n"
output += context.format()
context = None
elif cur_fname:
output += "\n" + cur_fname + "\n"
if type(tag) is Tag:
context = TreeContext(
tag.rel_fname,
Path(tag.fname).read_text(self.io.encoding),
color=False,
line_number=False,
child_context=False,
last_line=False,
margin=0,
mark_lois=False,
loi_pad=0,
header_max=3,
show_top_of_file_parent_scope=False,
)
cur_fname = this_fname
if context:
context.add_lines_of_interest([tag.line])
return output
def find_src_files(directory):
if not os.path.isdir(directory):
return [directory]
py_files = []
src_files = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith(".py"):
py_files.append(os.path.join(root, file))
return py_files
src_files.append(os.path.join(root, file))
return src_files
def get_random_color():
@ -465,15 +415,13 @@ if __name__ == "__main__":
chat_fnames = []
other_fnames = []
for dname in sys.argv[1:]:
if ".venv" in dname:
other_fnames += find_py_files(dname)
for fname in sys.argv[1:]:
if Path(fname).is_dir():
chat_fnames += find_src_files(fname)
else:
chat_fnames += find_py_files(dname)
chat_fnames.append(fname)
root = os.path.commonpath(chat_fnames)
rm = RepoMap(root=root)
rm = RepoMap(root=".")
repo_map = rm.get_ranked_tags_map(chat_fnames, other_fnames)
dump(len(repo_map))

BIN
assets/robot-ast.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 671 KiB

View file

@ -3,6 +3,14 @@
![robot flowchart](../assets/robot-flowchart.png)
## Updated
Aider no longer uses ctags to build a repo map.
Please see the newer article about
[using tree-sitter to build a better repo map](https://aider.chat/docs/repomap.html).
-------
GPT-4 is extremely useful for "self-contained" coding tasks,
like generating brand new code or modifying a pure function
that has no dependencies.

View file

@ -20,11 +20,11 @@ Aider is tightly integrated with git, which makes it easy to:
- Manage a series of GPT's changes on a git branch
Aider specifically uses git in these ways:
- It asks to create a git repo if you launch it in a directory without one.
- Whenever GPT edits a file, aider commits those changes with a descriptive commit message. This makes it easy to undo or review GPT's changes.
- Aider takes special care if GPT tries to edit files that already have uncommitted changes (dirty files). Aider will first commit any preexisting changes with a descriptive commit message. This keeps your edits separate from GPT's edits, and makes sure you never lose your work if GPT makes an inappropriate change.
Aider also allows you to use in-chat commands to `/diff` or `/undo` the last change made by GPT.
To do more complex management of your git history, you can use raw `git` commands,
either by using `/git` within the chat, or with standard git tools outside of aider.
@ -58,7 +58,7 @@ They have large context windows, better coding skills and
they generally obey the instructions in the system prompt.
GPT-4 is able to structure code edits as simple "diffs"
and use a
[repository map](https://aider.chat/docs/ctags.html)
[repository map](https://aider.chat/docs/repomap.html)
to improve its ability to make changes in larger codebases.
GPT-3.5 is supported more experimentally

242
docs/repomap.md Normal file
View file

@ -0,0 +1,242 @@
# Building a better repository map with tree-sitter
![robot flowchat](../assets/robot-ast.png)
GPT-4 is extremely useful for "self-contained" coding tasks,
like generating or modifying a simple function
that has no dependencies. Tools like GitHub Copilot serve
these simple coding tasks well.
But it's much more difficult for humans or AIs to make
complex changes in a larger, pre-existing codebase.
To do this successfully, you need to:
1. Find the code that needs to be changed.
2. Understand how that code relates to the rest of the codebase.
3. Make the correct code change to accomplish the task.
GPT-4 is actually great at making the code changes (3),
once you tell it which files need to be changed (1)
and show it how they fit into the rest of the codebase (2).
This article is going to focus on the problem of "code context" (2), where we need to:
- Help GPT understand the overall codebase, so that it
can decipher the meaning of code with complex dependencies and generate
new code that respects and utilizes existing abstractions.
- Convey all of this "code context" to GPT in an
efficient manner that fits within GPT's context window.
To address these issues, aider
sends GPT a **concise map of your whole git repository**
that includes
the most important classes and functions along with their types and call signatures.
This **repository map** is now built automatically by using
[tree-sitter](https://tree-sitter.github.io/tree-sitter/)
to extract symbol definitions from source files.
Tree-sitter is used by many IDEs, editors and LSP servers to
help humans search and navigate large codebases.
Aider now uses it to help GPT better comprehend, navigate
and edit code in larger repos.
*To code with GPT-4 using the techniques discussed here, just install [aider](https://aider.chat/docs/install.html).*
## The problem: code context
GPT-4 is great at "self contained" coding tasks, like writing or
modifying a pure function with no external dependencies.
GPT can easily handle requests like "write a
Fibonacci function" or "rewrite this loop using list
comprehensions", because they require no context beyond the code
being discussed.
Most real code is not pure and self-contained; it is intertwined with
and depends on code from many different files in a repo.
If you ask GPT to "switch all the print statements in class Foo to
use the BarLog logging system", it needs to see and
modify the code in the Foo class, but it also needs to understand
how to use
the project's BarLog
subsystem.
A simple solution is to **send the entire codebase** to GPT along with
each change request. Now GPT has all the context! But this won't work
for even moderately
sized repos, because they won't fit into the context window.
A better approach is to be selective,
and **hand pick which files to send**.
For the example above, you could send the file that
contains the Foo class
and the file that contains the BarLog logging subsystem.
This works pretty well, and is supported by aider -- you
can manually specify which files to "add to the chat" you are having with GPT.
But sending whole files is a bulky way to send code context,
wasting the precious context window.
GPT doesn't need to see the entire implementation of BarLog;
it just needs to understand it well enough to use it.
You may quickly run out of context window by sending
full files of code
just to convey context.
Aider also strives to reduce the manual work involved in
coding with AI, so it would be better if we could automatically
provide the needed code context.
## Using a repo map to provide context
Aider sends a **repo map** to GPT along with
each request from the user to make a code change.
The map contains a list of the files in the
repo, along with the key symbols which are defined in each file.
It shows how each of these symbols is defined in the
source code, by including the critical lines of code for each definition.
Here's a
sample of the map of the aider repo, just showing the maps of
[io.py](https://github.com/paul-gauthier/aider/blob/main/aider/io.py)
and
[main.py](https://github.com/paul-gauthier/aider/blob/main/aider/main.py)
:
```
aider/io.py:
⋮...
│class InputOutput:
⋮...
│ def read_text(self, filename):
⋮...
│ def write_text(self, filename, content):
⋮...
│ def confirm_ask(self, question, default="y"):
⋮...
│ def tool_error(self, message):
⋮...
│ def tool_output(self, *messages, log_only=False):
⋮...
aider/main.py:
⋮...
│def main(argv=None, input=None, output=None, force_git_root=None):
⋮...
```
Mapping out the repo like this provides some key benefits:
- GPT can see classes, methods and function signatures from everywhere in the repo. This alone may give it enough context to solve many tasks. For example, it can probably figure out how to use the API exported from a module just based on the details shown in the map.
- If it needs to see more code, GPT can use the map to figure out by itself which files it needs to look at in more detail. GPT will then ask to see these specific files, and aider will automatically add them to the chat context.
Of course, for large repositories even just the repo map might be too large
for GPT's context window.
Aider solves this problem by sending just the **most relevant**
portions of the repo map.
It does this by analyzing the full repo map using
a graph ranking algorithm, computed on a graph
where each source file is a node and edges connect
files which have dependencies.
Aider optimizes the repo map by
selecting the most important parts of the codebase
which will
fit into the token budget assigned by the user
(via the `--map-tokens` switch, which defaults to 1k tokens).
The sample map shown above doesn't contain *every* class, method and function from those
files.
It only includes the most important identifiers,
the ones which are most often referenced by other portions of the code.
These are the key pieces of context that GPT needs to know to understand
the overall codebase.
## Using tree-sitter to make the map
Under the hood, aider uses
[tree-sitter](https://tree-sitter.github.io/tree-sitter/)
to build the
map.
It specifically uses the
[py-tree-sitter-languages](https://github.com/grantjenks/py-tree-sitter-languages)
python module,
which provides simple, pip-installable binary wheels for
[most popular programming languages](https://github.com/paul-gauthier/grep-ast/blob/main/grep_ast/parsers.py).
Tree-sitter parses source code into an Abstract Syntax Tree (AST) based
on the syntax of the programming language.
Using the AST, we can identify where functions, classes, variables, types and
other definitions occur in the source code.
We can also identify where else in the code these things are used or referenced.
Aider uses all of these definitions and references to
determine which are the most important identifiers in the repository,
and to produce the repo map that shows just those key
lines from the codebase.
## What about ctags?
The tree-sitter repository map replaces the
[ctags based map](https://aider.chat/docs/ctags.html)
that aider originally used.
Switching from ctags to tree-sitter provides a bunch of benefits:
- The map is richer, showing full function call signatures and other details straight from the source files.
- Thanks to `py-tree-sitter-languages`, we get full support for many programming languages via a python package that's automatically installed as part of the normal `pip install aider-chat`.
- We remove the requirement for users to manually install `universal-ctags` via some external tool or package manager (brew, apt, choco, etc).
- Tree-sitter integration is a key enabler for future work and capabilities for aider.
## Future work
You'll recall that we identified the 3 key steps
required to use GPT
to code within a large, pre-existing codebase:
1. Find the code that needs to be changed.
2. Understand how that code relates to the rest of the codebase.
3. Make the correct code change to accomplish the task.
We're now using tree-sitter to help solve the code context problem (2),
but it's also an important foundation
for future work on automatically finding all the code which
will need to be changed (1).
Right now, aider relies on the user to specify which source files
will need to be modified to complete their request.
Users manually "add files to the chat" using aider's `/add` command,
which makes those files available for GPT to modify.
This works well, but a key piece of future work is to harness the
power of GPT and tree-sitter to automatically identify
which parts of the code will need changes.
## Try it out
To code with GPT-4 using the techniques discussed here,
just install [aider](https://aider.chat/docs/install.html).
## Credits
Aider uses
[modified versions of the tags.scm files](https://github.com/paul-gauthier/aider/tree/main/aider/queries)
from these
open source tree-sitter language implementations:
* https://github.com/tree-sitter/tree-sitter-c — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-c-sharp — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-cpp — licensed under the MIT License.
* https://github.com/Wilfred/tree-sitter-elisp — licensed under the MIT License.
* https://github.com/elixir-lang/tree-sitter-elixir — licensed under the Apache License, Version 2.0.
* https://github.com/elm-tooling/tree-sitter-elm — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-go — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-java — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-javascript — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-ocaml — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-php — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-python — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-ql — licensed under the MIT License.
* https://github.com/r-lib/tree-sitter-r — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-ruby — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-rust — licensed under the MIT License.
* https://github.com/tree-sitter/tree-sitter-typescript — licensed under the MIT License.

View file

@ -34,3 +34,4 @@ jsonschema==4.17.3
sounddevice==0.4.6
soundfile==0.12.1
pathspec==0.11.2
grep-ast==0.1.1

View file

@ -17,6 +17,9 @@ setup(
version=__version__,
packages=find_packages(),
include_package_data=True,
package_data={
"aider": ["queries/*"],
},
install_requires=requirements,
python_requires=">=3.9",
entry_points={

View file

@ -10,7 +10,7 @@ from aider import models
from aider.coders import Coder
from aider.dump import dump # noqa: F401
from aider.io import InputOutput
from tests.utils import GitTemporaryDirectory
from tests.utils import ChdirTemporaryDirectory, GitTemporaryDirectory
class TestCoder(unittest.TestCase):
@ -354,20 +354,21 @@ class TestCoder(unittest.TestCase):
@patch("aider.coders.base_coder.openai.ChatCompletion.create")
def test_run_with_invalid_request_error(self, mock_chat_completion_create):
# Mock the IO object
mock_io = MagicMock()
with ChdirTemporaryDirectory():
# Mock the IO object
mock_io = MagicMock()
# Initialize the Coder object with the mocked IO and mocked repo
coder = Coder.create(models.GPT4, None, mock_io)
# Initialize the Coder object with the mocked IO and mocked repo
coder = Coder.create(models.GPT4, None, mock_io)
# Set up the mock to raise InvalidRequestError
mock_chat_completion_create.side_effect = openai.error.InvalidRequestError(
"Invalid request", "param"
)
# Set up the mock to raise InvalidRequestError
mock_chat_completion_create.side_effect = openai.error.InvalidRequestError(
"Invalid request", "param"
)
# Call the run method and assert that InvalidRequestError is raised
with self.assertRaises(openai.error.InvalidRequestError):
coder.run(with_message="hi")
# Call the run method and assert that InvalidRequestError is raised
with self.assertRaises(openai.error.InvalidRequestError):
coder.run(with_message="hi")
def test_new_file_edit_one_commit(self):
"""A new file shouldn't get pre-committed before the GPT edit commit"""

View file

@ -1,7 +1,7 @@
import os
import unittest
from unittest.mock import patch
from aider.dump import dump # noqa: F401
from aider.io import InputOutput
from aider.repomap import RepoMap
from tests.utils import IgnorantTemporaryDirectory
@ -89,33 +89,9 @@ print(my_function(3, 4))
# close the open cache files, so Windows won't error
del repo_map
def test_check_for_ctags_failure(self):
with patch("subprocess.run") as mock_run:
mock_run.side_effect = Exception("ctags not found")
repo_map = RepoMap(io=InputOutput())
self.assertFalse(repo_map.has_ctags)
def test_check_for_ctags_success(self):
with patch("subprocess.check_output") as mock_run:
mock_run.side_effect = [
(
b"Universal Ctags 0.0.0(f25b4bb7)\n Optional compiled features: +wildcards,"
b" +regex, +gnulib_fnmatch, +gnulib_regex, +iconv, +option-directory, +xpath,"
b" +json, +interactive, +yaml, +case-insensitive-filenames, +packcc,"
b" +optscript, +pcre2"
),
(
b'{"_type": "tag", "name": "status", "path": "aider/main.py", "pattern": "/^ '
b' status = main()$/", "kind": "variable"}'
),
]
repo_map = RepoMap(io=InputOutput())
self.assertTrue(repo_map.has_ctags)
def test_get_repo_map_without_ctags(self):
# Create a temporary directory with a sample Python file containing identifiers
def test_get_repo_map_all_files(self):
test_files = [
"test_file_without_ctags.py",
"test_file0.py",
"test_file1.txt",
"test_file2.md",
"test_file3.json",
@ -130,10 +106,11 @@ print(my_function(3, 4))
f.write("")
repo_map = RepoMap(root=temp_dir, io=InputOutput())
repo_map.has_ctags = False # force it off
other_files = [os.path.join(temp_dir, file) for file in test_files]
result = repo_map.get_repo_map([], other_files)
dump(other_files)
dump(repr(result))
# Check if the result contains each specific file in the expected tags map without ctags
for file in test_files: