Estimate tokenization to speed up repo map

Paul Gauthier 2024-08-05 17:09:27 -03:00
parent de26f8e5e0
commit 75c3c40354
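
Instead of running the model's real tokenizer over every candidate repo map, RepoMap now tokenizes a random sample of lines once per map build, caches the resulting tokens-per-character ratio, and estimates later counts by scaling text length. A minimal standalone sketch of the idea, assuming a count_tokens callable that stands in for the model's exact tokenizer (the names and signature here are illustrative, not aider's API):

    import random

    def estimate_tokens(text, count_tokens, sample_lines=150):
        """Estimate the token count of `text` from a random sample of its lines."""
        lines = text.splitlines(keepends=True)
        if not lines:
            return 0
        sample = "".join(random.sample(lines, min(sample_lines, len(lines))))
        # Ratio of tokens to characters in the sample
        tokens_per_char = count_tokens(sample) / len(sample)
        # Scale the per-character ratio up to the full text
        return int(len(text) * tokens_per_char)

Sampling trades a little accuracy for a large speedup: the exact tokenizer runs once on a bounded sample instead of on every candidate map.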


@@ -29,9 +29,10 @@ def print_elapsed(message):
     current_time = time.time()
     if hasattr(print_elapsed, "last_time"):
         elapsed = current_time - print_elapsed.last_time
-        print(f"{message}: {elapsed:.2f} seconds")
+        print(f"... {elapsed:.2f} seconds")
+        print(message)
     else:
-        print(f"{message}: (first measurement)")
+        print(f"{message}:")
     print_elapsed.last_time = current_time
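
print_elapsed stashes the previous call time as an attribute on the function object itself, so consecutive calls report the time between them. A usage sketch (the call sites are hypothetical, not from this commit):

    print_elapsed("ranking tags")   # first call prints "ranking tags:"
    ranked = rank_tags()            # hypothetical work being timed
    print_elapsed("ranking tags")   # prints "... 0.42 seconds", then the message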
@@ -43,6 +44,8 @@ class RepoMap:
     warned_files = set()

+    tokens_per_char = None
+
     def __init__(
         self,
         map_tokens=1024,
@ -67,9 +70,22 @@ class RepoMap:
self.map_mul_no_files = map_mul_no_files
self.max_context_window = max_context_window
self.token_count = main_model.token_count
self.repo_content_prefix = repo_content_prefix
self.main_model = main_model
def token_count(self, text):
if self.tokens_per_char:
return len(text) / self.tokens_per_char
sample_text = text.splitlines(keepends=True)
sample_text = "".join(random.sample(sample_text, 150))
tokens = self.main_model.token_count(sample_text)
self.tokens_per_char = tokens / len(sample_text)
return len(text) / self.tokens_per_char
return tokens
def get_repo_map(self, chat_files, other_files, mentioned_fnames=None, mentioned_idents=None):
if self.max_map_tokens <= 0:
return
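
Once the ratio is cached, each later token_count call is a constant-time multiplication rather than a tokenizer pass. With made-up numbers: if the 150 sampled lines span 6,000 characters and tokenize to 1,500 tokens, tokens_per_char is 0.25, so a 40,000-character candidate map is estimated at 10,000 tokens.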
@@ -80,6 +96,9 @@ class RepoMap:
         if not mentioned_idents:
             mentioned_idents = set()

+        # reset the estimate
+        self.tokens_per_char = None
+
         max_map_tokens = self.max_map_tokens

         # With no files in the chat, give a bigger view of the entire repo
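
Clearing tokens_per_char at the top of get_repo_map makes each map build re-derive the ratio from the first text it counts, so the estimate tracks the files actually being mapped rather than going stale across calls.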