From 547ae142ba3bf404c49efa0e4ca0615bcac9310b Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Fri, 19 Apr 2024 12:08:35 -0700 Subject: [PATCH] refactor tokenizer --- aider/commands.py | 1 - aider/history.py | 7 +++---- aider/repomap.py | 5 +---- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/aider/commands.py b/aider/commands.py index 232f07d14..44804b9eb 100644 --- a/aider/commands.py +++ b/aider/commands.py @@ -27,7 +27,6 @@ class Commands: voice_language = None self.voice_language = voice_language - self.tokenizer = coder.main_model.tokenizer def cmd_web(self, args): "Use headless selenium to scrape a webpage and add the content to the chat" diff --git a/aider/history.py b/aider/history.py index 3acd368b5..9817a85cb 100644 --- a/aider/history.py +++ b/aider/history.py @@ -1,5 +1,4 @@ import argparse -import json from aider import models, prompts from aider.dump import dump # noqa: F401 @@ -8,7 +7,7 @@ from aider.sendchat import simple_send_with_retries class ChatSummary: def __init__(self, model=None, max_tokens=1024): - self.tokenizer = model.tokenizer + self.token_count = model.token_count self.max_tokens = max_tokens self.model = model @@ -20,7 +19,7 @@ class ChatSummary: def tokenize(self, messages): sized = [] for msg in messages: - tokens = len(self.tokenizer(json.dumps(msg))) + tokens = self.token_count(msg) sized.append((tokens, msg)) return sized @@ -60,7 +59,7 @@ class ChatSummary: summary = self.summarize_all(head) tail_tokens = sum(tokens for tokens, msg in sized[split_index:]) - summary_tokens = len(self.tokenizer(json.dumps(summary))) + summary_tokens = self.token_count(summary) result = summary + tail if summary_tokens + tail_tokens < self.max_tokens: diff --git a/aider/repomap.py b/aider/repomap.py index ea2f8bdc0..32773e874 100644 --- a/aider/repomap.py +++ b/aider/repomap.py @@ -52,7 +52,7 @@ class RepoMap: self.max_map_tokens = map_tokens - self.tokenizer = main_model.tokenizer + self.token_count = main_model.token_count self.repo_content_prefix = repo_content_prefix def get_repo_map(self, chat_files, other_files): @@ -89,9 +89,6 @@ class RepoMap: return repo_content - def token_count(self, string): - return len(self.tokenizer(string)) - def get_rel_fname(self, fname): return os.path.relpath(fname, self.root)