Merge pull request #379 from joshuavial/gpt4-vision

add support for gpt-4-vision
2025-05-25 14:55:00 +00:00 · 2024-01-02 13:37:17 -04:00 · 2024-01-02 13:37:17 -04:00 · 2824acb5c2
commit 2824acb5c2
parent 2241741316 93f32d3855
13 changed files with 197 additions and 37 deletions
--- a/aider/commands.py
+++ b/aider/commands.py
@ -10,6 +10,7 @@ from aider import prompts, voice

 from .dump import dump  # noqa: F401

+from aider.utils import is_image_file, is_gpt4_with_openai_base_url

 class Commands:
    voice = None
@ -138,9 +139,12 @@ class Commands:
        for fname in self.coder.abs_fnames:
            relative_fname = self.coder.get_rel_fname(fname)
            content = self.io.read_text(fname)
-            # approximate
-            content = f"{relative_fname}\n```\n" + content + "```\n"
-            tokens = self.coder.main_model.token_count(content)
+            if is_image_file(relative_fname):
+                tokens = self.coder.main_model.token_count_for_image(fname)
+            else:
+                # approximate
+                content = f"{relative_fname}\n```\n" + content + "```\n"
+                tokens = self.coder.main_model.token_count(content)
            res.append((tokens, f"{relative_fname}", "use /drop to drop from chat"))

        self.io.tool_output("Approximate context window usage, in tokens:")
@ -167,7 +171,12 @@ class Commands:
        self.io.tool_output("=" * (width + cost_width + 1))
        self.io.tool_output(f"${total_cost:5.2f} {fmt(total)} tokens total")

-        limit = self.coder.main_model.max_context_tokens
+        # only switch to image model token count if gpt4 and openai and image in files
+        image_in_chat = False
+        if is_gpt4_with_openai_base_url(self.coder.main_model.name, self.coder.client):
+            image_in_chat = any(is_image_file(relative_fname) for relative_fname in self.coder.get_inchat_relative_files())
+        limit = 128000 if image_in_chat else self.coder.main_model.max_context_tokens
+
        remaining = limit - total
        if remaining > 1024:
            self.io.tool_output(f"{cost_pad}{fmt(remaining)} tokens remaining in context window")
@ -324,6 +333,9 @@ class Commands:
            if abs_file_path in self.coder.abs_fnames:
                self.io.tool_error(f"{matched_file} is already in the chat")
            else:
+                if is_image_file(matched_file) and not is_gpt4_with_openai_base_url(self.coder.main_model.name, self.coder.client):
+                    self.io.tool_error(f"Cannot add image file {matched_file} as the model does not support image files")
+                    continue
                content = self.io.read_text(abs_file_path)
                if content is None:
                    self.io.tool_error(f"Unable to read {matched_file}")