From a30e656304250206d14c41f99d34740e166768b1 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Fri, 14 Jun 2024 07:28:33 -0700
Subject: [PATCH] improved token limit err msgs and docs #678

---
 aider/coders/base_coder.py                   | 70 ++++++++++++++--
 aider/urls.py                                |  1 +
 website/docs/troubleshooting/token-limits.md | 86 ++++++++++++++++++++
 3 files changed, 150 insertions(+), 7 deletions(-)
 create mode 100644 website/docs/troubleshooting/token-limits.md

diff --git a/aider/coders/base_coder.py b/aider/coders/base_coder.py
index 16dc55c78..b704841fa 100755
--- a/aider/coders/base_coder.py
+++ b/aider/coders/base_coder.py
@@ -795,8 +795,11 @@ class Coder:
         except ExhaustedContextWindow:
             exhausted = True
         except litellm.exceptions.BadRequestError as err:
-            self.io.tool_error(f"BadRequestError: {err}")
-            return
+            if "ContextWindowExceededError" in err.message:
+                exhausted = True
+            else:
+                self.io.tool_error(f"BadRequestError: {err}")
+                return
         except openai.BadRequestError as err:
             if "maximum context length" in str(err):
                 exhausted = True
@@ -804,12 +807,8 @@ class Coder:
                 raise err

         if exhausted:
+            self.show_exhausted_error()
             self.num_exhausted_context_windows += 1
-            self.io.tool_error("The chat session is larger than the context window!\n")
-            self.commands.cmd_tokens("")
-            self.io.tool_error("\nTo reduce token usage:")
-            self.io.tool_error(" - Use /drop to remove unneeded files from the chat session.")
-            self.io.tool_error(" - Use /clear to clear chat history.")
             return

         if self.partial_response_function_call:
@@ -878,6 +877,63 @@ class Coder:
         else:
             self.reflected_message = add_rel_files_message

+    def show_exhausted_error(self):
+        output_tokens = 0
+        if self.partial_response_content:
+            output_tokens = self.main_model.token_count(self.partial_response_content)
+        max_output_tokens = self.main_model.info.get("max_output_tokens", 0)
+
+        input_tokens = self.main_model.token_count(self.format_messages())
+        max_input_tokens = self.main_model.info.get("max_input_tokens", 0)
+
+        total_tokens = input_tokens + output_tokens
+
+        if max_output_tokens and output_tokens >= max_output_tokens:
+            out_err = " -- exceeded output limit!"
+        else:
+            out_err = ""
+
+        if max_input_tokens and input_tokens >= max_input_tokens:
+            inp_err = " -- context window exhausted!"
+        else:
+            inp_err = ""
+
+        if max_input_tokens and total_tokens >= max_input_tokens:
+            tot_err = " -- context window exhausted!"
+        else:
+            tot_err = ""
+
+        res = ["", ""]
+        res.append(f"Model {self.main_model.name} has hit a token limit!")
+        res.append("")
+        res.append(f"Input tokens: {input_tokens} of {max_input_tokens}{inp_err}")
+        res.append(f"Output tokens: {output_tokens} of {max_output_tokens}{out_err}")
+        res.append(f"Total tokens: {total_tokens} of {max_input_tokens}{tot_err}")
+
+        if max_output_tokens and output_tokens >= max_output_tokens:
+            res.append("")
+            res.append("To reduce output tokens:")
+            res.append("- Ask for smaller changes in each request.")
+            res.append("- Break your code into smaller source files.")
+            if "diff" not in self.main_model.edit_format:
+                res.append(
+                    "- Try using a stronger model like gpt-4o or opus that can return diffs."
+                )
+
+        if max_input_tokens and (input_tokens >= max_input_tokens or total_tokens >= max_input_tokens):
+            res.append("")
+            res.append("To reduce input tokens:")
+            res.append("- Use /tokens to see token usage.")
+            res.append("- Use /drop to remove unneeded files from the chat session.")
+            res.append("- Use /clear to clear the chat history.")
+            res.append("- Break your code into smaller source files.")
+
+        res.append("")
+        res.append(f"For more info: {urls.token_limits}")
+
+        res = "".join([line + "\n" for line in res])
+        self.io.tool_error(res)
+
     def lint_edited(self, fnames):
         res = ""
         for fname in fnames:
diff --git a/aider/urls.py b/aider/urls.py
index 62f95503d..5e2c7761e 100644
--- a/aider/urls.py
+++ b/aider/urls.py
@@ -5,3 +5,4 @@ git = "https://aider.chat/docs/git.html"
 enable_playwrite = "https://aider.chat/docs/install/optional.html#enable-playwright"
 favicon = "https://aider.chat/assets/icons/favicon-32x32.png"
 model_warnings = "https://aider.chat/docs/llms/warnings.html"
+token_limits = "https://aider.chat/docs/troubleshooting/token-limits.html"
diff --git a/website/docs/troubleshooting/token-limits.md b/website/docs/troubleshooting/token-limits.md
new file mode 100644
index 000000000..ec4bb632e
--- /dev/null
+++ b/website/docs/troubleshooting/token-limits.md
@@ -0,0 +1,86 @@
+---
+parent: Troubleshooting
+nav_order: 25
+---
+
+# Token limits
+
+Every LLM has limits on how many tokens it can process:
+
+- The model's **context window** limits how many tokens of
+*input and output* it can process.
+- Each model has a limit on how many **output tokens** it can
+produce.
+
+Aider will report an error if a model responds indicating that
+it has exceeded a token limit.
+The error will include suggested actions to help
+avoid hitting token limits.
+Here's an example error:
+
+```
+Model gpt-3.5-turbo has hit a token limit!
+
+Input tokens: 768 of 16385
+Output tokens: 4096 of 4096 -- exceeded output limit!
+Total tokens: 4864 of 16385
+
+To reduce output tokens:
+- Ask for smaller changes in each request.
+- Break your code into smaller source files.
+- Try using a stronger model like gpt-4o or opus that can return diffs.
+
+For more info: https://aider.chat/docs/troubleshooting/token-limits.html
+```
+
+## Input tokens & context window size
+
+The most common problem is trying to send too much data to a
+model,
+overflowing its context window.
+Technically you can exhaust the context window if the input is
+too large or if the combined input and output are too large.
+
+Strong models like GPT-4o and Opus have quite
+large context windows, so this sort of error is
+typically only an issue when working with weaker models.
+
+The easiest solution is to reduce the input tokens
+by removing files from the chat.
+It's best to only add the files that aider will need to *edit*
+to complete your request.
+
+- Use `/tokens` to see token usage.
+- Use `/drop` to remove unneeded files from the chat session.
+- Use `/clear` to clear the chat history.
+- Break your code into smaller source files.
+
+## Output token limits
+
+Most models have quite small output limits, often as low
+as 4k tokens.
+If you ask aider to make a large change that affects a lot
+of code, the LLM may hit output token limits
+as it tries to send back all the changes.
+
+To avoid hitting output token limits:
+
+- Ask for smaller changes in each request.
+- Break your code into smaller source files.
+- Try using a stronger model like gpt-4o or opus that can return diffs.
+
+
+## Other causes
+
+Sometimes token limit errors are caused by
+non-compliant API proxy servers
+or bugs in the API server you are using to host a local model.
+Aider has been well tested when directly connecting to
+major
+[LLM provider cloud APIs](https://aider.chat/docs/llms.html).
+For serving local models,
+[Ollama](https://aider.chat/docs/llms/ollama.html) is known to work well with aider.
+
+Try using aider without an API proxy server,
+or directly with one of the recommended cloud APIs,
+and see if your token limit problems resolve.
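
A note on the first `base_coder.py` hunk: litellm's `BadRequestError` can signal several unrelated failures, so the patch decides whether the chat is recoverable by looking for the `ContextWindowExceededError` name inside the exception's message. Below is a minimal, dependency-free sketch of that classification; `ProviderError` is a hypothetical stand-in for `litellm.exceptions.BadRequestError`, not aider's actual class.

```python
# Sketch of the error classification added in the first hunk.
# ProviderError is a hypothetical stand-in for litellm.exceptions.BadRequestError.
class ProviderError(Exception):
    def __init__(self, message: str):
        super().__init__(message)
        self.message = message


def is_context_window_error(err: ProviderError) -> bool:
    # The patch matches on the error name embedded in the message text,
    # not on the exception type, so wrapped provider errors are caught too.
    return "ContextWindowExceededError" in err.message


assert is_context_window_error(ProviderError("litellm.ContextWindowExceededError: ..."))
assert not is_context_window_error(ProviderError("invalid request"))
```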
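
For the reporting side, here is a hedged, self-contained sketch of the message that `show_exhausted_error()` assembles. The `model_info` dict and the hard-coded token counts are stand-ins for aider's `main_model.info` and `main_model.token_count()`; the numbers reproduce the gpt-3.5-turbo example from the new docs page.

```python
# Minimal sketch of the report built by show_exhausted_error().
# model_info stands in for aider's main_model.info metadata dict,
# and the token counts below stand in for main_model.token_count().
def token_limit_report(input_tokens: int, output_tokens: int, model_info: dict) -> str:
    max_input = model_info.get("max_input_tokens", 0)
    max_output = model_info.get("max_output_tokens", 0)
    total = input_tokens + output_tokens

    # Guard on each limit so unknown metadata (default 0) never flags an error.
    inp_err = " -- context window exhausted!" if max_input and input_tokens >= max_input else ""
    out_err = " -- exceeded output limit!" if max_output and output_tokens >= max_output else ""
    tot_err = " -- context window exhausted!" if max_input and total >= max_input else ""

    return "\n".join([
        f"Model {model_info.get('name', 'unknown')} has hit a token limit!",
        "",
        f"Input tokens: {input_tokens} of {max_input}{inp_err}",
        f"Output tokens: {output_tokens} of {max_output}{out_err}",
        f"Total tokens: {total} of {max_input}{tot_err}",
    ])


# Reproduces the docs example: a 16385-token context window and a 4096-token output cap.
info = {"name": "gpt-3.5-turbo", "max_input_tokens": 16385, "max_output_tokens": 4096}
print(token_limit_report(768, 4096, info))
```

Running the snippet prints the same report shown in the sample error in `token-limits.md` above.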