improved token limit err msgs and docs #678

This commit is contained in:
Paul Gauthier 2024-06-14 07:28:33 -07:00
parent 0fc6b9beaa
commit a30e656304
3 changed files with 150 additions and 7 deletions

View file

@@ -795,8 +795,11 @@ class Coder:
         except ExhaustedContextWindow:
             exhausted = True
         except litellm.exceptions.BadRequestError as err:
-            self.io.tool_error(f"BadRequestError: {err}")
-            return
+            if "ContextWindowExceededError" in err.message:
+                exhausted = True
+            else:
+                self.io.tool_error(f"BadRequestError: {err}")
+                return
         except openai.BadRequestError as err:
             if "maximum context length" in str(err):
                 exhausted = True
@@ -804,12 +807,8 @@ class Coder:
                 raise err

         if exhausted:
+            self.show_exhausted_error()
             self.num_exhausted_context_windows += 1
-            self.io.tool_error("The chat session is larger than the context window!\n")
-            self.commands.cmd_tokens("")
-            self.io.tool_error("\nTo reduce token usage:")
-            self.io.tool_error(" - Use /drop to remove unneeded files from the chat session.")
-            self.io.tool_error(" - Use /clear to clear chat history.")
             return

         if self.partial_response_function_call:
@@ -878,6 +877,63 @@ class Coder:
             else:
                 self.reflected_message = add_rel_files_message

+    def show_exhausted_error(self):
+        output_tokens = 0
+        if self.partial_response_content:
+            output_tokens = self.main_model.token_count(self.partial_response_content)
+        max_output_tokens = self.main_model.info.get("max_output_tokens", 0)
+
+        input_tokens = self.main_model.token_count(self.format_messages())
+        max_input_tokens = self.main_model.info.get("max_input_tokens", 0)
+
+        total_tokens = input_tokens + output_tokens
+
+        if output_tokens >= max_output_tokens:
+            out_err = " -- exceeded output limit!"
+        else:
+            out_err = ""
+
+        if input_tokens >= max_input_tokens:
+            inp_err = " -- context window exhausted!"
+        else:
+            inp_err = ""
+
+        if total_tokens >= max_input_tokens:
+            tot_err = " -- context window exhausted!"
+        else:
+            tot_err = ""
+
+        res = ["", ""]
+        res.append(f"Model {self.main_model.name} has hit a token limit!")
+        res.append("")
+        res.append(f"Input tokens: {input_tokens} of {max_input_tokens}{inp_err}")
+        res.append(f"Output tokens: {output_tokens} of {max_output_tokens}{out_err}")
+        res.append(f"Total tokens: {total_tokens} of {max_input_tokens}{tot_err}")
+
+        if output_tokens >= max_output_tokens:
+            res.append("")
+            res.append("To reduce output tokens:")
+            res.append("- Ask for smaller changes in each request.")
+            res.append("- Break your code into smaller source files.")
+            if "diff" not in self.main_model.edit_format:
+                res.append(
+                    "- Try using a stronger model like gpt-4o or opus that can return diffs."
+                )
+
+        if input_tokens >= max_input_tokens or total_tokens >= max_input_tokens:
+            res.append("")
+            res.append("To reduce input tokens:")
+            res.append("- Use /tokens to see token usage.")
+            res.append("- Use /drop to remove unneeded files from the chat session.")
+            res.append("- Use /clear to clear the chat history.")
+            res.append("- Break your code into smaller source files.")
+
+        res.append("")
+        res.append(f"For more info: {urls.token_limits}")
+
+        res = "".join([line + "\n" for line in res])
+        self.io.tool_error(res)
+
     def lint_edited(self, fnames):
         res = ""
         for fname in fnames:

View file

@@ -5,3 +5,4 @@ git = "https://aider.chat/docs/git.html"
 enable_playwrite = "https://aider.chat/docs/install/optional.html#enable-playwright"
 favicon = "https://aider.chat/assets/icons/favicon-32x32.png"
 model_warnings = "https://aider.chat/docs/llms/warnings.html"
+token_limits = "https://aider.chat/docs/toubleshooting/token-limits.html"

View file

@@ -0,0 +1,86 @@
---
parent: Troubleshooting
nav_order: 25
---

# Token limits

Every LLM has limits on how many tokens it can process:

- The model's **context window** limits how many tokens of
  *input and output* it can process.
- Each model has a limit on how many **output tokens** it can
  produce.

Aider will report an error if a model indicates that
it has exceeded a token limit.
The error includes suggested actions to help
avoid hitting token limits.

Here's an example error:

```
Model gpt-3.5-turbo has hit a token limit!

Input tokens: 768 of 16385
Output tokens: 4096 of 4096 -- exceeded output limit!
Total tokens: 4864 of 16385

To reduce output tokens:
- Ask for smaller changes in each request.
- Break your code into smaller source files.
- Try using a stronger model like gpt-4o or opus that can return diffs.

For more info: https://aider.chat/docs/token-limits.html
```
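
The thresholds behind that report are simple comparisons of each token count
against the model's limits.
Here's a minimal sketch of those checks (not aider's actual code); the numbers
are the illustrative values from the example error above:

```
def check_limits(input_tokens, output_tokens, max_input_tokens, max_output_tokens):
    problems = []
    # The output alone can exceed the model's output cap...
    if output_tokens >= max_output_tokens:
        problems.append("exceeded output limit")
    # ...and the context window has to hold the input *plus* the output.
    if input_tokens >= max_input_tokens:
        problems.append("input: context window exhausted")
    if input_tokens + output_tokens >= max_input_tokens:
        problems.append("total: context window exhausted")
    return problems

print(check_limits(input_tokens=768, output_tokens=4096,
                   max_input_tokens=16385, max_output_tokens=4096))
# -> ['exceeded output limit']
```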

## Input tokens & context window size

The most common problem is trying to send too much data to a model,
overflowing its context window.
Technically you can exhaust the context window if the input alone is
too large, or if the input plus output together are too large.

Strong models like GPT-4o and Opus have quite
large context windows, so this sort of error is
typically only an issue when working with weaker models.

The easiest solution is to reduce the input tokens
by removing files from the chat.
It's best to add only the files that aider will need to *edit*
to complete your request.

- Use `/tokens` to see token usage.
- Use `/drop` to remove unneeded files from the chat session.
- Use `/clear` to clear the chat history.
- Break your code into smaller source files.
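
If you want a rough, standalone estimate of which files contribute the most
input tokens, a sketch like the one below can help.
It assumes the `tiktoken` package and an OpenAI-style tokenizer, and the file
names are hypothetical; aider's own `/tokens` report is the authoritative number.

```
import tiktoken

# Approximate per-file token counts with an OpenAI-style tokenizer.
# Other providers tokenize differently, so treat these as estimates.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

for fname in ["main.py", "utils.py"]:  # hypothetical file names
    with open(fname) as f:
        text = f.read()
    print(f"{fname}: ~{len(enc.encode(text))} tokens")
```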

## Output token limits

Most models have quite small output limits, often as low
as 4k tokens.
If you ask aider to make a large change that affects a lot
of code, the LLM may hit output token limits
as it tries to send back all the changes.

To avoid hitting output token limits:

- Ask for smaller changes in each request.
- Break your code into smaller source files.
- Try using a stronger model like gpt-4o or opus that can return diffs.
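
As a rough, illustrative calculation (assumed averages, not measurements):
a 1,000-line source file is on the order of 10k tokens, so a model asked to
return the entire edited file would blow well past a 4k output cap, while a
diff touching a few dozen lines usually needs only a few hundred output tokens.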

## Other causes

Sometimes token limit errors are caused by
non-compliant API proxy servers
or bugs in the API server you are using to host a local model.

Aider has been well tested when directly connecting to major
[LLM provider cloud APIs](https://aider.chat/docs/llms.html).
For serving local models,
[Ollama](https://aider.chat/docs/llms/ollama.html) is known to work well with aider.

Try using aider without an API proxy server,
or directly with one of the recommended cloud APIs,
and see if your token limit problems resolve.