diff --git a/aider/scrape.py b/aider/scrape.py index f16e0ef16..7d72b5db0 100755 --- a/aider/scrape.py +++ b/aider/scrape.py @@ -2,6 +2,7 @@ import re import sys +import re import pypandoc @@ -102,13 +103,31 @@ class Scraper: self.print_error(f"Failed to retrieve content from {url}") return None - # Check if the content is HTML based on MIME type - if mime_type and mime_type.startswith("text/html"): + # Check if the content is HTML based on MIME type or content + if (mime_type and mime_type.startswith("text/html")) or (mime_type is None and self.looks_like_html(content)): self.try_pandoc() content = self.html_to_markdown(content) return content + def looks_like_html(self, content): + """ + Check if the content looks like HTML. + """ + if isinstance(content, str): + # Check for common HTML tags + html_patterns = [ + r'', + r'