feat: Add HTML content detection to scrape method

2025-05-30 17:24:59 +00:00 · 2024-08-12 09:54:03 -07:00 · 2024-08-12 09:54:03 -07:00 · 2f4dd04164
commit 2f4dd04164
parent ec63642666
1 changed files with 21 additions and 2 deletions
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -2,6 +2,7 @@

 import re
 import sys
+import re

 import pypandoc

@@ -102,13 +103,31 @@ class Scraper:
            self.print_error(f"Failed to retrieve content from {url}")
            return None

-        # Check if the content is HTML based on MIME type
-        if mime_type and mime_type.startswith("text/html"):
+        # Check if the content is HTML based on MIME type or content
+        if (mime_type and mime_type.startswith("text/html")) or (mime_type is None and self.looks_like_html(content)):
            self.try_pandoc()
            content = self.html_to_markdown(content)

        return content

+    def looks_like_html(self, content):
+        """
+        Check if the content looks like HTML.
+        """
+        if isinstance(content, str):
+            # Check for common HTML tags
+            html_patterns = [
+                r'<!DOCTYPE\s+html',
+                r'<html',
+                r'<head',
+                r'<body',
+                r'<div',
+                r'<p>',
+                r'<a\s+href=',
+            ]
+            return any(re.search(pattern, content, re.IGNORECASE) for pattern in html_patterns)
+        return False
+
    # Internals...
    def scrape_with_playwright(self, url):
        import playwright