From 2f4dd04164a02eeb85dc361e3d087dbc079d5977 Mon Sep 17 00:00:00 2001
From: "Paul Gauthier (aider)" <aider@paulg.org>
Date: Mon, 12 Aug 2024 09:54:03 -0700
Subject: [PATCH] feat: Add HTML content detection to scrape method

---
 aider/scrape.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/aider/scrape.py b/aider/scrape.py
index f16e0ef16..7d72b5db0 100755
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -2,6 +2,7 @@
 
 import re
 import sys
+import re
 
 import pypandoc
 
@@ -102,13 +103,31 @@ class Scraper:
             self.print_error(f"Failed to retrieve content from {url}")
             return None
 
-        # Check if the content is HTML based on MIME type
-        if mime_type and mime_type.startswith("text/html"):
+        # Check if the content is HTML based on MIME type or content
+        if (mime_type and mime_type.startswith("text/html")) or (mime_type is None and self.looks_like_html(content)):
             self.try_pandoc()
             content = self.html_to_markdown(content)
 
         return content
 
+    def looks_like_html(self, content):
+        """
+        Return True if content is a str matching any HTML marker pattern below, ignoring case.
+        """
+        if isinstance(content, str):
+            # Common HTML markers; a single match anywhere in the text is enough
+            html_patterns = [
+                r'<!DOCTYPE\s+html',
+                r'<html',
+                r'<head',
+                r'<body',
+                r'<div',
+                r'<p>',
+                r'<a\s+href=',
+            ]
+            return any(re.search(pattern, content, re.IGNORECASE) for pattern in html_patterns)
+        return False
+
     # Internals...
     def scrape_with_playwright(self, url):
         import playwright