From 2f4dd04164a02eeb85dc361e3d087dbc079d5977 Mon Sep 17 00:00:00 2001 From: "Paul Gauthier (aider)" Date: Mon, 12 Aug 2024 09:54:03 -0700 Subject: [PATCH] feat: Add HTML content detection to scrape method --- aider/scrape.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/aider/scrape.py b/aider/scrape.py index f16e0ef16..7d72b5db0 100755 --- a/aider/scrape.py +++ b/aider/scrape.py @@ -2,6 +2,7 @@ import re import sys +import re import pypandoc @@ -102,13 +103,31 @@ class Scraper: self.print_error(f"Failed to retrieve content from {url}") return None - # Check if the content is HTML based on MIME type - if mime_type and mime_type.startswith("text/html"): + # Check if the content is HTML based on MIME type or content + if (mime_type and mime_type.startswith("text/html")) or (mime_type is None and self.looks_like_html(content)): self.try_pandoc() content = self.html_to_markdown(content) return content + def looks_like_html(self, content): + """ + Check if the content looks like HTML. + """ + if isinstance(content, str): + # Check for common HTML tags + html_patterns = [ + r'', + r'