mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 02:05:00 +00:00
feat: Add HTML content detection to scrape method
This commit is contained in:
parent
ec63642666
commit
2f4dd04164
1 changed file with 21 additions and 2 deletions
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
import pypandoc
|
import pypandoc
|
||||||
|
|
||||||
|
@ -102,13 +103,31 @@ class Scraper:
|
||||||
self.print_error(f"Failed to retrieve content from {url}")
|
self.print_error(f"Failed to retrieve content from {url}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Check if the content is HTML based on MIME type
|
# Check if the content is HTML based on MIME type or content
|
||||||
if mime_type and mime_type.startswith("text/html"):
|
if (mime_type and mime_type.startswith("text/html")) or (mime_type is None and self.looks_like_html(content)):
|
||||||
self.try_pandoc()
|
self.try_pandoc()
|
||||||
content = self.html_to_markdown(content)
|
content = self.html_to_markdown(content)
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
def looks_like_html(self, content):
    """
    Heuristically decide whether ``content`` is HTML.

    Only ``str`` input is inspected; any other type yields ``False``.
    A string counts as HTML when it contains, anywhere and regardless of
    letter case, at least one of a small set of common HTML markers.
    """
    # Guard clause: non-string payloads are never treated as HTML.
    if not isinstance(content, str):
        return False

    # Telltale markers of an HTML document or fragment.
    markers = (
        r'<!DOCTYPE\s+html',
        r'<html',
        r'<head',
        r'<body',
        r'<div',
        r'<p>',
        r'<a\s+href=',
    )

    # The first marker found settles the question.
    for marker in markers:
        if re.search(marker, content, re.IGNORECASE):
            return True
    return False
|
||||||
|
|
||||||
# Internals...
|
# Internals...
|
||||||
def scrape_with_playwright(self, url):
|
def scrape_with_playwright(self, url):
|
||||||
import playwright
|
import playwright
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue