mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-31 01:35:00 +00:00
feat: Implement MIME type detection in scrape methods
This commit is contained in:
parent
c0982af02c
commit
dfe2359a86
1 changed file with 12 additions and 10 deletions
|
@ -94,16 +94,16 @@ class Scraper:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.playwright_available:
|
if self.playwright_available:
|
||||||
content = self.scrape_with_playwright(url)
|
content, mime_type = self.scrape_with_playwright(url)
|
||||||
else:
|
else:
|
||||||
content = self.scrape_with_httpx(url)
|
content, mime_type = self.scrape_with_httpx(url)
|
||||||
|
|
||||||
if not content:
|
if not content:
|
||||||
self.print_error(f"Failed to retrieve content from {url}")
|
self.print_error(f"Failed to retrieve content from {url}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Check if the content is HTML
|
# Check if the content is HTML based on MIME type
|
||||||
if content.strip().startswith(('<html', '<!DOCTYPE html')):
|
if mime_type and mime_type.startswith('text/html'):
|
||||||
self.try_pandoc()
|
self.try_pandoc()
|
||||||
content = self.html_to_markdown(content)
|
content = self.html_to_markdown(content)
|
||||||
|
|
||||||
|
@ -120,7 +120,7 @@ class Scraper:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.playwright_available = False
|
self.playwright_available = False
|
||||||
self.print_error(str(e))
|
self.print_error(str(e))
|
||||||
return
|
return None, None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
context = browser.new_context(ignore_https_errors=not self.verify_ssl)
|
context = browser.new_context(ignore_https_errors=not self.verify_ssl)
|
||||||
|
@ -134,22 +134,24 @@ class Scraper:
|
||||||
page.set_extra_http_headers({"User-Agent": user_agent})
|
page.set_extra_http_headers({"User-Agent": user_agent})
|
||||||
|
|
||||||
try:
|
try:
|
||||||
page.goto(url, wait_until="networkidle", timeout=5000)
|
response = page.goto(url, wait_until="networkidle", timeout=5000)
|
||||||
except playwright._impl._errors.TimeoutError:
|
except playwright._impl._errors.TimeoutError:
|
||||||
self.print_error(f"Timeout while loading {url}")
|
self.print_error(f"Timeout while loading {url}")
|
||||||
except playwright._impl._errors.Error as e:
|
except playwright._impl._errors.Error as e:
|
||||||
self.print_error(f"Error navigating to {url}: {str(e)}")
|
self.print_error(f"Error navigating to {url}: {str(e)}")
|
||||||
return None
|
return None, None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content = page.content()
|
content = page.content()
|
||||||
|
mime_type = response.header_value("content-type").split(';')[0]
|
||||||
except playwright._impl._errors.Error as e:
|
except playwright._impl._errors.Error as e:
|
||||||
self.print_error(f"Error retrieving page content: {str(e)}")
|
self.print_error(f"Error retrieving page content: {str(e)}")
|
||||||
content = None
|
content = None
|
||||||
|
mime_type = None
|
||||||
finally:
|
finally:
|
||||||
browser.close()
|
browser.close()
|
||||||
|
|
||||||
return content
|
return content, mime_type
|
||||||
|
|
||||||
def scrape_with_httpx(self, url):
|
def scrape_with_httpx(self, url):
|
||||||
import httpx
|
import httpx
|
||||||
|
@ -159,12 +161,12 @@ class Scraper:
|
||||||
with httpx.Client(headers=headers, verify=self.verify_ssl) as client:
|
with httpx.Client(headers=headers, verify=self.verify_ssl) as client:
|
||||||
response = client.get(url)
|
response = client.get(url)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.text
|
return response.text, response.headers.get('content-type', '').split(';')[0]
|
||||||
except httpx.HTTPError as http_err:
|
except httpx.HTTPError as http_err:
|
||||||
self.print_error(f"HTTP error occurred: {http_err}")
|
self.print_error(f"HTTP error occurred: {http_err}")
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.print_error(f"An error occurred: {err}")
|
self.print_error(f"An error occurred: {err}")
|
||||||
return None
|
return None, None
|
||||||
|
|
||||||
def try_pandoc(self):
|
def try_pandoc(self):
|
||||||
if self.pandoc_available:
|
if self.pandoc_available:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue