Handle SSL certificate errors in the Playwright-based web scraper

This commit is contained in:
Paul Gauthier (aider) 2024-07-28 16:35:00 -03:00
parent 172af4ea57
commit 0f2aa62e80

View file

@@ -119,24 +119,30 @@ class Scraper:
self.print_error(str(e))
return
page = browser.new_page(ignore_https_errors=not self.verify_ssl)
user_agent = page.evaluate("navigator.userAgent")
user_agent = user_agent.replace("Headless", "")
user_agent = user_agent.replace("headless", "")
user_agent += " " + aider_user_agent
page = browser.new_page(user_agent=user_agent)
try:
page.goto(url, wait_until="networkidle", timeout=5000)
except playwright._impl._errors.TimeoutError:
pass
try:
content = page.content()
except playwright._impl._errors.Error as e:
self.print_error(f"Error retrieving page content: {str(e)}")
content = None
context = browser.new_context(ignore_https_errors=not self.verify_ssl)
page = context.new_page()
user_agent = page.evaluate("navigator.userAgent")
user_agent = user_agent.replace("Headless", "")
user_agent = user_agent.replace("headless", "")
user_agent += " " + aider_user_agent
page.set_extra_http_headers({"User-Agent": user_agent})
try:
page.goto(url, wait_until="networkidle", timeout=5000)
except playwright._impl._errors.TimeoutError:
self.print_error(f"Timeout while loading {url}")
except playwright._impl._errors.Error as e:
self.print_error(f"Error navigating to {url}: {str(e)}")
return None
try:
content = page.content()
except playwright._impl._errors.Error as e:
self.print_error(f"Error retrieving page content: {str(e)}")
content = None
finally:
browser.close()