Handle SSL certificate errors in the Playwright-based web scraper

Paul Gauthier (aider) 2024-07-28 16:35:00 -03:00
parent 172af4ea57
commit 0f2aa62e80


@@ -119,24 +119,30 @@ class Scraper:
                 self.print_error(str(e))
                 return
 
-            page = browser.new_page(ignore_https_errors=not self.verify_ssl)
-
-            user_agent = page.evaluate("navigator.userAgent")
-            user_agent = user_agent.replace("Headless", "")
-            user_agent = user_agent.replace("headless", "")
-            user_agent += " " + aider_user_agent
-
-            page = browser.new_page(user_agent=user_agent)
-
             try:
-                page.goto(url, wait_until="networkidle", timeout=5000)
-            except playwright._impl._errors.TimeoutError:
-                pass
-
-            try:
-                content = page.content()
-            except playwright._impl._errors.Error as e:
-                self.print_error(f"Error retrieving page content: {str(e)}")
-                content = None
+                context = browser.new_context(ignore_https_errors=not self.verify_ssl)
+                page = context.new_page()
+
+                user_agent = page.evaluate("navigator.userAgent")
+                user_agent = user_agent.replace("Headless", "")
+                user_agent = user_agent.replace("headless", "")
+                user_agent += " " + aider_user_agent
+
+                page.set_extra_http_headers({"User-Agent": user_agent})
+
+                try:
+                    page.goto(url, wait_until="networkidle", timeout=5000)
+                except playwright._impl._errors.TimeoutError:
+                    self.print_error(f"Timeout while loading {url}")
+                except playwright._impl._errors.Error as e:
+                    self.print_error(f"Error navigating to {url}: {str(e)}")
+                    return None
+
+                try:
+                    content = page.content()
+                except playwright._impl._errors.Error as e:
+                    self.print_error(f"Error retrieving page content: {str(e)}")
+                    content = None
             finally:
                 browser.close()
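
For reference, a minimal standalone sketch of the pattern the new code adopts, using Playwright's public sync API names (playwright.sync_api.Error and TimeoutError rather than the internal playwright._impl._errors paths in the diff). The fetch_page function and the verify_ssl flag are illustrative, not part of aider's codebase.

# A minimal sketch (not aider's module): SSL verification is controlled by
# ignore_https_errors on the browser context, the User-Agent is overridden
# with extra HTTP headers, and navigation errors are reported rather than
# raised. fetch_page and verify_ssl are illustrative names.
from playwright.sync_api import Error, TimeoutError, sync_playwright


def fetch_page(url, verify_ssl=True):
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            # ignore_https_errors=True tells Playwright to accept self-signed
            # or otherwise invalid certificates instead of failing navigation.
            context = browser.new_context(ignore_https_errors=not verify_ssl)
            page = context.new_page()

            # Strip the "Headless" marker from the default UA and send the
            # result as a request header.
            user_agent = page.evaluate("navigator.userAgent")
            user_agent = user_agent.replace("Headless", "").replace("headless", "")
            page.set_extra_http_headers({"User-Agent": user_agent})

            try:
                page.goto(url, wait_until="networkidle", timeout=5000)
            except TimeoutError:
                print(f"Timeout while loading {url}")
            except Error as e:
                print(f"Error navigating to {url}: {e}")
                return None

            return page.content()
        finally:
            # The try/finally guarantees the browser is closed even if
            # navigation or content retrieval fails.
            browser.close()

Calling fetch_page with verify_ssl=False makes the browser context accept invalid or self-signed certificates via ignore_https_errors=True, so scraping proceeds instead of aborting on an SSL certificate error.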