Implemented SSL certificate verification option in the Scraper class.

This commit is contained in:
Paul Gauthier (aider) 2024-07-22 15:18:47 +02:00
parent 5ae96231ad
commit 97e51e60fc

View file

@ -71,9 +71,10 @@ class Scraper:
playwright_instructions_shown = False playwright_instructions_shown = False
# Public API... # Public API...
def __init__(self, print_error=None, playwright_available=None): def __init__(self, print_error=None, playwright_available=None, verify_ssl=True):
""" """
`print_error` - a function to call to print error/debug info. `print_error` - a function to call to print error/debug info.
`verify_ssl` - if False, disable SSL certificate verification when scraping.
""" """
if print_error: if print_error:
self.print_error = print_error self.print_error = print_error
@ -81,6 +82,7 @@ class Scraper:
self.print_error = print self.print_error = print
self.playwright_available = playwright_available self.playwright_available = playwright_available
self.verify_ssl = verify_ssl
def scrape(self, url): def scrape(self, url):
""" """
@ -110,13 +112,13 @@ class Scraper:
with sync_playwright() as p: with sync_playwright() as p:
try: try:
browser = p.chromium.launch() browser = p.chromium.launch(ignore_https_errors=not self.verify_ssl)
except Exception as e: except Exception as e:
self.playwright_available = False self.playwright_available = False
self.print_error(e) self.print_error(e)
return return
page = browser.new_page() page = browser.new_page(ignore_https_errors=not self.verify_ssl)
user_agent = page.evaluate("navigator.userAgent") user_agent = page.evaluate("navigator.userAgent")
user_agent = user_agent.replace("Headless", "") user_agent = user_agent.replace("Headless", "")
@ -138,7 +140,7 @@ class Scraper:
headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"} headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
try: try:
with httpx.Client(headers=headers) as client: with httpx.Client(headers=headers, verify=self.verify_ssl) as client:
response = client.get(url) response = client.get(url)
response.raise_for_status() response.raise_for_status()
return response.text return response.text