Roughly working scraper

Paul Gauthier 2024-02-08 11:44:55 -08:00
parent 9bf3a6e0c6
commit 5b78d929a6
6 changed files with 77 additions and 208 deletions

@@ -1,19 +1,22 @@
 #!/usr/bin/env python
 import sys
-from aider import __version__
+from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
-aider_user_agent= f'Aider/{__version__} https://aider.chat'
-PLAYWRIGHT_INFO = '''
+from aider import __version__
+aider_user_agent = f"Aider/{__version__} +https://aider.chat"
+PLAYWRIGHT_INFO = """
 For better web scraping, install Playwright chromium:
     playwright install --with-deps chromium
 See https://aider.chat/docs/install.html#enable-playwright for more info.
-'''
+"""
 class Scraper:
     playwright_available = None
@@ -29,15 +32,16 @@ class Scraper:
             try:
                 browser = p.chromium.launch()
             except Exception as e:
-                print(repr(e))
                 self.playwright_available = False
+                self.print_error(e)
                 return
             page = browser.new_page()
             user_agent = page.evaluate("navigator.userAgent")
-            user_agent = user_agent.replace('Headless','')
-            user_agent = user_agent.replace('headless', '')
-            user_agent += ' ' + aider_user_agent
+            user_agent = user_agent.replace("Headless", "")
+            user_agent = user_agent.replace("headless", "")
+            user_agent += " " + aider_user_agent
             page = browser.new_page(user_agent=user_agent)
             page.goto(url)
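
The hunk above is the core of the new approach: launch headless chromium, read the browser's real navigator.userAgent from a throwaway page, strip the "Headless" marker so sites see an ordinary browser string, append an identifying suffix, and only then open the page that does the fetching. A minimal standalone sketch of that pattern, assuming Playwright and its chromium build are installed (the scrape_page name and the example suffix are illustrative, not from the commit):

from playwright.sync_api import sync_playwright

def scrape_page(url):
    # Sketch of the user-agent rewrite shown in the hunk above
    with sync_playwright() as p:
        browser = p.chromium.launch()
        # Read the real user agent from a throwaway page...
        probe = browser.new_page()
        user_agent = probe.evaluate("navigator.userAgent")
        # ...drop the headless markers and append an identifying suffix,
        # as the diff does with aider_user_agent
        user_agent = user_agent.replace("Headless", "").replace("headless", "")
        user_agent += " Example/1.0 +https://example.com"  # stand-in for aider_user_agent
        # Only the second page, with the rewritten user agent, fetches content
        page = browser.new_page(user_agent=user_agent)
        page.goto(url)
        html = page.content()
        browser.close()
        return html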
@@ -49,26 +53,25 @@ class Scraper:
     def try_playwright(self):
         with sync_playwright() as p:
             try:
-                browser = p.chromium.launch()
+                p.chromium.launch()
                 self.playwright_available = True
-            except Exception as e:
+            except Exception:
                 self.playwright_available = False
                 self.print_error(PLAYWRIGHT_INFO)
     def scrape_with_httpx(self, url):
         import httpx
-        headers = {
-            'User-Agent': aider_user_agent
-        }
+        headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
         try:
             with httpx.Client(headers=headers) as client:
                 response = client.get(url)
                 response.raise_for_status()
                 return response.text
         except httpx.HTTPError as http_err:
-            self.print_error(f'HTTP error occurred: {http_err}')
+            self.print_error(f"HTTP error occurred: {http_err}")
         except Exception as err:
-            self.print_error(f'An error occurred: {err}')
+            self.print_error(f"An error occurred: {err}")
         return None
     def scrape(self, url):
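
try_playwright now probes availability by launching (and discarding) chromium once, and scrape_with_httpx is the fallback path when that probe fails: one client, one GET, HTTP errors reported rather than raised. A sketch of the fallback as a free function, assuming only httpx (fetch_with_httpx is an invented name; note the committed header literally reads "Mozilla./5.0", which looks like a typo for "Mozilla/5.0"):

import httpx

def fetch_with_httpx(url, agent="Example/1.0 +https://example.com"):
    # Same shape as Scraper.scrape_with_httpx in the hunk above
    headers = {"User-Agent": f"Mozilla/5.0 ({agent})"}
    try:
        with httpx.Client(headers=headers) as client:
            response = client.get(url)
            response.raise_for_status()
            return response.text
    except httpx.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error occurred: {err}")
    return None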
@@ -80,13 +83,35 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)
+        content = html_to_text(content)
         return content
+# Adapted from AutoGPT, MIT License
+#
+# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
+def html_to_text(page_source: str) -> str:
+    soup = BeautifulSoup(page_source, "html.parser")
+    for script in soup(["script", "style"]):
+        script.extract()
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = "\n".join(chunk for chunk in chunks if chunk)
+    return text
 def main(url):
     scraper = Scraper()
     content = scraper.scrape(url)
     print(content)
 if __name__ == "__main__":
     if len(sys.argv) < 2:
         print("Usage: python playw.py <URL>")

@@ -1,35 +1,13 @@
 import os
 import tempfile
 from pathlib import Path
-from typing import Type
 import git
-from bs4 import BeautifulSoup
-from selenium.common.exceptions import WebDriverException
-from selenium.webdriver.chrome.options import Options as ChromeOptions
-from selenium.webdriver.chrome.service import Service as ChromeDriverService
-from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.common.options import ArgOptions as BrowserOptions
-from selenium.webdriver.edge.options import Options as EdgeOptions
-from selenium.webdriver.edge.service import Service as EdgeDriverService
-from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
-from selenium.webdriver.firefox.options import Options as FirefoxOptions
-from selenium.webdriver.firefox.service import Service as GeckoDriverService
-from selenium.webdriver.firefox.webdriver import WebDriver as FirefoxDriver
-from selenium.webdriver.remote.webdriver import WebDriver
-from selenium.webdriver.safari.options import Options as SafariOptions
-from selenium.webdriver.safari.webdriver import WebDriver as SafariDriver
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.wait import WebDriverWait
-from webdriver_manager.chrome import ChromeDriverManager
-from webdriver_manager.firefox import GeckoDriverManager
-from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
-IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
 from aider.dump import dump  # noqa: F401
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
 class IgnorantTemporaryDirectory:
     def __init__(self):
@@ -139,104 +117,3 @@ def is_gpt4_with_openai_base_url(model_name, client):
     if client is None or not hasattr(client, "base_url"):
         return False
     return model_name.startswith("gpt-4") and "api.openai.com" in client.base_url.host
-# Taken from AutoGPT, MIT License
-def open_page_in_browser(
-    url: str,
-    selenium_web_browser="chrome",
-    selenium_headless=True,
-    platform="linux",
-    user_agent="Aider CLI 0.23.0",
-) -> WebDriver:
-    """Open a browser window and load a web page using Selenium
-
-    Params:
-        url (str): The URL of the page to load
-        config (Config): The applicable application configuration
-
-    Returns:
-        driver (WebDriver): A driver object representing the browser window to scrape
-    """
-    options_available: dict[str, Type[BrowserOptions]] = {
-        "chrome": ChromeOptions,
-        "edge": EdgeOptions,
-        "firefox": FirefoxOptions,
-        "safari": SafariOptions,
-    }
-
-    options: BrowserOptions = options_available[selenium_web_browser]()
-    options.add_argument(f"user-agent={user_agent}")
-
-    if selenium_web_browser == "firefox":
-        if selenium_headless:
-            options.headless = True
-            options.add_argument("--disable-gpu")
-        driver = FirefoxDriver(
-            service=GeckoDriverService(GeckoDriverManager().install()), options=options
-        )
-    elif selenium_web_browser == "edge":
-        driver = EdgeDriver(
-            service=EdgeDriverService(EdgeDriverManager().install()), options=options
-        )
-    elif selenium_web_browser == "safari":
-        # Requires a bit more setup on the users end.
-        # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari  # noqa: E501
-        driver = SafariDriver(options=options)
-    else:
-        if platform == "linux" or platform == "linux2":
-            options.add_argument("--disable-dev-shm-usage")
-            options.add_argument("--remote-debugging-port=9222")
-
-        options.add_argument("--no-sandbox")
-        if selenium_headless:
-            options.add_argument("--headless=new")
-            options.add_argument("--disable-gpu")
-
-        chromium_driver_path = Path("/usr/bin/chromedriver")
-
-        driver = ChromeDriver(
-            service=(
-                ChromeDriverService(str(chromium_driver_path))
-                if chromium_driver_path.exists()
-                else ChromeDriverService(ChromeDriverManager().install())
-            ),
-            options=options,
-        )
-    driver.get(url)
-
-    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-
-    return driver
-
-
-# Taken from AutoGPT, MIT License
-def scrape_text_with_selenium(driver: WebDriver) -> str:
-    """Scrape text from a browser window using selenium
-
-    Args:
-        driver (WebDriver): A driver object representing the browser window to scrape
-
-    Returns:
-        str: the text scraped from the website
-    """
-    # Get the HTML content directly from the browser's DOM
-    page_source = driver.execute_script("return document.body.outerHTML;")
-    soup = BeautifulSoup(page_source, "html.parser")
-
-    for script in soup(["script", "style"]):
-        script.extract()
-
-    text = soup.get_text()
-    lines = (line.strip() for line in text.splitlines())
-    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    text = "\n".join(chunk for chunk in chunks if chunk)
-    return text
-
-
-def scrape(url: str):
-    driver = open_page_in_browser(url)
-    text = scrape_text_with_selenium(driver)
-    driver.quit()
-    return text
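
This hunk retires the entire Selenium/webdriver-manager stack: four driver classes, three driver managers, and roughly a hundred lines of per-browser setup, replaced by the single-dependency Playwright scraper in the first file. Reusing the commit's html_to_text and the hypothetical scrape_page sketch above, the removed scrape() entry point collapses to one call chain (names assumed from those sketches, not from the commit):

def scrape(url):
    # Stands in for open_page_in_browser + scrape_text_with_selenium + driver.quit()
    return html_to_text(scrape_page(url))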