Roughly working scraper

This commit is contained in:
Paul Gauthier 2024-02-08 11:44:55 -08:00
parent 9bf3a6e0c6
commit 5b78d929a6
6 changed files with 77 additions and 208 deletions

View file

@@ -1,19 +1,22 @@
#!/usr/bin/env python
import sys
from aider import __version__
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
aider_user_agent= f'Aider/{__version__} https://aider.chat'
from aider import __version__
PLAYWRIGHT_INFO = '''
aider_user_agent = f"Aider/{__version__} +https://aider.chat"
PLAYWRIGHT_INFO = """
For better web scraping, install Playwright chromium:
playwright install --with-deps chromium
See https://aider.chat/docs/install.html#enable-playwright for more info.
'''
"""
class Scraper:
playwright_available = None
@@ -29,15 +32,16 @@ class Scraper:
try:
browser = p.chromium.launch()
except Exception as e:
print(repr(e))
self.playwright_available = False
self.print_error(e)
return
page = browser.new_page()
user_agent = page.evaluate("navigator.userAgent")
user_agent = user_agent.replace('Headless','')
user_agent = user_agent.replace('headless', '')
user_agent += ' ' + aider_user_agent
user_agent = user_agent.replace("Headless", "")
user_agent = user_agent.replace("headless", "")
user_agent += " " + aider_user_agent
page = browser.new_page(user_agent=user_agent)
page.goto(url)
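The hunk above shows the trick this commit relies on: headless chromium reports a user agent containing "Headless", which some sites use to reject scrapers, so the code reads the browser's real user agent, strips that marker, and appends aider's own identifier. A minimal self-contained sketch of the same flow, assuming chromium has been installed via `playwright install chromium`; the function name and version string are illustrative:

```
from playwright.sync_api import sync_playwright

def fetch_with_clean_user_agent(url, extra_agent="Aider/0.24.0 +https://aider.chat"):
    with sync_playwright() as p:
        browser = p.chromium.launch()
        # Throwaway page, used only to read the browser's real user agent.
        page = browser.new_page()
        user_agent = page.evaluate("navigator.userAgent")
        # Strip the "Headless" marker that some sites reject, then append
        # aider's identifier so servers can see who is fetching the page.
        user_agent = user_agent.replace("Headless", "").replace("headless", "")
        user_agent += " " + extra_agent
        # Open a fresh page that sends the cleaned-up user agent.
        page = browser.new_page(user_agent=user_agent)
        page.goto(url)
        content = page.content()
        browser.close()
    return content
```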
@@ -49,26 +53,25 @@ class Scraper:
def try_playwright(self):
with sync_playwright() as p:
try:
browser = p.chromium.launch()
p.chromium.launch()
self.playwright_available = True
except Exception as e:
except Exception:
self.playwright_available = False
self.print_error(PLAYWRIGHT_INFO)
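This hunk simplifies the probe: only the success or failure of the launch matters, so the browser handle is discarded and the raw exception is replaced with a one-time install hint. A sketch of the probe-and-cache pattern; the early-return guard is an assumption, since the hunk only shows the probe itself:

```
from playwright.sync_api import sync_playwright

class Scraper:
    # Tri-state: None means "not probed yet", True/False is the cached answer.
    playwright_available = None

    def try_playwright(self):
        if self.playwright_available is not None:
            return  # probe at most once per process (assumed guard)
        with sync_playwright() as p:
            try:
                p.chromium.launch()  # raises if chromium is not installed
                self.playwright_available = True
            except Exception:
                self.playwright_available = False
```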
def scrape_with_httpx(self, url):
import httpx
headers = {
'User-Agent': aider_user_agent
}
headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
try:
with httpx.Client(headers=headers) as client:
response = client.get(url)
response.raise_for_status()
return response.text
except httpx.HTTPError as http_err:
self.print_error(f'HTTP error occurred: {http_err}')
self.print_error(f"HTTP error occurred: {http_err}")
except Exception as err:
self.print_error(f'An error occurred: {err}')
self.print_error(f"An error occurred: {err}")
return None
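The httpx fallback identifies itself with a Mozilla-style user agent that still embeds aider's identifier, since bare library user agents are a common trigger for blocking. A standalone example of the same request pattern; the URL and version string are illustrative:

```
import httpx

headers = {"User-Agent": "Mozilla/5.0 (Aider/0.24.0 +https://aider.chat)"}
try:
    with httpx.Client(headers=headers) as client:
        response = client.get("https://example.com")
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        print(response.text[:200])
except httpx.HTTPError as err:
    print(f"HTTP error occurred: {err}")
```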
def scrape(self, url):
@@ -80,13 +83,35 @@ class Scraper:
else:
content = self.scrape_with_httpx(url)
content = html_to_text(content)
return content
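With both backends in place, `scrape()` picks the best one available and always returns plain text. Hypothetical usage:

```
scraper = Scraper()
# Uses Playwright's chromium when available, falls back to httpx otherwise.
text = scraper.scrape("https://aider.chat")
print(text)
```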
# Adapted from AutoGPT, MIT License
#
# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
def html_to_text(page_source: str) -> str:
soup = BeautifulSoup(page_source, "html.parser")
for script in soup(["script", "style"]):
script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
text = "\n".join(chunk for chunk in chunks if chunk)
return text
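The helper keeps only the rendered text: `script` and `style` tags are dropped, every line is trimmed, and empty chunks are discarded. A small illustration, assuming the `html_to_text` above:

```
html = """<html><body>
<h1>Title</h1>
<script>var x = 1;</script>
<p>Some paragraph text.</p>
</body></html>"""

print(html_to_text(html))
# Title
# Some paragraph text.
```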
def main(url):
scraper = Scraper()
content = scraper.scrape(url)
print(content)
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python playw.py <URL>")

View file

@@ -1,35 +1,13 @@
import os
import tempfile
from pathlib import Path
from typing import Type
import git
from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeDriverService
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.options import ArgOptions as BrowserOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeDriverService
from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as GeckoDriverService
from selenium.webdriver.firefox.webdriver import WebDriver as FirefoxDriver
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.safari.options import Options as SafariOptions
from selenium.webdriver.safari.webdriver import WebDriver as SafariDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
from aider.dump import dump # noqa: F401
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
class IgnorantTemporaryDirectory:
def __init__(self):
@@ -139,104 +117,3 @@ def is_gpt4_with_openai_base_url(model_name, client):
if client is None or not hasattr(client, "base_url"):
return False
return model_name.startswith("gpt-4") and "api.openai.com" in client.base_url.host
# Taken from AutoGPT, MIT License
def open_page_in_browser(
url: str,
selenium_web_browser="chrome",
selenium_headless=True,
platform="linux",
user_agent="Aider CLI 0.23.0",
) -> WebDriver:
"""Open a browser window and load a web page using Selenium
Params:
url (str): The URL of the page to load
config (Config): The applicable application configuration
Returns:
driver (WebDriver): A driver object representing the browser window to scrape
"""
options_available: dict[str, Type[BrowserOptions]] = {
"chrome": ChromeOptions,
"edge": EdgeOptions,
"firefox": FirefoxOptions,
"safari": SafariOptions,
}
options: BrowserOptions = options_available[selenium_web_browser]()
options.add_argument(f"user-agent={user_agent}")
if selenium_web_browser == "firefox":
if selenium_headless:
options.headless = True
options.add_argument("--disable-gpu")
driver = FirefoxDriver(
service=GeckoDriverService(GeckoDriverManager().install()), options=options
)
elif selenium_web_browser == "edge":
driver = EdgeDriver(
service=EdgeDriverService(EdgeDriverManager().install()), options=options
)
elif selenium_web_browser == "safari":
# Requires a bit more setup on the user's end.
# See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari # noqa: E501
driver = SafariDriver(options=options)
else:
if platform == "linux" or platform == "linux2":
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--remote-debugging-port=9222")
options.add_argument("--no-sandbox")
if selenium_headless:
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
chromium_driver_path = Path("/usr/bin/chromedriver")
driver = ChromeDriver(
service=(
ChromeDriverService(str(chromium_driver_path))
if chromium_driver_path.exists()
else ChromeDriverService(ChromeDriverManager().install())
),
options=options,
)
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
return driver
# Taken from AutoGPT, MIT License
def scrape_text_with_selenium(driver: WebDriver) -> str:
"""Scrape text from a browser window using selenium
Args:
driver (WebDriver): A driver object representing the browser window to scrape
Returns:
str: the text scraped from the website
"""
# Get the HTML content directly from the browser's DOM
page_source = driver.execute_script("return document.body.outerHTML;")
soup = BeautifulSoup(page_source, "html.parser")
for script in soup(["script", "style"]):
script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
text = "\n".join(chunk for chunk in chunks if chunk)
return text
def scrape(url: str):
driver = open_page_in_browser(url)
text = scrape_text_with_selenium(driver)
driver.quit()
return text

View file

@@ -77,6 +77,28 @@ Put a line in it like this to specify your api key:
openai-api-key: sk-...
```
## Enable Playwright
Aider supports adding web pages to the chat with the `/web <url>` command.
When you add a URL to the chat, aider fetches the page and scrapes its
content.
By default, aider uses the `httpx` library to scrape web pages, but this
only works on a subset of pages.
Some sites explicitly block requests from HTTP clients like httpx.
Others rely heavily on JavaScript to render the page content,
which httpx cannot execute.
Aider can scrape nearly any web page if you install
Playwright's chromium browser and its dependencies:
```
playwright install --with-deps chromium
```
See the
[Playwright for Python documentation](https://playwright.dev/python/docs/browsers#install-system-dependencies)
for additional information.
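After installing, you can confirm that Playwright's chromium is usable with a quick check like this; it mirrors the probe aider runs before scraping:

```
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()  # raises if chromium is not installed
    browser.close()
print("Playwright chromium is ready")
```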
## Enable voice coding (optional)

View file

@@ -1,23 +0,0 @@
from playwright.sync_api import sync_playwright
import sys
from playwright.__main__ import main as playwright_install
def main(url):
# Check if Chromium is installed, if not, install it
with sync_playwright() as p:
p.chromium.launch()
with sync_playwright() as p:
browser = p.chromium.launch(user_agent='Aider v0.24.0-dev')
page = browser.new_page()
page.goto(url)
#page.wait_for_load_state('networkidle')
content = page.content()
print(content)
browser.close()
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python playw.py <URL>")
sys.exit(1)
main(sys.argv[1])

View file

@@ -22,6 +22,4 @@ bs4
PyYAML
Pillow
diff-match-patch
webdriver_manager
selenium
chromedriver-autoinstaller
playwright

View file

@@ -13,9 +13,7 @@ anyio==4.2.0
attrs==23.2.0
# via
# jsonschema
# outcome
# referencing
# trio
backoff==2.2.1
# via -r requirements.in
beautifulsoup4==4.12.3
@@ -27,15 +25,12 @@ certifi==2023.11.17
# httpcore
# httpx
# requests
# selenium
cffi==1.16.0
# via
# sounddevice
# soundfile
charset-normalizer==3.3.2
# via requests
chromedriver-autoinstaller==0.3.1
# via -r requirements.in
configargparse==1.7
# via -r requirements.in
diff-match-patch==20230430
@@ -48,12 +43,12 @@ gitdb==4.0.11
# via gitpython
gitpython==3.1.40
# via -r requirements.in
greenlet==3.0.3
# via playwright
grep-ast==0.2.4
# via -r requirements.in
h11==0.14.0
# via
# httpcore
# wsproto
# via httpcore
httpcore==1.0.2
# via httpx
httpx==0.26.0
@@ -63,7 +58,6 @@ idna==3.6
# anyio
# httpx
# requests
# trio
jsonschema==4.20.0
# via -r requirements.in
jsonschema-specifications==2023.12.1
@@ -80,18 +74,16 @@ numpy==1.26.3
# scipy
openai==1.6.1
# via -r requirements.in
outcome==1.3.0.post0
# via trio
packaging==23.2
# via
# -r requirements.in
# webdriver-manager
# via -r requirements.in
pathspec==0.12.1
# via
# -r requirements.in
# grep-ast
pillow==10.2.0
# via -r requirements.in
playwright==1.41.2
# via -r requirements.in
prompt-toolkit==3.0.43
# via -r requirements.in
pycparser==2.21
@@ -100,12 +92,10 @@ pydantic==2.5.3
# via openai
pydantic-core==2.14.6
# via pydantic
pyee==11.0.1
# via playwright
pygments==2.17.2
# via rich
pysocks==1.7.1
# via urllib3
python-dotenv==1.0.1
# via webdriver-manager
pyyaml==6.0.1
# via -r requirements.in
referencing==0.32.0
@@ -115,9 +105,7 @@ referencing==0.32.0
regex==2023.12.25
# via tiktoken
requests==2.31.0
# via
# tiktoken
# webdriver-manager
# via tiktoken
rich==13.7.0
# via -r requirements.in
rpds-py==0.16.2
@@ -126,8 +114,6 @@ rpds-py==0.16.2
# referencing
scipy==1.11.4
# via -r requirements.in
selenium==4.17.2
# via -r requirements.in
smmap==5.0.1
# via gitdb
sniffio==1.3.0
@@ -135,9 +121,6 @@ sniffio==1.3.0
# anyio
# httpx
# openai
# trio
sortedcontainers==2.4.0
# via trio
sounddevice==0.4.6
# via -r requirements.in
soundfile==0.12.1
@@ -152,26 +135,13 @@ tree-sitter==0.20.4
# via tree-sitter-languages
tree-sitter-languages==1.9.1
# via grep-ast
trio==0.24.0
# via
# selenium
# trio-websocket
trio-websocket==0.11.1
# via selenium
typing-extensions==4.9.0
# via
# openai
# pydantic
# pydantic-core
# selenium
urllib3[socks]==2.1.0
# via
# requests
# selenium
# urllib3
# pyee
urllib3==2.1.0
# via requests
wcwidth==0.2.12
# via prompt-toolkit
webdriver-manager==4.0.1
# via -r requirements.in
wsproto==1.2.0
# via trio-websocket