diff --git a/aider/scrape.py b/aider/scrape.py
index 1e6bb656b..737bb6561 100755
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -1,19 +1,22 @@
 #!/usr/bin/env python
 
 import sys
 
-from aider import __version__
+from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
 
-aider_user_agent= f'Aider/{__version__} https://aider.chat'
+from aider import __version__
 
-PLAYWRIGHT_INFO = '''
+aider_user_agent = f"Aider/{__version__} +https://aider.chat"
+
+PLAYWRIGHT_INFO = """
 For better web scraping, install Playwright chromium:
 
     playwright install --with-deps chromium
 
 See https://aider.chat/docs/install.html#enable-playwright for more info.
-'''
+"""
+
 
 class Scraper:
     playwright_available = None
@@ -29,15 +32,16 @@ class Scraper:
             try:
                 browser = p.chromium.launch()
             except Exception as e:
-                print(repr(e))
+                self.playwright_available = False
+                self.print_error(e)
                 return
 
             page = browser.new_page()
 
             user_agent = page.evaluate("navigator.userAgent")
-            user_agent = user_agent.replace('Headless','')
-            user_agent = user_agent.replace('headless', '')
-            user_agent += ' ' + aider_user_agent
+            user_agent = user_agent.replace("Headless", "")
+            user_agent = user_agent.replace("headless", "")
+            user_agent += " " + aider_user_agent
 
             page = browser.new_page(user_agent=user_agent)
             page.goto(url)
@@ -49,26 +53,25 @@ class Scraper:
     def try_playwright(self):
         with sync_playwright() as p:
             try:
-                browser = p.chromium.launch()
+                p.chromium.launch()
                 self.playwright_available = True
-            except Exception as e:
+            except Exception:
                 self.playwright_available = False
                 self.print_error(PLAYWRIGHT_INFO)
 
     def scrape_with_httpx(self, url):
         import httpx
-        headers = {
-            'User-Agent': aider_user_agent
-        }
+
+        headers = {"User-Agent": f"Mozilla/5.0 ({aider_user_agent})"}
         try:
             with httpx.Client(headers=headers) as client:
                 response = client.get(url)
                 response.raise_for_status()
                 return response.text
         except httpx.HTTPError as http_err:
-            self.print_error(f'HTTP error occurred: {http_err}')
+            self.print_error(f"HTTP error occurred: {http_err}")
         except Exception as err:
-            self.print_error(f'An error occurred: {err}')
+            self.print_error(f"An error occurred: {err}")
         return None
 
     def scrape(self, url):
@@ -80,13 +83,35 @@
         else:
             content = self.scrape_with_httpx(url)
 
+        content = html_to_text(content)
+
         return content
 
+
+# Adapted from AutoGPT, MIT License
+#
+# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
+
+
+def html_to_text(page_source: str) -> str:
+    soup = BeautifulSoup(page_source, "html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = "\n".join(chunk for chunk in chunks if chunk)
+    return text
+
+
 def main(url):
     scraper = Scraper()
     content = scraper.scrape(url)
     print(content)
 
+
 if __name__ == "__main__":
     if len(sys.argv) < 2:
         print("Usage: python playw.py <URL>")
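The new `scrape()` entry point ties the pieces together: it prefers Playwright, falls back to `httpx`, and finally strips the fetched HTML down to plain text. A minimal usage sketch of the class added above (assuming `Scraper()` takes no required constructor arguments, which this hunk doesn't show; the URL is illustrative):

```python
# Sketch: driving the new Scraper added in aider/scrape.py.
# Assumption: Scraper() is constructible with no arguments.
from aider.scrape import Scraper

scraper = Scraper()

# try_playwright() records chromium availability in playwright_available;
# scrape() then uses Playwright if it can, or scrape_with_httpx() if not,
# and runs the fetched page through html_to_text().
text = scraper.scrape("https://example.com")
print(text)
```

Either way, the caller gets back whitespace-normalized text rather than raw HTML, which is what aider adds to the chat.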
diff --git a/aider/utils.py b/aider/utils.py
index 62ac5caef..0dd316600 100644
--- a/aider/utils.py
+++ b/aider/utils.py
@@ -1,35 +1,13 @@
 import os
 import tempfile
 from pathlib import Path
-from typing import Type
 
 import git
-from bs4 import BeautifulSoup
-from selenium.common.exceptions import WebDriverException
-from selenium.webdriver.chrome.options import Options as ChromeOptions
-from selenium.webdriver.chrome.service import Service as ChromeDriverService
-from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.common.options import ArgOptions as BrowserOptions
-from selenium.webdriver.edge.options import Options as EdgeOptions
-from selenium.webdriver.edge.service import Service as EdgeDriverService
-from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
-from selenium.webdriver.firefox.options import Options as FirefoxOptions
-from selenium.webdriver.firefox.service import Service as GeckoDriverService
-from selenium.webdriver.firefox.webdriver import WebDriver as FirefoxDriver
-from selenium.webdriver.remote.webdriver import WebDriver
-from selenium.webdriver.safari.options import Options as SafariOptions
-from selenium.webdriver.safari.webdriver import WebDriver as SafariDriver
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.wait import WebDriverWait
-from webdriver_manager.chrome import ChromeDriverManager
-from webdriver_manager.firefox import GeckoDriverManager
-from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
-
-IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
 
 from aider.dump import dump  # noqa: F401
 
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
+
 
 class IgnorantTemporaryDirectory:
     def __init__(self):
@@ -139,104 +117,3 @@ def is_gpt4_with_openai_base_url(model_name, client):
     if client is None or not hasattr(client, "base_url"):
         return False
     return model_name.startswith("gpt-4") and "api.openai.com" in client.base_url.host
-
-
-# Taken from AutoGPT, MIT License
-def open_page_in_browser(
-    url: str,
-    selenium_web_browser="chrome",
-    selenium_headless=True,
-    platform="linux",
-    user_agent="Aider CLI 0.23.0",
-) -> WebDriver:
-    """Open a browser window and load a web page using Selenium
-
-    Params:
-        url (str): The URL of the page to load
-        config (Config): The applicable application configuration
-
-    Returns:
-        driver (WebDriver): A driver object representing the browser window to scrape
-    """
-    options_available: dict[str, Type[BrowserOptions]] = {
-        "chrome": ChromeOptions,
-        "edge": EdgeOptions,
-        "firefox": FirefoxOptions,
-        "safari": SafariOptions,
-    }
-
-    options: BrowserOptions = options_available[selenium_web_browser]()
-    options.add_argument(f"user-agent={user_agent}")
-
-    if selenium_web_browser == "firefox":
-        if selenium_headless:
-            options.headless = True
-            options.add_argument("--disable-gpu")
-        driver = FirefoxDriver(
-            service=GeckoDriverService(GeckoDriverManager().install()), options=options
-        )
-    elif selenium_web_browser == "edge":
-        driver = EdgeDriver(
-            service=EdgeDriverService(EdgeDriverManager().install()), options=options
-        )
-    elif selenium_web_browser == "safari":
-        # Requires a bit more setup on the user's end.
-        # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari  # noqa: E501
-        driver = SafariDriver(options=options)
-    else:
-        if platform == "linux" or platform == "linux2":
-            options.add_argument("--disable-dev-shm-usage")
-            options.add_argument("--remote-debugging-port=9222")
-
-        options.add_argument("--no-sandbox")
-        if selenium_headless:
-            options.add_argument("--headless=new")
-            options.add_argument("--disable-gpu")
-
-        chromium_driver_path = Path("/usr/bin/chromedriver")
-
-        driver = ChromeDriver(
-            service=(
-                ChromeDriverService(str(chromium_driver_path))
-                if chromium_driver_path.exists()
-                else ChromeDriverService(ChromeDriverManager().install())
-            ),
-            options=options,
-        )
-    driver.get(url)
-
-    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-
-    return driver
-
-
-# Taken from AutoGPT, MIT License
-def scrape_text_with_selenium(driver: WebDriver) -> str:
-    """Scrape text from a browser window using selenium
-
-    Args:
-        driver (WebDriver): A driver object representing the browser window to scrape
-
-    Returns:
-        str: the text scraped from the website
-    """
-
-    # Get the HTML content directly from the browser's DOM
-    page_source = driver.execute_script("return document.body.outerHTML;")
-    soup = BeautifulSoup(page_source, "html.parser")
-
-    for script in soup(["script", "style"]):
-        script.extract()
-
-    text = soup.get_text()
-    lines = (line.strip() for line in text.splitlines())
-    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    text = "\n".join(chunk for chunk in chunks if chunk)
-    return text
-
-
-def scrape(url: str):
-    driver = open_page_in_browser(url)
-    text = scrape_text_with_selenium(driver)
-    driver.quit()
-    return text
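The selenium machinery above is deleted wholesale, but the BeautifulSoup text cleanup from `scrape_text_with_selenium()` survives essentially unchanged as `html_to_text()` in aider/scrape.py. A small self-contained illustration of what that pipeline does (the sample HTML is made up):

```python
# Demonstrates the text cleanup shared by the deleted selenium path and the
# new html_to_text(): drop <script>/<style> tags, then normalize whitespace.
from bs4 import BeautifulSoup

page_source = """<html><body>
<h1>Title</h1>
<script>var tracking = true;</script>
<p>Some   text,   oddly   spaced.</p>
</body></html>"""

soup = BeautifulSoup(page_source, "html.parser")
for script in soup(["script", "style"]):
    script.extract()  # removes the tag and its contents from the tree

text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
# split("  ") breaks on runs of two or more spaces; single spaces survive
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
text = "\n".join(chunk for chunk in chunks if chunk)
print(text)  # "Title", then the <p> text split at the multi-space runs
```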
diff --git a/docs/install.md b/docs/install.md
index 44e03c699..bb892cbb6 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -77,6 +77,28 @@ Put a line in it like this to specify your api key:
 openai-api-key: sk-...
 ```
 
+## Enable Playwright
+
+Aider supports adding web pages to the chat with the `/web <url>` command.
+When you add a url to the chat, aider fetches the page and scrapes its
+content.
+
+By default, aider uses the `httpx` library to scrape web pages, but this only
+works on a subset of web pages.
+Some sites explicitly block requests from tools like httpx.
+Others rely heavily on javascript to render the page content,
+which isn't possible using only httpx.
+
+Aider works best with all web pages if you install
+Playwright's chromium browser and its dependencies:
+
+```
+playwright install --with-deps chromium
+```
+
+See the [Playwright for Python documentation](https://playwright.dev/python/docs/browsers#install-system-dependencies)
+for additional information.
+
 ## Enable voice coding (optional)
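Once chromium is installed, a quick smoke test with the same sync API the scraper uses confirms the install works (a sketch; the URL is just an example):

```python
# Smoke test for the Playwright chromium install.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()  # raises with install hints if chromium is missing
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())  # "Example Domain"
    browser.close()
```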
diff --git a/playw.py b/playw.py
deleted file mode 100644
index ce3521c22..000000000
--- a/playw.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from playwright.sync_api import sync_playwright
-import sys
-from playwright.__main__ import main as playwright_install
-
-def main(url):
-    # Check if Chromium is installed, if not, install it
-    with sync_playwright() as p:
-        p.chromium.launch()
-
-    with sync_playwright() as p:
-        browser = p.chromium.launch(user_agent='Aider v0.24.0-dev')
-        page = browser.new_page()
-        page.goto(url)
-        #page.wait_for_load_state('networkidle')
-        content = page.content()
-        print(content)
-        browser.close()
-
-if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python playw.py <URL>")
-        sys.exit(1)
-    main(sys.argv[1])
diff --git a/requirements.in b/requirements.in
index 1e9e9bc75..200fc442e 100644
--- a/requirements.in
+++ b/requirements.in
@@ -22,6 +22,4 @@ bs4
 PyYAML
 Pillow
 diff-match-patch
-webdriver_manager
-selenium
-chromedriver-autoinstaller
+playwright
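The requirements.txt changes below are consistent with the lockfile being regenerated from requirements.in by pip-compile (the `# via` annotations are its convention), presumably with something like:

```
pip-compile requirements.in
```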
diff --git a/requirements.txt b/requirements.txt
index 16546e483..52545514f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,9 +13,7 @@ anyio==4.2.0
 attrs==23.2.0
     # via
     #   jsonschema
-    #   outcome
     #   referencing
-    #   trio
 backoff==2.2.1
     # via -r requirements.in
 beautifulsoup4==4.12.3
@@ -27,15 +25,12 @@ certifi==2023.11.17
     #   httpcore
     #   httpx
     #   requests
-    #   selenium
 cffi==1.16.0
     # via
     #   sounddevice
     #   soundfile
 charset-normalizer==3.3.2
     # via requests
-chromedriver-autoinstaller==0.3.1
-    # via -r requirements.in
 configargparse==1.7
     # via -r requirements.in
 diff-match-patch==20230430
@@ -48,12 +43,12 @@ gitdb==4.0.11
     # via gitpython
 gitpython==3.1.40
     # via -r requirements.in
+greenlet==3.0.3
+    # via playwright
 grep-ast==0.2.4
     # via -r requirements.in
 h11==0.14.0
-    # via
-    #   httpcore
-    #   wsproto
+    # via httpcore
 httpcore==1.0.2
     # via httpx
 httpx==0.26.0
@@ -63,7 +58,6 @@ idna==3.6
     #   anyio
     #   httpx
     #   requests
-    #   trio
 jsonschema==4.20.0
     # via -r requirements.in
 jsonschema-specifications==2023.12.1
@@ -80,18 +74,16 @@ numpy==1.26.3
     #   scipy
 openai==1.6.1
     # via -r requirements.in
-outcome==1.3.0.post0
-    # via trio
 packaging==23.2
-    # via
-    #   -r requirements.in
-    #   webdriver-manager
+    # via -r requirements.in
 pathspec==0.12.1
     # via
     #   -r requirements.in
     #   grep-ast
 pillow==10.2.0
     # via -r requirements.in
+playwright==1.41.2
+    # via -r requirements.in
 prompt-toolkit==3.0.43
     # via -r requirements.in
 pycparser==2.21
@@ -100,12 +92,10 @@ pydantic==2.5.3
     # via openai
 pydantic-core==2.14.6
     # via pydantic
+pyee==11.0.1
+    # via playwright
 pygments==2.17.2
     # via rich
-pysocks==1.7.1
-    # via urllib3
-python-dotenv==1.0.1
-    # via webdriver-manager
 pyyaml==6.0.1
     # via -r requirements.in
 referencing==0.32.0
@@ -115,9 +105,7 @@ referencing==0.32.0
 regex==2023.12.25
     # via tiktoken
 requests==2.31.0
-    # via
-    #   tiktoken
-    #   webdriver-manager
+    # via tiktoken
 rich==13.7.0
     # via -r requirements.in
 rpds-py==0.16.2
@@ -126,8 +114,6 @@ rpds-py==0.16.2
     #   referencing
 scipy==1.11.4
     # via -r requirements.in
-selenium==4.17.2
-    # via -r requirements.in
 smmap==5.0.1
     # via gitdb
 sniffio==1.3.0
@@ -135,9 +121,6 @@ sniffio==1.3.0
     #   anyio
     #   httpx
     #   openai
-    #   trio
-sortedcontainers==2.4.0
-    # via trio
 sounddevice==0.4.6
     # via -r requirements.in
 soundfile==0.12.1
@@ -152,26 +135,13 @@ tree-sitter==0.20.4
     # via tree-sitter-languages
 tree-sitter-languages==1.9.1
     # via grep-ast
-trio==0.24.0
-    # via
-    #   selenium
-    #   trio-websocket
-trio-websocket==0.11.1
-    # via selenium
 typing-extensions==4.9.0
     # via
     #   openai
     #   pydantic
     #   pydantic-core
-    #   selenium
-urllib3[socks]==2.1.0
-    # via
-    #   requests
-    #   selenium
-    #   urllib3
+    #   pyee
+urllib3==2.1.0
+    # via requests
 wcwidth==0.2.12
     # via prompt-toolkit
-webdriver-manager==4.0.1
-    # via -r requirements.in
-wsproto==1.2.0
-    # via trio-websocket