Roughly working scraper

This commit is contained in:
Paul Gauthier 2024-02-08 11:44:55 -08:00
parent 9bf3a6e0c6
commit 5b78d929a6
6 changed files with 77 additions and 208 deletions

View file

@@ -1,19 +1,22 @@
#!/usr/bin/env python
import sys
from aider import __version__
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
aider_user_agent= f'Aider/{__version__} https://aider.chat'
from aider import __version__
PLAYWRIGHT_INFO = '''
aider_user_agent = f"Aider/{__version__} +https://aider.chat"
PLAYWRIGHT_INFO = """
For better web scraping, install Playwright chromium:
playwright install --with-deps chromium
See https://aider.chat/docs/install.html#enable-playwright for more info.
'''
"""
class Scraper:
playwright_available = None
@@ -29,15 +32,16 @@ class Scraper:
try:
browser = p.chromium.launch()
except Exception as e:
print(repr(e))
self.playwright_available = False
self.print_error(e)
return
page = browser.new_page()
user_agent = page.evaluate("navigator.userAgent")
user_agent = user_agent.replace('Headless','')
user_agent = user_agent.replace('headless', '')
user_agent += ' ' + aider_user_agent
user_agent = user_agent.replace("Headless", "")
user_agent = user_agent.replace("headless", "")
user_agent += " " + aider_user_agent
page = browser.new_page(user_agent=user_agent)
page.goto(url)
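The hunk above shows the trick this commit relies on: headless chromium reports a user agent containing "Headless", which some sites use to reject scrapers, so the code reads the browser's real user agent, strips that marker, and appends aider's own identifier. A minimal self-contained sketch of the same flow, assuming chromium has been installed via `playwright install chromium`; the function name and version string are illustrative:

```
from playwright.sync_api import sync_playwright

def fetch_with_clean_user_agent(url, extra_agent="Aider/0.24.0 +https://aider.chat"):
    with sync_playwright() as p:
        browser = p.chromium.launch()
        # Throwaway page, used only to read the browser's real user agent.
        page = browser.new_page()
        user_agent = page.evaluate("navigator.userAgent")
        # Strip the "Headless" marker that some sites reject, then append
        # aider's identifier so servers can see who is fetching the page.
        user_agent = user_agent.replace("Headless", "").replace("headless", "")
        user_agent += " " + extra_agent
        # Open a fresh page that sends the cleaned-up user agent.
        page = browser.new_page(user_agent=user_agent)
        page.goto(url)
        content = page.content()
        browser.close()
    return content
```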
@@ -49,26 +53,25 @@ class Scraper:
def try_playwright(self):
with sync_playwright() as p:
try:
browser = p.chromium.launch()
p.chromium.launch()
self.playwright_available = True
except Exception as e:
except Exception:
self.playwright_available = False
self.print_error(PLAYWRIGHT_INFO)
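This hunk simplifies the probe: only the success or failure of the launch matters, so the browser handle is discarded and the raw exception is replaced with a one-time install hint. A sketch of the probe-and-cache pattern; the early-return guard is an assumption, since the hunk only shows the probe itself:

```
from playwright.sync_api import sync_playwright

class Scraper:
    # Tri-state: None means "not probed yet", True/False is the cached answer.
    playwright_available = None

    def try_playwright(self):
        if self.playwright_available is not None:
            return  # probe at most once per process (assumed guard)
        with sync_playwright() as p:
            try:
                p.chromium.launch()  # raises if chromium is not installed
                self.playwright_available = True
            except Exception:
                self.playwright_available = False
```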
def scrape_with_httpx(self, url):
import httpx
headers = {
'User-Agent': aider_user_agent
}
headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
try:
with httpx.Client(headers=headers) as client:
response = client.get(url)
response.raise_for_status()
return response.text
except httpx.HTTPError as http_err:
self.print_error(f'HTTP error occurred: {http_err}')
self.print_error(f"HTTP error occurred: {http_err}")
except Exception as err:
self.print_error(f'An error occurred: {err}')
self.print_error(f"An error occurred: {err}")
return None
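The httpx fallback identifies itself with a Mozilla-style user agent that still embeds aider's identifier, since bare library user agents are a common trigger for blocking. A standalone example of the same request pattern; the URL and version string are illustrative:

```
import httpx

headers = {"User-Agent": "Mozilla/5.0 (Aider/0.24.0 +https://aider.chat)"}
try:
    with httpx.Client(headers=headers) as client:
        response = client.get("https://example.com")
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        print(response.text[:200])
except httpx.HTTPError as err:
    print(f"HTTP error occurred: {err}")
```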
def scrape(self, url):
@@ -80,13 +83,35 @@ class Scraper:
else:
content = self.scrape_with_httpx(url)
content = html_to_text(content)
return content
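With both backends in place, `scrape()` picks the best one available and always returns plain text. Hypothetical usage:

```
scraper = Scraper()
# Uses Playwright's chromium when available, falls back to httpx otherwise.
text = scraper.scrape("https://aider.chat")
print(text)
```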
# Adapted from AutoGPT, MIT License
#
# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
def html_to_text(page_source: str) -> str:
soup = BeautifulSoup(page_source, "html.parser")
for script in soup(["script", "style"]):
script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
text = "\n".join(chunk for chunk in chunks if chunk)
return text
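The helper keeps only the rendered text: `script` and `style` tags are dropped, every line is trimmed, and empty chunks are discarded. A small illustration, assuming the `html_to_text` above:

```
html = """<html><body>
<h1>Title</h1>
<script>var x = 1;</script>
<p>Some paragraph text.</p>
</body></html>"""

print(html_to_text(html))
# Title
# Some paragraph text.
```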
def main(url):
scraper = Scraper()
content = scraper.scrape(url)
print(content)
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python playw.py <URL>")

View file

@@ -1,35 +1,13 @@
import os
import tempfile
from pathlib import Path
from typing import Type
import git
from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeDriverService
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.options import ArgOptions as BrowserOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeDriverService
from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as GeckoDriverService
from selenium.webdriver.firefox.webdriver import WebDriver as FirefoxDriver
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.safari.options import Options as SafariOptions
from selenium.webdriver.safari.webdriver import WebDriver as SafariDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
from aider.dump import dump # noqa: F401
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
class IgnorantTemporaryDirectory:
def __init__(self):
@@ -139,104 +117,3 @@ def is_gpt4_with_openai_base_url(model_name, client):
if client is None or not hasattr(client, "base_url"):
return False
return model_name.startswith("gpt-4") and "api.openai.com" in client.base_url.host
# Taken from AutoGPT, MIT License
def open_page_in_browser(
url: str,
selenium_web_browser="chrome",
selenium_headless=True,
platform="linux",
user_agent="Aider CLI 0.23.0",
) -> WebDriver:
"""Open a browser window and load a web page using Selenium
Params:
url (str): The URL of the page to load
config (Config): The applicable application configuration
Returns:
driver (WebDriver): A driver object representing the browser window to scrape
"""
options_available: dict[str, Type[BrowserOptions]] = {
"chrome": ChromeOptions,
"edge": EdgeOptions,
"firefox": FirefoxOptions,
"safari": SafariOptions,
}
options: BrowserOptions = options_available[selenium_web_browser]()
options.add_argument(f"user-agent={user_agent}")
if selenium_web_browser == "firefox":
if selenium_headless:
options.headless = True
options.add_argument("--disable-gpu")
driver = FirefoxDriver(
service=GeckoDriverService(GeckoDriverManager().install()), options=options
)
elif selenium_web_browser == "edge":
driver = EdgeDriver(
service=EdgeDriverService(EdgeDriverManager().install()), options=options
)
elif selenium_web_browser == "safari":
# Requires a bit more setup on the user's end.
# See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari # noqa: E501
driver = SafariDriver(options=options)
else:
if platform == "linux" or platform == "linux2":
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--remote-debugging-port=9222")
options.add_argument("--no-sandbox")
if selenium_headless:
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
chromium_driver_path = Path("/usr/bin/chromedriver")
driver = ChromeDriver(
service=(
ChromeDriverService(str(chromium_driver_path))
if chromium_driver_path.exists()
else ChromeDriverService(ChromeDriverManager().install())
),
options=options,
)
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
return driver
# Taken from AutoGPT, MIT License
def scrape_text_with_selenium(driver: WebDriver) -> str:
"""Scrape text from a browser window using selenium
Args:
driver (WebDriver): A driver object representing the browser window to scrape
Returns:
str: the text scraped from the website
"""
# Get the HTML content directly from the browser's DOM
page_source = driver.execute_script("return document.body.outerHTML;")
soup = BeautifulSoup(page_source, "html.parser")
for script in soup(["script", "style"]):
script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
text = "\n".join(chunk for chunk in chunks if chunk)
return text
def scrape(url: str):
driver = open_page_in_browser(url)
text = scrape_text_with_selenium(driver)
driver.quit()
return text

View file

@@ -77,6 +77,28 @@ Put a line in it like this to specify your api key:
openai-api-key: sk-...
```
## Enable Playwright
Aider supports adding web pages to the chat with the `/web <url>` command.
When you add a URL to the chat, aider fetches the page and scrapes its
content.
By default, aider uses the `httpx` library to scrape web pages, but this
only works on a subset of pages.
Some sites explicitly block requests from HTTP clients like httpx.
Others rely heavily on JavaScript to render the page content,
which httpx cannot execute.
Aider can scrape nearly any web page if you install
Playwright's chromium browser and its dependencies:
```
playwright install --with-deps chromium
```
See the
[Playwright for Python documentation](https://playwright.dev/python/docs/browsers#install-system-dependencies)
for additional information.
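After installing, you can confirm that Playwright's chromium is usable with a quick check like this; it mirrors the probe aider runs before scraping:

```
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()  # raises if chromium is not installed
    browser.close()
print("Playwright chromium is ready")
```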
## Enable voice coding (optional)

View file

@@ -1,23 +0,0 @@
from playwright.sync_api import sync_playwright
import sys
from playwright.__main__ import main as playwright_install
def main(url):
# Check if Chromium is installed, if not, install it
with sync_playwright() as p:
p.chromium.launch()
with sync_playwright() as p:
browser = p.chromium.launch(user_agent='Aider v0.24.0-dev')
page = browser.new_page()
page.goto(url)
#page.wait_for_load_state('networkidle')
content = page.content()
print(content)
browser.close()
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python playw.py <URL>")
sys.exit(1)
main(sys.argv[1])

View file

@@ -22,6 +22,4 @@ bs4
PyYAML
Pillow
diff-match-patch
webdriver_manager
selenium
chromedriver-autoinstaller
playwright

View file

@@ -13,9 +13,7 @@ anyio==4.2.0
attrs==23.2.0
# via
# jsonschema
# outcome
# referencing
# trio
backoff==2.2.1
# via -r requirements.in
beautifulsoup4==4.12.3
@@ -27,15 +25,12 @@ certifi==2023.11.17
# httpcore
# httpx
# requests
# selenium
cffi==1.16.0
# via
# sounddevice
# soundfile
charset-normalizer==3.3.2
# via requests
chromedriver-autoinstaller==0.3.1
# via -r requirements.in
configargparse==1.7
# via -r requirements.in
diff-match-patch==20230430
@@ -48,12 +43,12 @@ gitdb==4.0.11
# via gitpython
gitpython==3.1.40
# via -r requirements.in
greenlet==3.0.3
# via playwright
grep-ast==0.2.4
# via -r requirements.in
h11==0.14.0
# via
# httpcore
# wsproto
# via httpcore
httpcore==1.0.2
# via httpx
httpx==0.26.0
@@ -63,7 +58,6 @@ idna==3.6
# anyio
# httpx
# requests
# trio
jsonschema==4.20.0
# via -r requirements.in
jsonschema-specifications==2023.12.1
@@ -80,18 +74,16 @@ numpy==1.26.3
# scipy
openai==1.6.1
# via -r requirements.in
outcome==1.3.0.post0
# via trio
packaging==23.2
# via
# -r requirements.in
# webdriver-manager
# via -r requirements.in
pathspec==0.12.1
# via
# -r requirements.in
# grep-ast
pillow==10.2.0
# via -r requirements.in
playwright==1.41.2
# via -r requirements.in
prompt-toolkit==3.0.43
# via -r requirements.in
pycparser==2.21
@@ -100,12 +92,10 @@ pydantic==2.5.3
# via openai
pydantic-core==2.14.6
# via pydantic
pyee==11.0.1
# via playwright
pygments==2.17.2
# via rich
pysocks==1.7.1
# via urllib3
python-dotenv==1.0.1
# via webdriver-manager
pyyaml==6.0.1
# via -r requirements.in
referencing==0.32.0
@@ -115,9 +105,7 @@ referencing==0.32.0
regex==2023.12.25
# via tiktoken
requests==2.31.0
# via
# tiktoken
# webdriver-manager
# via tiktoken
rich==13.7.0
# via -r requirements.in
rpds-py==0.16.2
@@ -126,8 +114,6 @@ rpds-py==0.16.2
# referencing
scipy==1.11.4
# via -r requirements.in
selenium==4.17.2
# via -r requirements.in
smmap==5.0.1
# via gitdb
sniffio==1.3.0
@@ -135,9 +121,6 @@ sniffio==1.3.0
# anyio
# httpx
# openai
# trio
sortedcontainers==2.4.0
# via trio
sounddevice==0.4.6
# via -r requirements.in
soundfile==0.12.1
@@ -152,26 +135,13 @@ tree-sitter==0.20.4
# via tree-sitter-languages
tree-sitter-languages==1.9.1
# via grep-ast
trio==0.24.0
# via
# selenium
# trio-websocket
trio-websocket==0.11.1
# via selenium
typing-extensions==4.9.0
# via
# openai
# pydantic
# pydantic-core
# selenium
urllib3[socks]==2.1.0
# via
# requests
# selenium
# urllib3
# pyee
urllib3==2.1.0
# via requests
wcwidth==0.2.12
# via prompt-toolkit
webdriver-manager==4.0.1
# via -r requirements.in
wsproto==1.2.0
# via trio-websocket