Roughly working scraper

commit 5b78d929a6 (parent 9bf3a6e0c6)
6 changed files with 77 additions and 208 deletions
aider/scrape.py
@@ -1,19 +1,22 @@
 #!/usr/bin/env python

 import sys

-from aider import __version__
+from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright

-aider_user_agent= f'Aider/{__version__} https://aider.chat'
+from aider import __version__

-PLAYWRIGHT_INFO = '''
+aider_user_agent = f"Aider/{__version__} +https://aider.chat"
+
+PLAYWRIGHT_INFO = """
 For better web scraping, install Playwright chromium:

     playwright install --with-deps chromium

 See https://aider.chat/docs/install.html#enable-playwright for more info.
-'''
+"""


 class Scraper:
     playwright_available = None
@@ -29,15 +32,16 @@ class Scraper:
             try:
                 browser = p.chromium.launch()
             except Exception as e:
-                print(repr(e))
+                self.playwright_available = False
+                self.print_error(e)
                 return

             page = browser.new_page()

             user_agent = page.evaluate("navigator.userAgent")
-            user_agent = user_agent.replace('Headless','')
-            user_agent = user_agent.replace('headless', '')
-            user_agent += ' ' + aider_user_agent
+            user_agent = user_agent.replace("Headless", "")
+            user_agent = user_agent.replace("headless", "")
+            user_agent += " " + aider_user_agent

             page = browser.new_page(user_agent=user_agent)
             page.goto(url)
@@ -49,26 +53,25 @@ class Scraper:
     def try_playwright(self):
         with sync_playwright() as p:
             try:
-                browser = p.chromium.launch()
+                p.chromium.launch()
                 self.playwright_available = True
-            except Exception as e:
+            except Exception:
                 self.playwright_available = False
                 self.print_error(PLAYWRIGHT_INFO)

     def scrape_with_httpx(self, url):
         import httpx

-        headers = {
-            'User-Agent': aider_user_agent
-        }
+        headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
         try:
             with httpx.Client(headers=headers) as client:
                 response = client.get(url)
                 response.raise_for_status()
                 return response.text
         except httpx.HTTPError as http_err:
-            self.print_error(f'HTTP error occurred: {http_err}')
+            self.print_error(f"HTTP error occurred: {http_err}")
         except Exception as err:
-            self.print_error(f'An error occurred: {err}')
+            self.print_error(f"An error occurred: {err}")
         return None

     def scrape(self, url):
@@ -80,13 +83,35 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)

+        content = html_to_text(content)
+
         return content


+# Adapted from AutoGPT, MIT License
+#
+# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
+def html_to_text(page_source: str) -> str:
+    soup = BeautifulSoup(page_source, "html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = "\n".join(chunk for chunk in chunks if chunk)
+    return text
+
+
 def main(url):
     scraper = Scraper()
     content = scraper.scrape(url)
     print(content)


 if __name__ == "__main__":
     if len(sys.argv) < 2:
         print("Usage: python playw.py <URL>")
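The key addition is the `html_to_text` helper, which turns whatever HTML either backend returns into chat-friendly plain text by dropping `script`/`style` tags and collapsing layout whitespace. A minimal standalone demo of the same logic (the sample HTML is invented for illustration):

```python
from bs4 import BeautifulSoup


def html_to_text(page_source: str) -> str:
    # Same logic as the helper added to aider/scrape.py above:
    # drop script/style tags, then collapse layout whitespace.
    soup = BeautifulSoup(page_source, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return "\n".join(chunk for chunk in chunks if chunk)


html = (
    "<html><head><style>p {color: red}</style></head>"
    "<body><p>Hello,  world</p><script>var x = 1;</script></body></html>"
)
print(html_to_text(html))  # prints "Hello," and "world" on separate lines
```

Note the split on two spaces: single-spaced phrases stay intact, while runs of layout whitespace become line breaks.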
aider/utils.py
@@ -1,35 +1,13 @@
 import os
 import tempfile
 from pathlib import Path
-from typing import Type

 import git
-from bs4 import BeautifulSoup
-from selenium.common.exceptions import WebDriverException
-from selenium.webdriver.chrome.options import Options as ChromeOptions
-from selenium.webdriver.chrome.service import Service as ChromeDriverService
-from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.common.options import ArgOptions as BrowserOptions
-from selenium.webdriver.edge.options import Options as EdgeOptions
-from selenium.webdriver.edge.service import Service as EdgeDriverService
-from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
-from selenium.webdriver.firefox.options import Options as FirefoxOptions
-from selenium.webdriver.firefox.service import Service as GeckoDriverService
-from selenium.webdriver.firefox.webdriver import WebDriver as FirefoxDriver
-from selenium.webdriver.remote.webdriver import WebDriver
-from selenium.webdriver.safari.options import Options as SafariOptions
-from selenium.webdriver.safari.webdriver import WebDriver as SafariDriver
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.wait import WebDriverWait
-from webdriver_manager.chrome import ChromeDriverManager
-from webdriver_manager.firefox import GeckoDriverManager
-from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
-
-IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}

 from aider.dump import dump  # noqa: F401

+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
+

 class IgnorantTemporaryDirectory:
     def __init__(self):
@@ -139,104 +117,3 @@ def is_gpt4_with_openai_base_url(model_name, client):
     if client is None or not hasattr(client, "base_url"):
         return False
     return model_name.startswith("gpt-4") and "api.openai.com" in client.base_url.host
-
-
-# Taken from AutoGPT, MIT License
-def open_page_in_browser(
-    url: str,
-    selenium_web_browser="chrome",
-    selenium_headless=True,
-    platform="linux",
-    user_agent="Aider CLI 0.23.0",
-) -> WebDriver:
-    """Open a browser window and load a web page using Selenium
-
-    Params:
-        url (str): The URL of the page to load
-        config (Config): The applicable application configuration
-
-    Returns:
-        driver (WebDriver): A driver object representing the browser window to scrape
-    """
-    options_available: dict[str, Type[BrowserOptions]] = {
-        "chrome": ChromeOptions,
-        "edge": EdgeOptions,
-        "firefox": FirefoxOptions,
-        "safari": SafariOptions,
-    }
-
-    options: BrowserOptions = options_available[selenium_web_browser]()
-    options.add_argument(f"user-agent={user_agent}")
-
-    if selenium_web_browser == "firefox":
-        if selenium_headless:
-            options.headless = True
-            options.add_argument("--disable-gpu")
-        driver = FirefoxDriver(
-            service=GeckoDriverService(GeckoDriverManager().install()), options=options
-        )
-    elif selenium_web_browser == "edge":
-        driver = EdgeDriver(
-            service=EdgeDriverService(EdgeDriverManager().install()), options=options
-        )
-    elif selenium_web_browser == "safari":
-        # Requires a bit more setup on the users end.
-        # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari  # noqa: E501
-        driver = SafariDriver(options=options)
-    else:
-        if platform == "linux" or platform == "linux2":
-            options.add_argument("--disable-dev-shm-usage")
-            options.add_argument("--remote-debugging-port=9222")
-
-        options.add_argument("--no-sandbox")
-        if selenium_headless:
-            options.add_argument("--headless=new")
-            options.add_argument("--disable-gpu")
-
-        chromium_driver_path = Path("/usr/bin/chromedriver")
-
-        driver = ChromeDriver(
-            service=(
-                ChromeDriverService(str(chromium_driver_path))
-                if chromium_driver_path.exists()
-                else ChromeDriverService(ChromeDriverManager().install())
-            ),
-            options=options,
-        )
-    driver.get(url)
-
-    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-
-    return driver
-
-
-# Taken from AutoGPT, MIT License
-def scrape_text_with_selenium(driver: WebDriver) -> str:
-    """Scrape text from a browser window using selenium
-
-    Args:
-        driver (WebDriver): A driver object representing the browser window to scrape
-
-    Returns:
-        str: the text scraped from the website
-    """
-
-    # Get the HTML content directly from the browser's DOM
-    page_source = driver.execute_script("return document.body.outerHTML;")
-    soup = BeautifulSoup(page_source, "html.parser")
-
-    for script in soup(["script", "style"]):
-        script.extract()
-
-    text = soup.get_text()
-    lines = (line.strip() for line in text.splitlines())
-    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    text = "\n".join(chunk for chunk in chunks if chunk)
-    return text
-
-
-def scrape(url: str):
-    driver = open_page_in_browser(url)
-    text = scrape_text_with_selenium(driver)
-    driver.quit()
-    return text
docs/install.md
@@ -77,6 +77,28 @@ Put a line in it like this to specify your api key:
 openai-api-key: sk-...
 ```

+## Enable Playwright
+
+Aider supports adding web pages to the chat with the `/web <url>` command.
+When you add a url to the chat, aider fetches the page and scrapes its
+content.
+
+By default, aider uses the `httpx` library to scrape web pages, but this only
+works on a subset of web pages.
+Some sites explicitly block requests from tools like httpx.
+Others rely heavily on javascript to render the page content,
+which isn't possible using only httpx.
+
+Aider works best with all web pages if you install
+Playwright's chromium browser and its dependencies:
+
+```
+playwright install --with-deps chromium
+```
+
+See the
+[Playwright for Python documentation](https://playwright.dev/python/docs/browsers#install-system-dependencies)
+for additional information.
+
+
 ## Enable voice coding (optional)
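The fallback behavior these docs describe can be checked outside of aider. The probe below is a minimal sketch of what the new `Scraper.try_playwright` does; it only succeeds after `playwright install --with-deps chromium` has been run:

```python
from playwright.sync_api import sync_playwright


def chromium_available() -> bool:
    # Launching chromium raises if the browser binaries are missing;
    # this mirrors the check aider uses to decide between Playwright
    # and the httpx fallback.
    with sync_playwright() as p:
        try:
            browser = p.chromium.launch()
            browser.close()
            return True
        except Exception:
            return False


if __name__ == "__main__":
    print("playwright chromium available:", chromium_available())
```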
playw.py
@@ -1,23 +0,0 @@
-from playwright.sync_api import sync_playwright
-import sys
-from playwright.__main__ import main as playwright_install
-
-def main(url):
-    # Check if Chromium is installed, if not, install it
-    with sync_playwright() as p:
-        p.chromium.launch()
-
-    with sync_playwright() as p:
-        browser = p.chromium.launch(user_agent='Aider v0.24.0-dev')
-        page = browser.new_page()
-        page.goto(url)
-        #page.wait_for_load_state('networkidle')
-        content = page.content()
-        print(content)
-        browser.close()
-
-if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python playw.py <URL>")
-        sys.exit(1)
-    main(sys.argv[1])
requirements.in
@@ -22,6 +22,4 @@ bs4
 PyYAML
 Pillow
 diff-match-patch
-webdriver_manager
-selenium
-chromedriver-autoinstaller
+playwright
requirements.txt
@@ -13,9 +13,7 @@ anyio==4.2.0
 attrs==23.2.0
     # via
     #   jsonschema
-    #   outcome
     #   referencing
-    #   trio
 backoff==2.2.1
     # via -r requirements.in
 beautifulsoup4==4.12.3
@@ -27,15 +25,12 @@ certifi==2023.11.17
     #   httpcore
     #   httpx
     #   requests
-    #   selenium
 cffi==1.16.0
     # via
     #   sounddevice
     #   soundfile
 charset-normalizer==3.3.2
     # via requests
-chromedriver-autoinstaller==0.3.1
-    # via -r requirements.in
 configargparse==1.7
     # via -r requirements.in
 diff-match-patch==20230430
@@ -48,12 +43,12 @@ gitdb==4.0.11
     # via gitpython
 gitpython==3.1.40
     # via -r requirements.in
+greenlet==3.0.3
+    # via playwright
 grep-ast==0.2.4
     # via -r requirements.in
 h11==0.14.0
-    # via
-    #   httpcore
-    #   wsproto
+    # via httpcore
 httpcore==1.0.2
     # via httpx
 httpx==0.26.0
@@ -63,7 +58,6 @@ idna==3.6
     #   anyio
     #   httpx
     #   requests
-    #   trio
 jsonschema==4.20.0
     # via -r requirements.in
 jsonschema-specifications==2023.12.1
@@ -80,18 +74,16 @@ numpy==1.26.3
     #   scipy
 openai==1.6.1
     # via -r requirements.in
-outcome==1.3.0.post0
-    # via trio
 packaging==23.2
-    # via
-    #   -r requirements.in
-    #   webdriver-manager
+    # via -r requirements.in
 pathspec==0.12.1
     # via
     #   -r requirements.in
     #   grep-ast
 pillow==10.2.0
     # via -r requirements.in
+playwright==1.41.2
+    # via -r requirements.in
 prompt-toolkit==3.0.43
     # via -r requirements.in
 pycparser==2.21
@@ -100,12 +92,10 @@ pydantic==2.5.3
     # via openai
 pydantic-core==2.14.6
     # via pydantic
+pyee==11.0.1
+    # via playwright
 pygments==2.17.2
     # via rich
-pysocks==1.7.1
-    # via urllib3
-python-dotenv==1.0.1
-    # via webdriver-manager
 pyyaml==6.0.1
     # via -r requirements.in
 referencing==0.32.0
@@ -115,9 +105,7 @@ referencing==0.32.0
 regex==2023.12.25
     # via tiktoken
 requests==2.31.0
-    # via
-    #   tiktoken
-    #   webdriver-manager
+    # via tiktoken
 rich==13.7.0
     # via -r requirements.in
 rpds-py==0.16.2
@@ -126,8 +114,6 @@ rpds-py==0.16.2
     #   referencing
 scipy==1.11.4
     # via -r requirements.in
-selenium==4.17.2
-    # via -r requirements.in
 smmap==5.0.1
     # via gitdb
 sniffio==1.3.0
@@ -135,9 +121,6 @@ sniffio==1.3.0
     #   anyio
     #   httpx
     #   openai
-    #   trio
-sortedcontainers==2.4.0
-    # via trio
 sounddevice==0.4.6
     # via -r requirements.in
 soundfile==0.12.1
@@ -152,26 +135,13 @@ tree-sitter==0.20.4
     # via tree-sitter-languages
 tree-sitter-languages==1.9.1
     # via grep-ast
-trio==0.24.0
-    # via
-    #   selenium
-    #   trio-websocket
-trio-websocket==0.11.1
-    # via selenium
 typing-extensions==4.9.0
     # via
     #   openai
     #   pydantic
     #   pydantic-core
-    #   selenium
-urllib3[socks]==2.1.0
-    # via
-    #   requests
-    #   selenium
-    #   urllib3
+    #   pyee
+urllib3==2.1.0
+    # via requests
 wcwidth==0.2.12
     # via prompt-toolkit
-webdriver-manager==4.0.1
-    # via -r requirements.in
-wsproto==1.2.0
-    # via trio-websocket