mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-05 12:14:59 +00:00
Roughly working scraper
This commit is contained in:
parent
9bf3a6e0c6
commit
5b78d929a6
6 changed files with 77 additions and 208 deletions
|
@ -1,19 +1,22 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
from aider import __version__
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
aider_user_agent= f'Aider/{__version__} https://aider.chat'
|
||||
from aider import __version__
|
||||
|
||||
PLAYWRIGHT_INFO = '''
|
||||
aider_user_agent = f"Aider/{__version__} +https://aider.chat"
|
||||
|
||||
PLAYWRIGHT_INFO = """
|
||||
For better web scraping, install Playwright chromium:
|
||||
|
||||
playwright install --with-deps chromium
|
||||
|
||||
See https://aider.chat/docs/install.html#enable-playwright for more info.
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
class Scraper:
|
||||
playwright_available = None
|
||||
|
@ -29,15 +32,16 @@ class Scraper:
|
|||
try:
|
||||
browser = p.chromium.launch()
|
||||
except Exception as e:
|
||||
print(repr(e))
|
||||
self.playwright_available = False
|
||||
self.print_error(e)
|
||||
return
|
||||
|
||||
page = browser.new_page()
|
||||
|
||||
user_agent = page.evaluate("navigator.userAgent")
|
||||
user_agent = user_agent.replace('Headless','')
|
||||
user_agent = user_agent.replace('headless', '')
|
||||
user_agent += ' ' + aider_user_agent
|
||||
user_agent = user_agent.replace("Headless", "")
|
||||
user_agent = user_agent.replace("headless", "")
|
||||
user_agent += " " + aider_user_agent
|
||||
|
||||
page = browser.new_page(user_agent=user_agent)
|
||||
page.goto(url)
|
||||
|
@ -49,26 +53,25 @@ class Scraper:
|
|||
def try_playwright(self):
|
||||
with sync_playwright() as p:
|
||||
try:
|
||||
browser = p.chromium.launch()
|
||||
p.chromium.launch()
|
||||
self.playwright_available = True
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
self.playwright_available = False
|
||||
self.print_error(PLAYWRIGHT_INFO)
|
||||
|
||||
def scrape_with_httpx(self, url):
|
||||
import httpx
|
||||
headers = {
|
||||
'User-Agent': aider_user_agent
|
||||
}
|
||||
|
||||
headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
|
||||
try:
|
||||
with httpx.Client(headers=headers) as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
except httpx.HTTPError as http_err:
|
||||
self.print_error(f'HTTP error occurred: {http_err}')
|
||||
self.print_error(f"HTTP error occurred: {http_err}")
|
||||
except Exception as err:
|
||||
self.print_error(f'An error occurred: {err}')
|
||||
self.print_error(f"An error occurred: {err}")
|
||||
return None
|
||||
|
||||
def scrape(self, url):
|
||||
|
@ -80,13 +83,35 @@ class Scraper:
|
|||
else:
|
||||
content = self.scrape_with_httpx(url)
|
||||
|
||||
content = html_to_text(content)
|
||||
|
||||
return content
|
||||
|
||||
|
||||
# Adapted from AutoGPT, MIT License
|
||||
#
|
||||
# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
|
||||
|
||||
|
||||
def html_to_text(page_source: str) -> str:
|
||||
soup = BeautifulSoup(page_source, "html.parser")
|
||||
|
||||
for script in soup(["script", "style"]):
|
||||
script.extract()
|
||||
|
||||
text = soup.get_text()
|
||||
lines = (line.strip() for line in text.splitlines())
|
||||
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
||||
text = "\n".join(chunk for chunk in chunks if chunk)
|
||||
return text
|
||||
|
||||
|
||||
def main(url):
|
||||
scraper = Scraper()
|
||||
content = scraper.scrape(url)
|
||||
print(content)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python playw.py <URL>")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue