Roughly working scraper

2025-06-05 12:14:59 +00:00 · 2024-02-08 11:44:55 -08:00 · 2024-02-08 11:44:55 -08:00 · 5b78d929a6
commit 5b78d929a6
parent 9bf3a6e0c6
6 changed files with 77 additions and 208 deletions
--- a/aider/scrape.py
+++ b/aider/scrape.py
@ -1,19 +1,22 @@
 #!/usr/bin/env python

 import sys
-from aider import __version__

+from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright

-aider_user_agent= f'Aider/{__version__} https://aider.chat'
+from aider import __version__

-PLAYWRIGHT_INFO = '''
+aider_user_agent = f"Aider/{__version__} +https://aider.chat"
+
+PLAYWRIGHT_INFO = """
 For better web scraping, install Playwright chromium:

    playwright install --with-deps chromium

 See https://aider.chat/docs/install.html#enable-playwright for more info.
-'''
+"""
+

 class Scraper:
    playwright_available = None
@ -29,15 +32,16 @@ class Scraper:
            try:
                browser = p.chromium.launch()
            except Exception as e:
-                print(repr(e))
+                self.playwright_available = False
+                self.print_error(e)
                return

            page = browser.new_page()

            user_agent = page.evaluate("navigator.userAgent")
-            user_agent = user_agent.replace('Headless','')
-            user_agent = user_agent.replace('headless', '')
-            user_agent += ' ' + aider_user_agent
+            user_agent = user_agent.replace("Headless", "")
+            user_agent = user_agent.replace("headless", "")
+            user_agent += " " + aider_user_agent

            page = browser.new_page(user_agent=user_agent)
            page.goto(url)
@ -49,26 +53,25 @@ class Scraper:
    def try_playwright(self):
        with sync_playwright() as p:
            try:
-                browser = p.chromium.launch()
+                p.chromium.launch()
                self.playwright_available = True
-            except Exception as e:
+            except Exception:
                self.playwright_available = False
                self.print_error(PLAYWRIGHT_INFO)

    def scrape_with_httpx(self, url):
        import httpx
-        headers = {
-            'User-Agent': aider_user_agent
-        }
+
+        headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
        try:
            with httpx.Client(headers=headers) as client:
                response = client.get(url)
                response.raise_for_status()
                return response.text
        except httpx.HTTPError as http_err:
-            self.print_error(f'HTTP error occurred: {http_err}')
+            self.print_error(f"HTTP error occurred: {http_err}")
        except Exception as err:
-            self.print_error(f'An error occurred: {err}')
+            self.print_error(f"An error occurred: {err}")
        return None

    def scrape(self, url):
@ -80,13 +83,35 @@ class Scraper:
        else:
            content = self.scrape_with_httpx(url)

+        content = html_to_text(content)
+
        return content

+
+# Adapted from AutoGPT, MIT License
+#
+# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
+
+
+def html_to_text(page_source: str) -> str:
+    soup = BeautifulSoup(page_source, "html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = "\n".join(chunk for chunk in chunks if chunk)
+    return text
+
+
 def main(url):
    scraper = Scraper()
    content = scraper.scrape(url)
    print(content)

+
 if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python playw.py <URL>")