Refactor scraper to use Playwright for web scraping and handle user agent string.

2025-05-31 17:55:01 +00:00 · 2024-02-08 11:29:44 -08:00 · 2024-02-08 11:29:44 -08:00 · e04187d1ad
commit e04187d1ad
parent f10a9d2e7b
1 changed files with 65 additions and 16 deletions
--- a/aider/scrape.py
+++ b/aider/scrape.py
@ -5,25 +5,74 @@ from aider import __version__

 from playwright.sync_api import sync_playwright

-aider_url = 'https://github.com/paul-gauthier/aider'
+aider_user_agent= f'Aider/{__version__} https://aider.chat'

-def scrape_with_playwright(url):
-    with sync_playwright() as p:
-        try:
-            browser = p.chromium.launch()
-        except Exception as e:
-            print(repr(e))
-            return
+PLAYWRIGHT_INFO = '''
+For better web scraping, install Playwright chromium:

-        # user_agent = ??
-        page = browser.new_page()
-        user_agent = page.evaluate("navigator.userAgent")
-        print(f"User Agent: {user_agent}")
-        page.goto(url)
-        content = page.content()
-        browser.close()
+    playwright install --with-deps chromium

-    return content
+See https://aider.chat/docs/install.html#enable-playwright for more info.
+'''
+
+class Scraper:
+    """Fetch the content of a URL, preferring Playwright's chromium.
+
+    Problems are reported through ``print_error``, a callable taking a
+    single message argument; it defaults to the builtin ``print``.
+    """
+
+    # Tri-state flag: None until try_playwright() has probed chromium,
+    # then True or False depending on whether the launch succeeded.
+    playwright_available = None
+
+    def __init__(self, print_error=None):
+        """Store the error reporter, falling back to ``print``."""
+        self.print_error = print_error if print_error else print
+
+    def scrape_with_playwright(self, url):
+        """Return the page content of ``url`` as loaded by chromium.
+
+        Returns None (after reporting the error) when chromium cannot be
+        launched. Errors raised while loading the page propagate, but the
+        browser is closed first.
+        """
+        with sync_playwright() as p:
+            try:
+                browser = p.chromium.launch()
+            except Exception as e:
+                # Use the configured reporter, as try_playwright() does.
+                self.print_error(repr(e))
+                return None
+
+            try:
+                # A throwaway page is only needed to read the default UA.
+                probe = browser.new_page()
+                user_agent = probe.evaluate("navigator.userAgent")
+                probe.close()
+
+                # Strip the headless marker and append Aider's identifier.
+                user_agent = user_agent.replace('Headless', '')
+                user_agent = user_agent.replace('headless', '')
+                user_agent += ' ' + aider_user_agent
+
+                page = browser.new_page(user_agent=user_agent)
+                page.goto(url)
+                content = page.content()
+            finally:
+                # Release the browser even when navigation fails.
+                browser.close()
+
+        return content
+
+    def try_playwright(self):
+        """Probe whether chromium launches; record it in playwright_available.
+
+        On failure, PLAYWRIGHT_INFO is sent to the error reporter.
+        """
+        with sync_playwright() as p:
+            try:
+                browser = p.chromium.launch()
+            except Exception:
+                self.playwright_available = False
+                self.print_error(PLAYWRIGHT_INFO)
+            else:
+                self.playwright_available = True
+                # The probe browser is not reused; close it right away.
+                browser.close()
+
+    def scrape_with_httpx(self, url):
+        """Placeholder fallback scraper; not implemented, returns None."""
+        pass
+
+    def scrape(self, url):
+        """Scrape ``url`` with Playwright when available, else with httpx.
+
+        Chromium availability is probed once, on the first call.
+        """
+        if self.playwright_available is None:
+            self.try_playwright()
+
+        if self.playwright_available:
+            return self.scrape_with_playwright(url)
+        return self.scrape_with_httpx(url)
+
+
+def main(url):
+    """Scrape ``url`` with a default Scraper and print the result."""
+    print(Scraper().scrape(url))

 if __name__ == "__main__":
    if len(sys.argv) < 2: