From b8313c5343bbf83e53b720597bf2035f7c6b538d Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Wed, 1 May 2024 15:14:14 -0700
Subject: [PATCH] added docstrings

---
 aider/scrape.py | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/aider/scrape.py b/aider/scrape.py
index 21c888df2..650075905 100755
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -12,6 +12,8 @@ from aider import __version__
 
 aider_user_agent = f"Aider/{__version__} +https://aider.chat"
 
+# Playwright is nice because it has a simple way to install dependencies on most
+# platforms.
 PLAYWRIGHT_INFO = """
 For better web scraping, install Playwright chromium with this command in your terminal:
 
@@ -26,12 +28,40 @@ class Scraper:
     playwright_available = None
     playwright_instructions_shown = False
 
+    # Public API...
     def __init__(self, print_error=None):
+        """
+        `print_error` - a function to call to print error/debug info.
+        """
         if print_error:
             self.print_error = print_error
         else:
             self.print_error = print
 
+    def scrape(self, url):
+        """
+        Scrape a url and turn it into readable markdown.
+
+        `url` - the URL to scrape.
+        """
+        self.try_playwright()
+
+        if self.playwright_available:
+            content = self.scrape_with_playwright(url)
+        else:
+            content = self.scrape_with_httpx(url)
+
+        if not content:
+            return
+
+        self.try_pandoc()
+
+        content = self.html_to_markdown(content)
+        # content = html_to_text(content)
+
+        return content
+
+    # Internals...
     def scrape_with_playwright(self, url):
         with sync_playwright() as p:
             try:
@@ -88,24 +118,6 @@ class Scraper:
         self.print_error(f"An error occurred: {err}")
         return None
 
-    def scrape(self, url):
-        self.try_playwright()
-
-        if self.playwright_available:
-            content = self.scrape_with_playwright(url)
-        else:
-            content = self.scrape_with_httpx(url)
-
-        if not content:
-            return
-
-        self.try_pandoc()
-
-        content = self.html_to_markdown(content)
-        # content = html_to_text(content)
-
-        return content
-
     def try_pandoc(self):
         if self.pandoc_available:
             return
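
Note: a minimal usage sketch of the public API shown in the patch above, assuming aider/scrape.py is importable as aider.scrape (the module path comes from the diff; the example URL and the choice of print as the error handler are illustrative only, not part of the commit):

    # Hypothetical usage example; only Scraper(print_error=...) and scrape(url)
    # come from the patch itself.
    from aider.scrape import Scraper

    scraper = Scraper(print_error=print)  # print_error is optional; defaults to print
    markdown = scraper.scrape("https://example.com")  # returns None if nothing could be scraped
    if markdown:
        print(markdown)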