From b8313c5343bbf83e53b720597bf2035f7c6b538d Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Wed, 1 May 2024 15:14:14 -0700
Subject: [PATCH] added docstrings

---
 aider/scrape.py | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/aider/scrape.py b/aider/scrape.py
index 21c888df2..650075905 100755
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -12,6 +12,8 @@ from aider import __version__
 
 aider_user_agent = f"Aider/{__version__} +https://aider.chat"
 
+# Playwright is nice because it has a simple way to install dependencies on most
+# platforms.
 PLAYWRIGHT_INFO = """
 For better web scraping, install Playwright chromium with this command in your terminal:
 
@@ -26,12 +28,40 @@ class Scraper:
     playwright_available = None
     playwright_instructions_shown = False
 
+    # Public API...
     def __init__(self, print_error=None):
+        """
+        `print_error` - a function to call to print error/debug info.
+        """
         if print_error:
             self.print_error = print_error
         else:
             self.print_error = print
 
+    def scrape(self, url):
+        """
+        Scrape a url and turn it into readable markdown.
+
+        `url` - the URL to scrape.
+        """
+        self.try_playwright()
+
+        if self.playwright_available:
+            content = self.scrape_with_playwright(url)
+        else:
+            content = self.scrape_with_httpx(url)
+
+        if not content:
+            return
+
+        self.try_pandoc()
+
+        content = self.html_to_markdown(content)
+        # content = html_to_text(content)
+
+        return content
+
+    # Internals...
     def scrape_with_playwright(self, url):
         with sync_playwright() as p:
             try:
@@ -88,24 +118,6 @@ class Scraper:
         self.print_error(f"An error occurred: {err}")
         return None
 
-    def scrape(self, url):
-        self.try_playwright()
-
-        if self.playwright_available:
-            content = self.scrape_with_playwright(url)
-        else:
-            content = self.scrape_with_httpx(url)
-
-        if not content:
-            return
-
-        self.try_pandoc()
-
-        content = self.html_to_markdown(content)
-        # content = html_to_text(content)
-
-        return content
-
     def try_pandoc(self):
         if self.pandoc_available:
             return
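
Note: a minimal usage sketch of the public API shown in the patch above, assuming aider/scrape.py is importable as aider.scrape (the module path comes from the diff; the example URL and the choice of print as the error handler are illustrative only, not part of the commit):

    # Hypothetical usage example; only Scraper(print_error=...) and scrape(url)
    # come from the patch itself.
    from aider.scrape import Scraper

    scraper = Scraper(print_error=print)  # print_error is optional; defaults to print
    markdown = scraper.scrape("https://example.com")  # returns None if nothing could be scraped
    if markdown:
        print(markdown)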