diff --git a/aider/commands.py b/aider/commands.py index f7f92e793..4877fefbc 100644 --- a/aider/commands.py +++ b/aider/commands.py @@ -37,11 +37,14 @@ class Commands: if not self.scraper: self.scraper = Scraper(print_error=self.io.tool_error) - content = self.scraper.scrape(url) + content = self.scraper.scrape(url) or "" if content: self.io.tool_output(content) + self.scraper.show_playwright_instructions() + content = f"{url}:\n\n" + content + return content def is_command(self, inp): diff --git a/aider/scrape.py b/aider/scrape.py index 659f4168e..e6110a2b5 100755 --- a/aider/scrape.py +++ b/aider/scrape.py @@ -7,6 +7,7 @@ import httpx import pypandoc from bs4 import BeautifulSoup from playwright.sync_api import sync_playwright +from pypandoc.pandoc_download import download_pandoc from aider import __version__ @@ -22,6 +23,7 @@ See https://aider.chat/docs/install.html#enable-playwright for more info. class Scraper: + pandoc_available = None playwright_available = None playwright_instructions_shown = False @@ -95,29 +97,44 @@ class Scraper: else: content = self.scrape_with_httpx(url) - if content: - content = html_to_markdown(content) - # content = html_to_text(content) + if not content: + return + + self.try_pandoc() + + content = self.html_to_markdown(content) + # content = html_to_text(content) return content + def try_pandoc(self): + if self.pandoc_available: + return -# Adapted from AutoGPT, MIT License -# -# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173 + html = "" + try: + pypandoc.convert_text(html, "markdown", format="html") + self.pandoc_available = True + return + except OSError: + pass + download_pandoc() + self.pandoc_available = True -def html_to_text(page_source: str) -> str: - soup = BeautifulSoup(page_source, "html.parser") + def html_to_markdown(self, page_source): + soup = BeautifulSoup(page_source, "html.parser") + soup = slimdown_html(soup) + 
page_source = str(soup) - for script in soup(["script", "style"]): - script.extract() + md = pypandoc.convert_text(page_source, "markdown", format="html") - text = soup.get_text() - lines = (line.strip() for line in text.splitlines()) - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) - text = "\n".join(chunk for chunk in chunks if chunk) - return text + md = re.sub(r"</div>", " ", md) + md = re.sub(r"<div>", " ", md) + + md = re.sub(r"\n\s*\n", "\n\n", md) + + return md
def slimdown_html(soup): @@ -141,19 +158,22 @@ def slimdown_html(soup): return soup -def html_to_markdown(page_source: str) -> str: +# Adapted from AutoGPT, MIT License +# +# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173 + + +def html_to_text(page_source: str) -> str: soup = BeautifulSoup(page_source, "html.parser") - soup = slimdown_html(soup) - page_source = str(soup) - md = pypandoc.convert_text(page_source, "markdown", format="html") + for script in soup(["script", "style"]): + script.extract() - md = re.sub(r"</div>", " ", md) - md = re.sub(r"<div>", " ", md) - - md = re.sub(r"\n\s*\n", "\n\n", md) - - return md + text = soup.get_text() + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + text = "\n".join(chunk for chunk in chunks if chunk) + return text
def main(url): diff --git a/requirements.in b/requirements.in index 717bdabf6..d55382942 100644 --- a/requirements.in +++ b/requirements.in @@ -24,4 +24,3 @@ Pillow diff-match-patch playwright pypandoc -pypandoc_binary diff --git a/requirements.txt b/requirements.txt index 8ceced105..3f5ed1932 100644 --- a/requirements.txt +++ b/requirements.txt @@ -96,8 +96,6 @@ pygments==2.17.2 # via rich pypandoc==1.12 # via -r requirements.in -pypandoc-binary==1.12 # via -r requirements.in pyyaml==6.0.1 # via -r requirements.in referencing==0.32.0