Use download_pandoc, which works everywhere including arm64

2025-06-01 02:05:00 +00:00 · 2024-02-08 15:56:00 -08:00 · 2024-02-08 15:56:00 -08:00 · efff174f9a
commit efff174f9a
parent 2dee76378b
4 changed files with 49 additions and 29 deletions
--- a/aider/commands.py
+++ b/aider/commands.py
@@ -37,11 +37,14 @@ class Commands:
        if not self.scraper:
            self.scraper = Scraper(print_error=self.io.tool_error)
-        content = self.scraper.scrape(url)
+        content = self.scraper.scrape(url) or ""
        if content:
            self.io.tool_output(content)
        self.scraper.show_playwright_instructions()
        content = f"{url}:\n\n" + content
        return content
    def is_command(self, inp):
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -7,6 +7,7 @@ import httpx
 import pypandoc
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
+from pypandoc.pandoc_download import download_pandoc
 from aider import __version__
@@ -22,6 +23,7 @@ See https://aider.chat/docs/install.html#enable-playwright for more info.
 class Scraper:
    pandoc_available = None
    playwright_available = None
    playwright_instructions_shown = False
@@ -95,29 +97,44 @@ class Scraper:
        else:
            content = self.scrape_with_httpx(url)
-        if content:
+        if not content:
-            content = html_to_markdown(content)
+            return
        self.try_pandoc()
        content = self.html_to_markdown(content)
        # content = html_to_text(content)
        return content
    def try_pandoc(self):
        if self.pandoc_available:
            return
-# Adapted from AutoGPT, MIT License
+        html = "<body></body>"
-#
+        try:
-# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
+            pypandoc.convert_text(html, "markdown", format="html")
            self.pandoc_available = True
            return
        except OSError:
            pass
        download_pandoc()
        self.pandoc_available = True
-def html_to_text(page_source: str) -> str:
+    def html_to_markdown(self, page_source):
        soup = BeautifulSoup(page_source, "html.parser")
        soup = slimdown_html(soup)
        page_source = str(soup)
-    for script in soup(["script", "style"]):
+        md = pypandoc.convert_text(page_source, "markdown", format="html")
        script.extract()
-    text = soup.get_text()
+        md = re.sub(r"</div>", "      ", md)
-    lines = (line.strip() for line in text.splitlines())
+        md = re.sub(r"<div>", "     ", md)
-    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+
-    text = "\n".join(chunk for chunk in chunks if chunk)
+        md = re.sub(r"\n\s*\n", "\n\n", md)
-    return text
+
        return md
 def slimdown_html(soup):
@@ -141,19 +158,22 @@ def slimdown_html(soup):
    return soup
-def html_to_markdown(page_source: str) -> str:
+# Adapted from AutoGPT, MIT License
 #
 # https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
 def html_to_text(page_source: str) -> str:
    soup = BeautifulSoup(page_source, "html.parser")
    soup = slimdown_html(soup)
    page_source = str(soup)
-    md = pypandoc.convert_text(page_source, "markdown", format="html")
+    for script in soup(["script", "style"]):
        script.extract()
-    md = re.sub(r"</div>", "      ", md)
+    text = soup.get_text()
-    md = re.sub(r"<div>", "     ", md)
+    lines = (line.strip() for line in text.splitlines())
-
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    md = re.sub(r"\n\s*\n", "\n\n", md)
+    text = "\n".join(chunk for chunk in chunks if chunk)
-
+    return text
    return md
 def main(url):
--- a/requirements.in
+++ b/requirements.in
@@ -24,4 +24,3 @@ Pillow
 diff-match-patch
 playwright
 pypandoc
-pypandoc_binary
--- a/requirements.txt
+++ b/requirements.txt
@@ -96,8 +96,6 @@ pygments==2.17.2
    # via rich
 pypandoc==1.12
    # via -r requirements.in
-pypandoc-binary==1.12
-    # via -r requirements.in
 pyyaml==6.0.1
    # via -r requirements.in
 referencing==0.32.0