Use download_pandoc, which works everywhere including arm64

This commit is contained in:
Paul Gauthier 2024-02-08 15:56:00 -08:00
parent 2dee76378b
commit efff174f9a
4 changed files with 49 additions and 29 deletions

View file

@ -37,11 +37,14 @@ class Commands:
if not self.scraper: if not self.scraper:
self.scraper = Scraper(print_error=self.io.tool_error) self.scraper = Scraper(print_error=self.io.tool_error)
content = self.scraper.scrape(url) content = self.scraper.scrape(url) or ""
if content: if content:
self.io.tool_output(content) self.io.tool_output(content)
self.scraper.show_playwright_instructions() self.scraper.show_playwright_instructions()
content = f"{url}:\n\n" + content
return content return content
def is_command(self, inp): def is_command(self, inp):

View file

@ -7,6 +7,7 @@ import httpx
import pypandoc import pypandoc
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
from pypandoc.pandoc_download import download_pandoc
from aider import __version__ from aider import __version__
@ -22,6 +23,7 @@ See https://aider.chat/docs/install.html#enable-playwright for more info.
class Scraper: class Scraper:
pandoc_available = None
playwright_available = None playwright_available = None
playwright_instructions_shown = False playwright_instructions_shown = False
@ -95,29 +97,44 @@ class Scraper:
else: else:
content = self.scrape_with_httpx(url) content = self.scrape_with_httpx(url)
if content: if not content:
content = html_to_markdown(content) return
self.try_pandoc()
content = self.html_to_markdown(content)
# content = html_to_text(content) # content = html_to_text(content)
return content return content
def try_pandoc(self):
if self.pandoc_available:
return
# Adapted from AutoGPT, MIT License html = "<body></body>"
# try:
# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173 pypandoc.convert_text(html, "markdown", format="html")
self.pandoc_available = True
return
except OSError:
pass
download_pandoc()
self.pandoc_available = True
def html_to_text(page_source: str) -> str: def html_to_markdown(self, page_source):
soup = BeautifulSoup(page_source, "html.parser") soup = BeautifulSoup(page_source, "html.parser")
soup = slimdown_html(soup)
page_source = str(soup)
for script in soup(["script", "style"]): md = pypandoc.convert_text(page_source, "markdown", format="html")
script.extract()
text = soup.get_text() md = re.sub(r"</div>", " ", md)
lines = (line.strip() for line in text.splitlines()) md = re.sub(r"<div>", " ", md)
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = "\n".join(chunk for chunk in chunks if chunk) md = re.sub(r"\n\s*\n", "\n\n", md)
return text
return md
def slimdown_html(soup): def slimdown_html(soup):
@ -141,19 +158,22 @@ def slimdown_html(soup):
return soup return soup
def html_to_markdown(page_source: str) -> str: # Adapted from AutoGPT, MIT License
#
# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
def html_to_text(page_source: str) -> str:
soup = BeautifulSoup(page_source, "html.parser") soup = BeautifulSoup(page_source, "html.parser")
soup = slimdown_html(soup)
page_source = str(soup)
md = pypandoc.convert_text(page_source, "markdown", format="html") for script in soup(["script", "style"]):
script.extract()
md = re.sub(r"</div>", " ", md) text = soup.get_text()
md = re.sub(r"<div>", " ", md) lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
md = re.sub(r"\n\s*\n", "\n\n", md) text = "\n".join(chunk for chunk in chunks if chunk)
return text
return md
def main(url): def main(url):

View file

@ -24,4 +24,3 @@ Pillow
diff-match-patch diff-match-patch
playwright playwright
pypandoc pypandoc
pypandoc_binary

View file

@ -96,8 +96,6 @@ pygments==2.17.2
# via rich # via rich
pypandoc==1.12 pypandoc==1.12
# via -r requirements.in # via -r requirements.in
pypandoc-binary==1.12
# via -r requirements.in
pyyaml==6.0.1 pyyaml==6.0.1
# via -r requirements.in # via -r requirements.in
referencing==0.32.0 referencing==0.32.0