Mirror of https://github.com/Aider-AI/aider.git, synced 2025-05-28 16:25:00 +00:00
Use download_pandoc, which works everywhere including arm64

commit efff174f9a (parent 2dee76378b)
4 changed files with 49 additions and 29 deletions
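The gist of the change: stop relying on the pypandoc_binary wheel to supply a pandoc executable (per the commit message, that route did not cover arm64) and instead depend on plain pypandoc, fetching a pandoc build at runtime with pypandoc's bundled downloader. A minimal standalone sketch of that pattern, outside of aider's code:

    import pypandoc
    from pypandoc.pandoc_download import download_pandoc

    # Probe for a working pandoc; pypandoc raises OSError if no binary is found.
    try:
        pypandoc.get_pandoc_version()
    except OSError:
        # Fetch a pandoc build for the current platform at runtime.
        download_pandoc()

    print(pypandoc.convert_text("<b>hi</b>", "markdown", format="html"))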
@@ -37,11 +37,14 @@ class Commands:
         if not self.scraper:
             self.scraper = Scraper(print_error=self.io.tool_error)

-        content = self.scraper.scrape(url)
+        content = self.scraper.scrape(url) or ""
+        if content:
+            self.io.tool_output(content)

         self.scraper.show_playwright_instructions()

+        content = f"{url}:\n\n" + content

         return content

     def is_command(self, inp):
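The `or ""` added above matters because Scraper.scrape() can come back empty-handed; without the fallback, the later f-string concatenation would fail on None. A tiny illustration (the `scraped = None` value is made up for the example):

    url = "https://example.com"
    scraped = None  # stand-in for scrape() returning nothing

    content = scraped or ""            # normalize None to ""
    content = f"{url}:\n\n" + content  # str + str, always safe
    # Without the fallback: TypeError: can only concatenate str (not "NoneType") to str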
@@ -7,6 +7,7 @@ import httpx
 import pypandoc
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
+from pypandoc.pandoc_download import download_pandoc

 from aider import __version__

@@ -22,6 +23,7 @@ See https://aider.chat/docs/install.html#enable-playwright for more info.


 class Scraper:
+    pandoc_available = None
     playwright_available = None
     playwright_instructions_shown = False

@@ -95,29 +97,44 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)

-        if content:
-            content = html_to_markdown(content)
-            # content = html_to_text(content)
+        if not content:
+            return
+
+        self.try_pandoc()
+
+        content = self.html_to_markdown(content)
+        # content = html_to_text(content)

         return content

+    def try_pandoc(self):
+        if self.pandoc_available:
+            return

-# Adapted from AutoGPT, MIT License
-#
-# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
+        html = "<body></body>"
+        try:
+            pypandoc.convert_text(html, "markdown", format="html")
+            self.pandoc_available = True
+            return
+        except OSError:
+            pass
+
+        download_pandoc()
+        self.pandoc_available = True

-def html_to_text(page_source: str) -> str:
-    soup = BeautifulSoup(page_source, "html.parser")
+    def html_to_markdown(self, page_source):
+        soup = BeautifulSoup(page_source, "html.parser")
+        soup = slimdown_html(soup)
+        page_source = str(soup)

-    for script in soup(["script", "style"]):
-        script.extract()
+        md = pypandoc.convert_text(page_source, "markdown", format="html")

-    text = soup.get_text()
-    lines = (line.strip() for line in text.splitlines())
-    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-    text = "\n".join(chunk for chunk in chunks if chunk)
-    return text
+        md = re.sub(r"</div>", " ", md)
+        md = re.sub(r"<div>", " ", md)
+
+        md = re.sub(r"\n\s*\n", "\n\n", md)
+
+        return md


 def slimdown_html(soup):
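Taken together, the two new methods implement a check-then-download pattern: probe pandoc once by converting a trivial document, download it on failure, cache the result in pandoc_available, and only then run the real HTML-to-markdown conversion. A rough standalone equivalent, with hypothetical helper names (ensure_pandoc, page_to_markdown) that are not part of aider:

    import re

    import pypandoc
    from bs4 import BeautifulSoup
    from pypandoc.pandoc_download import download_pandoc


    def ensure_pandoc():
        # Converting an empty body raises OSError when the pandoc binary is absent.
        try:
            pypandoc.convert_text("<body></body>", "markdown", format="html")
        except OSError:
            download_pandoc()


    def page_to_markdown(page_source):
        soup = BeautifulSoup(page_source, "html.parser")
        md = pypandoc.convert_text(str(soup), "markdown", format="html")
        return re.sub(r"\n\s*\n", "\n\n", md)  # squeeze runs of blank lines


    # Usage:
    #   ensure_pandoc()
    #   md = page_to_markdown("<h1>Title</h1><p>Hello</p>")

The real methods differ in two ways: try_pandoc memoizes its answer in self.pandoc_available so the probe runs once per Scraper, and html_to_markdown passes the soup through slimdown_html() and strips leftover div tags before returning.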
@@ -141,19 +158,22 @@ def slimdown_html(soup):
     return soup


-def html_to_markdown(page_source: str) -> str:
+# Adapted from AutoGPT, MIT License
+#
+# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
+def html_to_text(page_source: str) -> str:
     soup = BeautifulSoup(page_source, "html.parser")
-    soup = slimdown_html(soup)
-    page_source = str(soup)

-    md = pypandoc.convert_text(page_source, "markdown", format="html")
+    for script in soup(["script", "style"]):
+        script.extract()

-    md = re.sub(r"</div>", " ", md)
-    md = re.sub(r"<div>", " ", md)
-
-    md = re.sub(r"\n\s*\n", "\n\n", md)
-
-    return md
+    text = soup.get_text()
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+    text = "\n".join(chunk for chunk in chunks if chunk)
+    return text


 def main(url):
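html_to_text() survives as the non-pandoc fallback (it is the commented-out path in scrape()). Its core idiom is removing script and style nodes before extracting text; a small standalone sketch of just that step:

    from bs4 import BeautifulSoup

    html = "<body><p>visible</p><script>var hidden = 1;</script></body>"
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup(["script", "style"]):  # soup(...) is shorthand for soup.find_all(...)
        tag.extract()                      # detach the node from the tree

    print(soup.get_text())  # -> "visible"; the script body is gone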
@@ -24,4 +24,3 @@ Pillow
 diff-match-patch
 playwright
 pypandoc
-pypandoc_binary
@@ -96,8 +96,6 @@ pygments==2.17.2
     # via rich
 pypandoc==1.12
     # via -r requirements.in
-pypandoc-binary==1.12
-    # via -r requirements.in
 pyyaml==6.0.1
     # via -r requirements.in
 referencing==0.32.0