mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 02:05:00 +00:00
Use download_pandoc, which works everywhere including arm64
This commit is contained in:
parent
2dee76378b
commit
efff174f9a
4 changed files with 49 additions and 29 deletions
|
@ -37,11 +37,14 @@ class Commands:
|
||||||
if not self.scraper:
|
if not self.scraper:
|
||||||
self.scraper = Scraper(print_error=self.io.tool_error)
|
self.scraper = Scraper(print_error=self.io.tool_error)
|
||||||
|
|
||||||
content = self.scraper.scrape(url)
|
content = self.scraper.scrape(url) or ""
|
||||||
if content:
|
if content:
|
||||||
self.io.tool_output(content)
|
self.io.tool_output(content)
|
||||||
|
|
||||||
self.scraper.show_playwright_instructions()
|
self.scraper.show_playwright_instructions()
|
||||||
|
|
||||||
|
content = f"{url}:\n\n" + content
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def is_command(self, inp):
|
def is_command(self, inp):
|
||||||
|
|
|
@ -7,6 +7,7 @@ import httpx
|
||||||
import pypandoc
|
import pypandoc
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
|
from pypandoc.pandoc_download import download_pandoc
|
||||||
|
|
||||||
from aider import __version__
|
from aider import __version__
|
||||||
|
|
||||||
|
@ -22,6 +23,7 @@ See https://aider.chat/docs/install.html#enable-playwright for more info.
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
|
pandoc_available = None
|
||||||
playwright_available = None
|
playwright_available = None
|
||||||
playwright_instructions_shown = False
|
playwright_instructions_shown = False
|
||||||
|
|
||||||
|
@ -95,29 +97,44 @@ class Scraper:
|
||||||
else:
|
else:
|
||||||
content = self.scrape_with_httpx(url)
|
content = self.scrape_with_httpx(url)
|
||||||
|
|
||||||
if content:
|
if not content:
|
||||||
content = html_to_markdown(content)
|
return
|
||||||
|
|
||||||
|
self.try_pandoc()
|
||||||
|
|
||||||
|
content = self.html_to_markdown(content)
|
||||||
# content = html_to_text(content)
|
# content = html_to_text(content)
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
def try_pandoc(self):
|
||||||
|
if self.pandoc_available:
|
||||||
|
return
|
||||||
|
|
||||||
# Adapted from AutoGPT, MIT License
|
html = "<body></body>"
|
||||||
#
|
try:
|
||||||
# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
|
pypandoc.convert_text(html, "markdown", format="html")
|
||||||
|
self.pandoc_available = True
|
||||||
|
return
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
download_pandoc()
|
||||||
|
self.pandoc_available = True
|
||||||
|
|
||||||
def html_to_text(page_source: str) -> str:
|
def html_to_markdown(self, page_source):
|
||||||
soup = BeautifulSoup(page_source, "html.parser")
|
soup = BeautifulSoup(page_source, "html.parser")
|
||||||
|
soup = slimdown_html(soup)
|
||||||
|
page_source = str(soup)
|
||||||
|
|
||||||
for script in soup(["script", "style"]):
|
md = pypandoc.convert_text(page_source, "markdown", format="html")
|
||||||
script.extract()
|
|
||||||
|
|
||||||
text = soup.get_text()
|
md = re.sub(r"</div>", " ", md)
|
||||||
lines = (line.strip() for line in text.splitlines())
|
md = re.sub(r"<div>", " ", md)
|
||||||
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
|
||||||
text = "\n".join(chunk for chunk in chunks if chunk)
|
md = re.sub(r"\n\s*\n", "\n\n", md)
|
||||||
return text
|
|
||||||
|
return md
|
||||||
|
|
||||||
|
|
||||||
def slimdown_html(soup):
|
def slimdown_html(soup):
|
||||||
|
@ -141,19 +158,22 @@ def slimdown_html(soup):
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
def html_to_markdown(page_source: str) -> str:
|
# Adapted from AutoGPT, MIT License
|
||||||
|
#
|
||||||
|
# https://github.com/Significant-Gravitas/AutoGPT/blob/fe0923ba6c9abb42ac4df79da580e8a4391e0418/autogpts/autogpt/autogpt/commands/web_selenium.py#L173
|
||||||
|
|
||||||
|
|
||||||
|
def html_to_text(page_source: str) -> str:
|
||||||
soup = BeautifulSoup(page_source, "html.parser")
|
soup = BeautifulSoup(page_source, "html.parser")
|
||||||
soup = slimdown_html(soup)
|
|
||||||
page_source = str(soup)
|
|
||||||
|
|
||||||
md = pypandoc.convert_text(page_source, "markdown", format="html")
|
for script in soup(["script", "style"]):
|
||||||
|
script.extract()
|
||||||
|
|
||||||
md = re.sub(r"</div>", " ", md)
|
text = soup.get_text()
|
||||||
md = re.sub(r"<div>", " ", md)
|
lines = (line.strip() for line in text.splitlines())
|
||||||
|
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
||||||
md = re.sub(r"\n\s*\n", "\n\n", md)
|
text = "\n".join(chunk for chunk in chunks if chunk)
|
||||||
|
return text
|
||||||
return md
|
|
||||||
|
|
||||||
|
|
||||||
def main(url):
|
def main(url):
|
||||||
|
|
|
@ -24,4 +24,3 @@ Pillow
|
||||||
diff-match-patch
|
diff-match-patch
|
||||||
playwright
|
playwright
|
||||||
pypandoc
|
pypandoc
|
||||||
pypandoc_binary
|
|
||||||
|
|
|
@ -96,8 +96,6 @@ pygments==2.17.2
|
||||||
# via rich
|
# via rich
|
||||||
pypandoc==1.12
|
pypandoc==1.12
|
||||||
# via -r requirements.in
|
# via -r requirements.in
|
||||||
pypandoc-binary==1.12
|
|
||||||
# via -r requirements.in
|
|
||||||
pyyaml==6.0.1
|
pyyaml==6.0.1
|
||||||
# via -r requirements.in
|
# via -r requirements.in
|
||||||
referencing==0.32.0
|
referencing==0.32.0
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue