mirror of https://github.com/Aider-AI/aider.git
synced 2025-05-31 01:35:00 +00:00
output with pandoc, cleanup with bs and re
parent be60b785a4
commit cc36329691

3 changed files with 35 additions and 20 deletions
aider/scrape.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 
+import re
 import sys
 
-import pypandoc
 import httpx
+import pypandoc
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
 
@@ -94,11 +95,9 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)
 
-        Path('tmp.html').write_text(content)
-
         if content:
             content = html_to_markdown(content)
-            #content = html_to_text(content)
+            # content = html_to_text(content)
 
         return content
 
@@ -109,11 +108,8 @@ class Scraper:
 
 
 def html_to_text(page_source: str) -> str:
-
     soup = BeautifulSoup(page_source, "html.parser")
 
-    soup = slimdown_html(soup)
-
     for script in soup(["script", "style"]):
         script.extract()
 
@@ -125,25 +121,38 @@ def html_to_text(page_source: str) -> str:
 
 
 def slimdown_html(soup):
-    # Remove all <img> tags
-    for img in soup.find_all('img'):
-        img.decompose()
-    # Remove all elements with data: URLs
-    for tag in soup.find_all(href=lambda x: x and x.startswith('data:')):
+    for svg in soup.find_all("svg"):
+        svg.decompose()
+
+    if soup.img:
+        soup.img.decompose()
+
+    for tag in soup.find_all(href=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    for tag in soup.find_all(src=lambda x: x and x.startswith('data:')):
+
+    for tag in soup.find_all(src=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    # Remove all per-element CSS styles
+
     for tag in soup.find_all(True):
-        tag.attrs.pop('style', None)
-    # Remove all internal anchor elements
-    for anchor in soup.find_all('a', href=True):
-        if anchor['href'].startswith('#'):
-            anchor.decompose()
+        tag.attrs.clear()
+
     return soup
 
+
 def html_to_markdown(page_source: str) -> str:
-    return pypandoc.convert_text(page_source, 'markdown', format='html')
+    soup = BeautifulSoup(page_source, "html.parser")
+    soup = slimdown_html(soup)
+    page_source = str(soup)
+
+    md = pypandoc.convert_text(page_source, "markdown", format="html")
+
+    md = re.sub(r"</div>", " ", md)
+    md = re.sub(r"<div>", " ", md)
+
+    md = re.sub(r"\n\s*\n", "\n\n", md)
+
+    return md
 
+
 def main(url):
     scraper = Scraper()
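Stitched back together, the two rewritten helpers read as below: slim the HTML with BeautifulSoup, convert with pandoc, tidy with re. This is a standalone sketch assembled from the hunks above plus the imports they need; the __main__ demo is invented here to show the call shape and is not part of the commit. pypandoc.convert_text() requires a pandoc executable at runtime, which is what the requirements change below addresses.

    #!/usr/bin/env python

    import re

    import pypandoc
    from bs4 import BeautifulSoup


    def slimdown_html(soup):
        # drop svg elements, the first <img>, and anything with a data: URL
        for svg in soup.find_all("svg"):
            svg.decompose()

        if soup.img:
            soup.img.decompose()

        for tag in soup.find_all(href=lambda x: x and x.startswith("data:")):
            tag.decompose()

        for tag in soup.find_all(src=lambda x: x and x.startswith("data:")):
            tag.decompose()

        # strip every attribute from every remaining tag
        for tag in soup.find_all(True):
            tag.attrs.clear()

        return soup


    def html_to_markdown(page_source: str) -> str:
        soup = BeautifulSoup(page_source, "html.parser")
        soup = slimdown_html(soup)
        page_source = str(soup)

        md = pypandoc.convert_text(page_source, "markdown", format="html")

        # pandoc can leave raw div tags in its markdown output; blank them
        # out, then collapse runs of empty lines to single blank lines
        md = re.sub(r"</div>", " ", md)
        md = re.sub(r"<div>", " ", md)
        md = re.sub(r"\n\s*\n", "\n\n", md)

        return md


    if __name__ == "__main__":
        # invented sample input, just to show the call shape
        html = '<div><h1>Title</h1><img src="x.png"><p>Some <b>text</b>.</p></div>'
        print(html_to_markdown(html))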
requirements.in
@@ -23,3 +23,5 @@ PyYAML
 Pillow
 diff-match-patch
 playwright
+pypandoc
+pypandoc_binary
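Of the two new requirements, pypandoc is the wrapper library that scrape.py now imports, and pypandoc_binary is the same wrapper distributed with a bundled pandoc executable, so no system pandoc install is needed. A quick sanity check that the wrapper can actually find a binary (a sketch, not part of the commit):

    import pypandoc

    # raises OSError if no pandoc executable can be located; the
    # pypandoc_binary wheel ships one inside the package
    print(pypandoc.get_pandoc_version())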
requirements.txt
@@ -96,6 +96,10 @@ pyee==11.0.1
     # via playwright
 pygments==2.17.2
     # via rich
+pypandoc==1.12
+    # via -r requirements.in
+pypandoc-binary==1.12
+    # via -r requirements.in
 pyyaml==6.0.1
     # via -r requirements.in
 referencing==0.32.0
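The "# via -r requirements.in" annotations are pip-compile's format, so these pins were presumably regenerated from requirements.in with pip-tools rather than edited by hand, along the lines of:

    pip-compile requirements.in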