output with pandoc, cleanup with bs and re

This commit is contained in:
Paul Gauthier 2024-02-08 14:14:42 -08:00
parent be60b785a4
commit cc36329691
3 changed files with 35 additions and 20 deletions

View file

@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 
+import re
 import sys
 
-import pypandoc
 import httpx
+import pypandoc
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
@@ -94,11 +95,9 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)
 
-        Path('tmp.html').write_text(content)
-
         if content:
             content = html_to_markdown(content)
-            #content = html_to_text(content)
+            # content = html_to_text(content)
 
         return content
@@ -109,11 +108,8 @@ class Scraper:
 def html_to_text(page_source: str) -> str:
     soup = BeautifulSoup(page_source, "html.parser")
 
-    soup = slimdown_html(soup)
-
     for script in soup(["script", "style"]):
         script.extract()
@@ -125,25 +121,38 @@ def html_to_text(page_source: str) -> str:
 
 def slimdown_html(soup):
-    # Remove all <img> tags
-    for img in soup.find_all('img'):
-        img.decompose()
-    # Remove all elements with data: URLs
-    for tag in soup.find_all(href=lambda x: x and x.startswith('data:')):
+    for svg in soup.find_all("svg"):
+        svg.decompose()
+
+    if soup.img:
+        soup.img.decompose()
+
+    for tag in soup.find_all(href=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    for tag in soup.find_all(src=lambda x: x and x.startswith('data:')):
+
+    for tag in soup.find_all(src=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    # Remove all per-element CSS styles
+
     for tag in soup.find_all(True):
-        tag.attrs.pop('style', None)
-    # Remove all internal anchor elements
-    for anchor in soup.find_all('a', href=True):
-        if anchor['href'].startswith('#'):
-            anchor.decompose()
+        tag.attrs.clear()
+
     return soup
 
 
 def html_to_markdown(page_source: str) -> str:
-    return pypandoc.convert_text(page_source, 'markdown', format='html')
+    soup = BeautifulSoup(page_source, "html.parser")
+    soup = slimdown_html(soup)
+    page_source = str(soup)
+
+    md = pypandoc.convert_text(page_source, "markdown", format="html")
+
+    md = re.sub(r"</div>", " ", md)
+    md = re.sub(r"<div>", " ", md)
+
+    md = re.sub(r"\n\s*\n", "\n\n", md)
+
+    return md
 
 
 def main(url):
     scraper = Scraper()

View file

@@ -23,3 +23,5 @@ PyYAML
 Pillow
 diff-match-patch
 playwright
+pypandoc
+pypandoc_binary

View file

@@ -96,6 +96,10 @@ pyee==11.0.1
     # via playwright
 pygments==2.17.2
     # via rich
+pypandoc==1.12
+    # via -r requirements.in
+pypandoc-binary==1.12
+    # via -r requirements.in
 pyyaml==6.0.1
     # via -r requirements.in
 referencing==0.32.0