Add functionality to write scraped content to a temporary file and optimize HTML parsing.

This commit is contained in:
Paul Gauthier 2024-02-08 12:11:59 -08:00
parent 33a62c8dc6
commit 84ae51c005

View file

@@ -94,6 +94,8 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)

+        Path('tmp.html').write_text(content)
+
         if content:
             content = html_to_markdown(content)
             #content = html_to_text(content)
@@ -107,8 +109,11 @@ class Scraper:
 def html_to_text(page_source: str) -> str:
     soup = BeautifulSoup(page_source, "html.parser")
+
+    soup = slimdown_html(soup)
+
     for script in soup(["script", "style"]):
         script.extract()
@@ -119,8 +124,7 @@ def html_to_text(page_source: str) -> str:
     return text


-def slimdown_html(page_source: str) -> str:
-    soup = BeautifulSoup(page_source, "html.parser")
+def slimdown_html(soup):
     # Remove all <img> tags
     for img in soup.find_all('img'):
         img.decompose()
@@ -131,7 +135,7 @@ def slimdown_html(page_source: str) -> str:
     for anchor in soup.find_all('a', href=True):
         if anchor['href'].startswith('#'):
             anchor.decompose()
-    return str(soup)
+    return soup


 def html_to_markdown(page_source: str) -> str:
     return pypandoc.convert_text(page_source, 'markdown', format='html')