feat: add error handling for pypandoc conversion in Scraper class

This commit is contained in:
Paul Gauthier 2024-09-03 08:01:45 -07:00 committed by Paul Gauthier (aider)
parent be1e45a4b3
commit ef4a9dc4ca

View file

@@ -222,7 +222,10 @@ class Scraper:
if not self.pandoc_available:
return page_source
md = pypandoc.convert_text(page_source, "markdown", format="html")
try:
md = pypandoc.convert_text(page_source, "markdown", format="html")
except OSError:
return page_source
md = re.sub(r"</div>", " ", md)
md = re.sub(r"<div>", " ", md)