From 1520bb976a29f6a601a23c25661ae5e3e1f0c319 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Thu, 8 Feb 2024 12:10:10 -0800 Subject: [PATCH] Change html_to_text to html_to_markdown and add slimdown_html function. --- aider/scrape.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/aider/scrape.py b/aider/scrape.py index 0d576a9e5..60c292a1f 100755 --- a/aider/scrape.py +++ b/aider/scrape.py @@ -95,7 +95,8 @@ class Scraper: content = self.scrape_with_httpx(url) if content: - content = html_to_text(content) + content = html_to_markdown(content) + #content = html_to_text(content) return content @@ -118,8 +119,12 @@ def html_to_text(page_source: str) -> str: return text +def slimdown_html(page_source: str) -> str: + soup = BeautifulSoup(page_source, "html.parser") + # ... + def html_to_markdown(page_source: str) -> str: - return pypandoc.convert_text(page_source, 'md', format='html') + return pypandoc.convert_text(page_source, 'markdown', format='html') def main(url): scraper = Scraper()