From 1509e21698d3566afe43c0b5e8785b49ac03f73e Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Thu, 8 Feb 2024 12:02:45 -0800 Subject: [PATCH] aider: Implemented `html_to_markdown` using `pypandoc`. # Aider chat conversation: USER: use pypandoc to implement `html_to_markdown` ASSISTANT: To implement `html_to_markdown` using `pypandoc`, I will: 1. Add an import for `pypandoc`. 2. Implement the `html_to_markdown` function using `pypandoc.convert_text`. Here are the diffs for those changes: ```diff --- aider/scrape.py +++ aider/scrape.py @@ ... @@ import sys +import pypandoc import httpx from bs4 import BeautifulSoup from playwright.sync_api import sync_playwright @@ ... @@ def html_to_markdown(page_source: str) -> str: - pass + return pypandoc.convert_text(page_source, 'md', format='html') ``` Please ensure that `pypandoc` is installed in your environment to use this implementation. If it's not installed, you can install it using `pip install pypandoc`. --- aider/scrape.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aider/scrape.py b/aider/scrape.py index d136d295e..0d576a9e5 100755 --- a/aider/scrape.py +++ b/aider/scrape.py @@ -2,6 +2,7 @@ import sys +import pypandoc import httpx from bs4 import BeautifulSoup from playwright.sync_api import sync_playwright @@ -118,7 +119,7 @@ def html_to_text(page_source: str) -> str: def html_to_markdown(page_source: str) -> str: - pass + return pypandoc.convert_text(page_source, 'md', format='html') def main(url): scraper = Scraper()