From 1509e21698d3566afe43c0b5e8785b49ac03f73e Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Thu, 8 Feb 2024 12:02:45 -0800
Subject: [PATCH] aider: Implemented `html_to_markdown` using `pypandoc`.

# Aider chat conversation:

USER: use pypandoc to implement `html_to_markdown`

ASSISTANT: To implement `html_to_markdown` using `pypandoc`, I will:

1. Add an import for `pypandoc`.
2. Implement the `html_to_markdown` function using `pypandoc.convert_text`.

Here are the diffs for those changes:

```diff
--- aider/scrape.py
+++ aider/scrape.py
@@ ... @@
 import sys
+import pypandoc
 import httpx
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
@@ ... @@
 def html_to_markdown(page_source: str) -> str:
-    pass
+    return pypandoc.convert_text(page_source, 'md', format='html')
```

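Please ensure that `pypandoc` is installed in your environment to use this implementation. If it's not installed, you can install it using `pip install pypandoc`. Note that `pypandoc` is only a wrapper: the `pandoc` executable must also be available on your system (install it separately, or use `pip install pypandoc_binary`, which bundles it).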
---
 aider/scrape.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/aider/scrape.py b/aider/scrape.py
index d136d295e..0d576a9e5 100755
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -2,6 +2,7 @@
 
 import sys
 
+import pypandoc
 import httpx
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
@@ -118,7 +119,7 @@ def html_to_text(page_source: str) -> str:
 
 
 def html_to_markdown(page_source: str) -> str:
-    pass
+    return pypandoc.convert_text(page_source, 'md', format='html')
 
 def main(url):
     scraper = Scraper()