From cc3632969100db7f9da7d6588253cf885254ce65 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Thu, 8 Feb 2024 14:14:42 -0800
Subject: [PATCH] output with pandoc, cleanup with bs and re

---
 aider/scrape.py  | 49 +++++++++++++++++++++++++++++--------------------
 requirements.in  |  2 ++
 requirements.txt |  4 ++++
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/aider/scrape.py b/aider/scrape.py
index f4c3189e8..58989e913 100755
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 
+import re
 import sys
 
-import pypandoc
 import httpx
+import pypandoc
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
@@ -94,11 +95,9 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)
 
-        Path('tmp.html').write_text(content)
-
         if content:
             content = html_to_markdown(content)
-            #content = html_to_text(content)
+            # content = html_to_text(content)
 
         return content
@@ -109,11 +108,8 @@ def html_to_text(page_source: str) -> str:
-
     soup = BeautifulSoup(page_source, "html.parser")
-    soup = slimdown_html(soup)
-
     for script in soup(["script", "style"]):
         script.extract()
@@ -125,25 +121,38 @@ def slimdown_html(soup):
-    # Remove all <img> tags
-    for img in soup.find_all('img'):
-        img.decompose()
-    # Remove all elements with data: URLs
-    for tag in soup.find_all(href=lambda x: x and x.startswith('data:')):
+    for svg in soup.find_all("svg"):
+        svg.decompose()
+
+    if soup.img:
+        soup.img.decompose()
+
+    for tag in soup.find_all(href=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    for tag in soup.find_all(src=lambda x: x and x.startswith('data:')):
+
+    for tag in soup.find_all(src=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    # Remove all per-element CSS styles
+
     for tag in soup.find_all(True):
-        tag.attrs.pop('style', None)
-    # Remove all internal anchor elements
-    for anchor in soup.find_all('a', href=True):
-        if anchor['href'].startswith('#'):
-            anchor.decompose()
+        tag.attrs.clear()
+
     return soup
 
+
 def html_to_markdown(page_source: str) -> str:
-    return pypandoc.convert_text(page_source, 'markdown', format='html')
+    soup = BeautifulSoup(page_source, "html.parser")
+    soup = slimdown_html(soup)
+    page_source = str(soup)
+
+    md = pypandoc.convert_text(page_source, "markdown", format="html")
+
+    md = re.sub(r"</div>", " ", md)
+    md = re.sub(r"<div>", " ", md)
+
+    md = re.sub(r"\n\s*\n", "\n\n", md)
+
+    return md
+
 
 def main(url):
     scraper = Scraper()
diff --git a/requirements.in b/requirements.in
index 200fc442e..37137af2a 100644
--- a/requirements.in
+++ b/requirements.in
@@ -23,3 +23,5 @@ PyYAML
 Pillow
 diff-match-patch
 playwright
+pypandoc
+pypandoc_binary
diff --git a/requirements.txt b/requirements.txt
index 52545514f..201eb1a0a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -96,6 +96,10 @@ pyee==11.0.1
     # via playwright
 pygments==2.17.2
     # via rich
+pypandoc==1.12
+    # via -r requirements.in
+pypandoc-binary==1.12
+    # via -r requirements.in
 pyyaml==6.0.1
     # via -r requirements.in
 referencing==0.32.0