Output markdown with pandoc; clean up the HTML with BeautifulSoup and the result with re

2025-05-31 01:35:00 +00:00 · 2024-02-08 14:14:42 -08:00 · 2024-02-08 14:14:42 -08:00 · cc36329691
commit cc36329691
parent be60b785a4
3 changed files with 35 additions and 20 deletions
--- a/aider/scrape.py
+++ b/aider/scrape.py
@ -1,9 +1,10 @@
 #!/usr/bin/env python

+import re
 import sys

-import pypandoc
 import httpx
+import pypandoc
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright

@ -94,11 +95,9 @@ class Scraper:
        else:
            content = self.scrape_with_httpx(url)

-        Path('tmp.html').write_text(content)
-
        if content:
            content = html_to_markdown(content)
-            #content = html_to_text(content)
+            # content = html_to_text(content)

        return content

@ -109,11 +108,8 @@ class Scraper:


 def html_to_text(page_source: str) -> str:
-
    soup = BeautifulSoup(page_source, "html.parser")

-    soup = slimdown_html(soup)
-
    for script in soup(["script", "style"]):
        script.extract()

@ -125,25 +121,38 @@ def html_to_text(page_source: str) -> str:


 def slimdown_html(soup):
-    # Remove all <img> tags
-    for img in soup.find_all('img'):
-        img.decompose()
-    # Remove all elements with data: URLs
-    for tag in soup.find_all(href=lambda x: x and x.startswith('data:')):
+    for svg in soup.find_all("svg"):
+        svg.decompose()
+
+    if soup.img:
+        soup.img.decompose()
+
+    for tag in soup.find_all(href=lambda x: x and x.startswith("data:")):
        tag.decompose()
-    for tag in soup.find_all(src=lambda x: x and x.startswith('data:')):
+
+    for tag in soup.find_all(src=lambda x: x and x.startswith("data:")):
        tag.decompose()
-    # Remove all per-element CSS styles
+
    for tag in soup.find_all(True):
-        tag.attrs.pop('style', None)
-    # Remove all internal anchor elements
-    for anchor in soup.find_all('a', href=True):
-        if anchor['href'].startswith('#'):
-            anchor.decompose()
+        tag.attrs.clear()
+
    return soup

+
 def html_to_markdown(page_source: str) -> str:
-    return pypandoc.convert_text(page_source, 'markdown', format='html')
+    soup = BeautifulSoup(page_source, "html.parser")
+    soup = slimdown_html(soup)
+    page_source = str(soup)
+
+    md = pypandoc.convert_text(page_source, "markdown", format="html")
+
+    md = re.sub(r"</div>", "      ", md)
+    md = re.sub(r"<div>", "     ", md)
+
+    md = re.sub(r"\n\s*\n", "\n\n", md)
+
+    return md
+

 def main(url):
    scraper = Scraper()
--- a/requirements.in
+++ b/requirements.in
@ -23,3 +23,5 @@ PyYAML
 Pillow
 diff-match-patch
 playwright
+pypandoc
+pypandoc_binary
--- a/requirements.txt
+++ b/requirements.txt
@ -96,6 +96,10 @@ pyee==11.0.1
    # via playwright
 pygments==2.17.2
    # via rich
+pypandoc==1.12
+    # via -r requirements.in
+pypandoc-binary==1.12
+    # via -r requirements.in
 pyyaml==6.0.1
    # via -r requirements.in
 referencing==0.32.0