output with pandoc, cleanup with bs and re

This commit is contained in:
Paul Gauthier 2024-02-08 14:14:42 -08:00
parent be60b785a4
commit cc36329691
3 changed files with 35 additions and 20 deletions

View file

@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 
+import re
 import sys
 
-import pypandoc
 import httpx
+import pypandoc
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
@@ -94,11 +95,9 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)
 
-        Path('tmp.html').write_text(content)
-
         if content:
             content = html_to_markdown(content)
-            #content = html_to_text(content)
+            # content = html_to_text(content)
 
         return content
@@ -109,11 +108,8 @@ class Scraper:
 def html_to_text(page_source: str) -> str:
     soup = BeautifulSoup(page_source, "html.parser")
 
-    soup = slimdown_html(soup)
-
     for script in soup(["script", "style"]):
         script.extract()
@@ -125,25 +121,38 @@ def html_to_text(page_source: str) -> str:
 
 def slimdown_html(soup):
-    # Remove all <img> tags
-    for img in soup.find_all('img'):
-        img.decompose()
-    # Remove all elements with data: URLs
-    for tag in soup.find_all(href=lambda x: x and x.startswith('data:')):
+    for svg in soup.find_all("svg"):
+        svg.decompose()
+
+    if soup.img:
+        soup.img.decompose()
+
+    for tag in soup.find_all(href=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    for tag in soup.find_all(src=lambda x: x and x.startswith('data:')):
+
+    for tag in soup.find_all(src=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    # Remove all per-element CSS styles
+
     for tag in soup.find_all(True):
-        tag.attrs.pop('style', None)
-    # Remove all internal anchor elements
-    for anchor in soup.find_all('a', href=True):
-        if anchor['href'].startswith('#'):
-            anchor.decompose()
+        tag.attrs.clear()
+
     return soup
 
 
 def html_to_markdown(page_source: str) -> str:
-    return pypandoc.convert_text(page_source, 'markdown', format='html')
+    soup = BeautifulSoup(page_source, "html.parser")
+    soup = slimdown_html(soup)
+    page_source = str(soup)
+
+    md = pypandoc.convert_text(page_source, "markdown", format="html")
+
+    md = re.sub(r"</div>", " ", md)
+    md = re.sub(r"<div>", " ", md)
+
+    md = re.sub(r"\n\s*\n", "\n\n", md)
+
+    return md
 
 
 def main(url):
     scraper = Scraper()

View file

@@ -23,3 +23,5 @@ PyYAML
 Pillow
 diff-match-patch
 playwright
+pypandoc
+pypandoc_binary

View file

@@ -96,6 +96,10 @@ pyee==11.0.1
     # via playwright
 pygments==2.17.2
     # via rich
+pypandoc==1.12
+    # via -r requirements.in
+pypandoc-binary==1.12
+    # via -r requirements.in
 pyyaml==6.0.1
     # via -r requirements.in
 referencing==0.32.0