From cc3632969100db7f9da7d6588253cf885254ce65 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Thu, 8 Feb 2024 14:14:42 -0800
Subject: [PATCH] output with pandoc, cleanup with bs and re

---
 aider/scrape.py  | 49 +++++++++++++++++++++++++++++--------------------
 requirements.in  |  2 ++
 requirements.txt |  4 ++++
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/aider/scrape.py b/aider/scrape.py
index f4c3189e8..58989e913 100755
--- a/aider/scrape.py
+++ b/aider/scrape.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 
+import re
 import sys
 
-import pypandoc
 import httpx
+import pypandoc
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
@@ -94,11 +95,9 @@ class Scraper:
         else:
             content = self.scrape_with_httpx(url)
 
-        Path('tmp.html').write_text(content)
-
         if content:
             content = html_to_markdown(content)
-            #content = html_to_text(content)
+            # content = html_to_text(content)
 
         return content
@@ -109,11 +108,8 @@ def html_to_text(page_source: str) -> str:
-
     soup = BeautifulSoup(page_source, "html.parser")
-    soup = slimdown_html(soup)
-
     for script in soup(["script", "style"]):
         script.extract()
@@ -125,25 +121,38 @@ def slimdown_html(soup):
-    # Remove all <img> tags
-    for img in soup.find_all('img'):
-        img.decompose()
-    # Remove all elements with data: URLs
-    for tag in soup.find_all(href=lambda x: x and x.startswith('data:')):
+    for svg in soup.find_all("svg"):
+        svg.decompose()
+
+    if soup.img:
+        soup.img.decompose()
+
+    for tag in soup.find_all(href=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    for tag in soup.find_all(src=lambda x: x and x.startswith('data:')):
+
+    for tag in soup.find_all(src=lambda x: x and x.startswith("data:")):
         tag.decompose()
-    # Remove all per-element CSS styles
+
     for tag in soup.find_all(True):
-        tag.attrs.pop('style', None)
-    # Remove all internal anchor elements
-    for anchor in soup.find_all('a', href=True):
-        if anchor['href'].startswith('#'):
-            anchor.decompose()
+        tag.attrs.clear()
+
     return soup
 
+
 def html_to_markdown(page_source: str) -> str:
-    return pypandoc.convert_text(page_source, 'markdown', format='html')
+    soup = BeautifulSoup(page_source, "html.parser")
+    soup = slimdown_html(soup)
+    page_source = str(soup)
+
+    md = pypandoc.convert_text(page_source, "markdown", format="html")
+
+    md = re.sub(r"</div>", " ", md)
+    md = re.sub(r"<div>", " ", md)
+
+    md = re.sub(r"\n\s*\n", "\n\n", md)
+
+    return md
+
 
 def main(url):
     scraper = Scraper()
diff --git a/requirements.in b/requirements.in
index 200fc442e..37137af2a 100644
--- a/requirements.in
+++ b/requirements.in
@@ -23,3 +23,5 @@ PyYAML
 Pillow
 diff-match-patch
 playwright
+pypandoc
+pypandoc_binary
diff --git a/requirements.txt b/requirements.txt
index 52545514f..201eb1a0a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -96,6 +96,10 @@ pyee==11.0.1
     # via playwright
 pygments==2.17.2
     # via rich
+pypandoc==1.12
+    # via -r requirements.in
+pypandoc-binary==1.12
+    # via -r requirements.in
 pyyaml==6.0.1
     # via -r requirements.in
 referencing==0.32.0