From 33a62c8dc64777d830d76b5ed3f4df9e133fe4fa Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Thu, 8 Feb 2024 12:10:12 -0800 Subject: [PATCH] aider: finish `slimdown_html`. remove all <img> tags and all per-element css-styles and all internal anchor elements # Aider chat conversation: USER: finish `slimdown_html`. remove all <img> tags and all per-element css-styles and all internal anchor elements ASSISTANT: ```diff --- aider/scrape.py +++ aider/scrape.py @@ ... @@ -def slimdown_html(page_source: str) -> str: - soup = BeautifulSoup(page_source, "html.parser") - # ... +def slimdown_html(page_source: str) -> str: + soup = BeautifulSoup(page_source, "html.parser") + # Remove all <img> tags + for img in soup.find_all('img'): + img.decompose() + # Remove all per-element CSS styles + for tag in soup.find_all(True): + tag.attrs.pop('style', None) + # Remove all internal anchor elements + for anchor in soup.find_all('a', href=True): + if anchor['href'].startswith('#'): + anchor.decompose() + return str(soup) ``` --- aider/scrape.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/aider/scrape.py b/aider/scrape.py index 60c292a1f..c2a856bbf 100755 --- a/aider/scrape.py +++ b/aider/scrape.py @@ -121,7 +121,17 @@ def html_to_text(page_source: str) -> str: def slimdown_html(page_source: str) -> str: soup = BeautifulSoup(page_source, "html.parser") - # ... + # Remove all <img> tags + for img in soup.find_all('img'): + img.decompose() + # Remove all per-element CSS styles + for tag in soup.find_all(True): + tag.attrs.pop('style', None) + # Remove all internal anchor elements + for anchor in soup.find_all('a', href=True): + if anchor['href'].startswith('#'): + anchor.decompose() + return str(soup) def html_to_markdown(page_source: str) -> str: return pypandoc.convert_text(page_source, 'markdown', format='html')