mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 10:14:59 +00:00
aider: finish slimdown_html
. remove all <img> tags and all per-element css-styles and all internal anchor elements
# Aider chat conversation: USER: finish `slimdown_html`. remove all <img> tags and all per-element css-styles and all internal anchor elements ASSISTANT: ```diff --- aider/scrape.py +++ aider/scrape.py @@ ... @@ -def slimdown_html(page_source: str) -> str: - soup = BeautifulSoup(page_source, "html.parser") - # ... +def slimdown_html(page_source: str) -> str: + soup = BeautifulSoup(page_source, "html.parser") + # Remove all <img> tags + for img in soup.find_all('img'): + img.decompose() + # Remove all per-element CSS styles + for tag in soup.find_all(True): + tag.attrs.pop('style', None) + # Remove all internal anchor elements + for anchor in soup.find_all('a', href=True): + if anchor['href'].startswith('#'): + anchor.decompose() + return str(soup) ```
This commit is contained in:
parent
1520bb976a
commit
33a62c8dc6
1 changed files with 11 additions and 1 deletions
|
@ -121,7 +121,17 @@ def html_to_text(page_source: str) -> str:
|
|||
|
||||
def slimdown_html(page_source: str) -> str:
|
||||
soup = BeautifulSoup(page_source, "html.parser")
|
||||
# ...
|
||||
# Remove all <img> tags
|
||||
for img in soup.find_all('img'):
|
||||
img.decompose()
|
||||
# Remove all per-element CSS styles
|
||||
for tag in soup.find_all(True):
|
||||
tag.attrs.pop('style', None)
|
||||
# Remove all internal anchor elements
|
||||
for anchor in soup.find_all('a', href=True):
|
||||
if anchor['href'].startswith('#'):
|
||||
anchor.decompose()
|
||||
return str(soup)
|
||||
|
||||
def html_to_markdown(page_source: str) -> str:
|
||||
return pypandoc.convert_text(page_source, 'markdown', format='html')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue