mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-02 18:54:59 +00:00
output with pandoc, cleanup with bs and re
This commit is contained in:
parent
be60b785a4
commit
cc36329691
3 changed files with 35 additions and 20 deletions
|
@ -1,9 +1,10 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import pypandoc
|
|
||||||
import httpx
|
import httpx
|
||||||
|
import pypandoc
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
@ -94,11 +95,9 @@ class Scraper:
|
||||||
else:
|
else:
|
||||||
content = self.scrape_with_httpx(url)
|
content = self.scrape_with_httpx(url)
|
||||||
|
|
||||||
Path('tmp.html').write_text(content)
|
|
||||||
|
|
||||||
if content:
|
if content:
|
||||||
content = html_to_markdown(content)
|
content = html_to_markdown(content)
|
||||||
#content = html_to_text(content)
|
# content = html_to_text(content)
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
@ -109,11 +108,8 @@ class Scraper:
|
||||||
|
|
||||||
|
|
||||||
def html_to_text(page_source: str) -> str:
|
def html_to_text(page_source: str) -> str:
|
||||||
|
|
||||||
soup = BeautifulSoup(page_source, "html.parser")
|
soup = BeautifulSoup(page_source, "html.parser")
|
||||||
|
|
||||||
soup = slimdown_html(soup)
|
|
||||||
|
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
script.extract()
|
script.extract()
|
||||||
|
|
||||||
|
@ -125,25 +121,38 @@ def html_to_text(page_source: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def slimdown_html(soup):
|
def slimdown_html(soup):
|
||||||
# Remove all <img> tags
|
for svg in soup.find_all("svg"):
|
||||||
for img in soup.find_all('img'):
|
svg.decompose()
|
||||||
img.decompose()
|
|
||||||
# Remove all elements with data: URLs
|
if soup.img:
|
||||||
for tag in soup.find_all(href=lambda x: x and x.startswith('data:')):
|
soup.img.decompose()
|
||||||
|
|
||||||
|
for tag in soup.find_all(href=lambda x: x and x.startswith("data:")):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
for tag in soup.find_all(src=lambda x: x and x.startswith('data:')):
|
|
||||||
|
for tag in soup.find_all(src=lambda x: x and x.startswith("data:")):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
# Remove all per-element CSS styles
|
|
||||||
for tag in soup.find_all(True):
|
for tag in soup.find_all(True):
|
||||||
tag.attrs.pop('style', None)
|
tag.attrs.clear()
|
||||||
# Remove all internal anchor elements
|
|
||||||
for anchor in soup.find_all('a', href=True):
|
|
||||||
if anchor['href'].startswith('#'):
|
|
||||||
anchor.decompose()
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
def html_to_markdown(page_source: str) -> str:
|
def html_to_markdown(page_source: str) -> str:
|
||||||
return pypandoc.convert_text(page_source, 'markdown', format='html')
|
soup = BeautifulSoup(page_source, "html.parser")
|
||||||
|
soup = slimdown_html(soup)
|
||||||
|
page_source = str(soup)
|
||||||
|
|
||||||
|
md = pypandoc.convert_text(page_source, "markdown", format="html")
|
||||||
|
|
||||||
|
md = re.sub(r"</div>", " ", md)
|
||||||
|
md = re.sub(r"<div>", " ", md)
|
||||||
|
|
||||||
|
md = re.sub(r"\n\s*\n", "\n\n", md)
|
||||||
|
|
||||||
|
return md
|
||||||
|
|
||||||
|
|
||||||
def main(url):
|
def main(url):
|
||||||
scraper = Scraper()
|
scraper = Scraper()
|
||||||
|
|
|
@ -23,3 +23,5 @@ PyYAML
|
||||||
Pillow
|
Pillow
|
||||||
diff-match-patch
|
diff-match-patch
|
||||||
playwright
|
playwright
|
||||||
|
pypandoc
|
||||||
|
pypandoc_binary
|
||||||
|
|
|
@ -96,6 +96,10 @@ pyee==11.0.1
|
||||||
# via playwright
|
# via playwright
|
||||||
pygments==2.17.2
|
pygments==2.17.2
|
||||||
# via rich
|
# via rich
|
||||||
|
pypandoc==1.12
|
||||||
|
# via -r requirements.in
|
||||||
|
pypandoc-binary==1.12
|
||||||
|
# via -r requirements.in
|
||||||
pyyaml==6.0.1
|
pyyaml==6.0.1
|
||||||
# via -r requirements.in
|
# via -r requirements.in
|
||||||
referencing==0.32.0
|
referencing==0.32.0
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue