mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 10:14:59 +00:00
Add functionality to write scraped content to a temporary file and optimize HTML parsing.
This commit is contained in:
parent
33a62c8dc6
commit
84ae51c005
1 changed file with 7 additions and 3 deletions
|
@ -94,6 +94,8 @@ class Scraper:
|
||||||
else:
|
else:
|
||||||
content = self.scrape_with_httpx(url)
|
content = self.scrape_with_httpx(url)
|
||||||
|
|
||||||
|
Path('tmp.html').write_text(content)
|
||||||
|
|
||||||
if content:
|
if content:
|
||||||
content = html_to_markdown(content)
|
content = html_to_markdown(content)
|
||||||
#content = html_to_text(content)
|
#content = html_to_text(content)
|
||||||
|
@ -107,8 +109,11 @@ class Scraper:
|
||||||
|
|
||||||
|
|
||||||
def html_to_text(page_source: str) -> str:
|
def html_to_text(page_source: str) -> str:
|
||||||
|
|
||||||
soup = BeautifulSoup(page_source, "html.parser")
|
soup = BeautifulSoup(page_source, "html.parser")
|
||||||
|
|
||||||
|
soup = slimdown_html(soup)
|
||||||
|
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
script.extract()
|
script.extract()
|
||||||
|
|
||||||
|
@ -119,8 +124,7 @@ def html_to_text(page_source: str) -> str:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def slimdown_html(page_source: str) -> str:
|
def slimdown_html(soup):
|
||||||
soup = BeautifulSoup(page_source, "html.parser")
|
|
||||||
# Remove all <img> tags
|
# Remove all <img> tags
|
||||||
for img in soup.find_all('img'):
|
for img in soup.find_all('img'):
|
||||||
img.decompose()
|
img.decompose()
|
||||||
|
@ -131,7 +135,7 @@ def slimdown_html(page_source: str) -> str:
|
||||||
for anchor in soup.find_all('a', href=True):
|
for anchor in soup.find_all('a', href=True):
|
||||||
if anchor['href'].startswith('#'):
|
if anchor['href'].startswith('#'):
|
||||||
anchor.decompose()
|
anchor.decompose()
|
||||||
return str(soup)
|
return soup
|
||||||
|
|
||||||
def html_to_markdown(page_source: str) -> str:
|
def html_to_markdown(page_source: str) -> str:
|
||||||
return pypandoc.convert_text(page_source, 'markdown', format='html')
|
return pypandoc.convert_text(page_source, 'markdown', format='html')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue