mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-31 01:35:00 +00:00
Add functionality to write scraped content to a temporary file and optimize HTML parsing.
This commit is contained in:
parent
33a62c8dc6
commit
84ae51c005
1 changed files with 7 additions and 3 deletions
|
@ -94,6 +94,8 @@ class Scraper:
|
|||
else:
|
||||
content = self.scrape_with_httpx(url)
|
||||
|
||||
Path('tmp.html').write_text(content)
|
||||
|
||||
if content:
|
||||
content = html_to_markdown(content)
|
||||
#content = html_to_text(content)
|
||||
|
@ -107,8 +109,11 @@ class Scraper:
|
|||
|
||||
|
||||
def html_to_text(page_source: str) -> str:
|
||||
|
||||
soup = BeautifulSoup(page_source, "html.parser")
|
||||
|
||||
soup = slimdown_html(soup)
|
||||
|
||||
for script in soup(["script", "style"]):
|
||||
script.extract()
|
||||
|
||||
|
@ -119,8 +124,7 @@ def html_to_text(page_source: str) -> str:
|
|||
return text
|
||||
|
||||
|
||||
def slimdown_html(page_source: str) -> str:
|
||||
soup = BeautifulSoup(page_source, "html.parser")
|
||||
def slimdown_html(soup):
|
||||
# Remove all <img> tags
|
||||
for img in soup.find_all('img'):
|
||||
img.decompose()
|
||||
|
@ -131,7 +135,7 @@ def slimdown_html(page_source: str) -> str:
|
|||
for anchor in soup.find_all('a', href=True):
|
||||
if anchor['href'].startswith('#'):
|
||||
anchor.decompose()
|
||||
return str(soup)
|
||||
return soup
|
||||
|
||||
def html_to_markdown(page_source: str) -> str:
    """Convert an HTML document to Markdown using pandoc (via pypandoc).

    :param page_source: raw HTML markup as a string
    :return: the Markdown rendering produced by pandoc
    """
    return pypandoc.convert_text(page_source, 'markdown', format='html')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue