mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-30 17:24:59 +00:00
Change html_to_text to html_to_markdown and add slimdown_html function.
This commit is contained in:
parent
1509e21698
commit
1520bb976a
1 changed file with 7 additions and 2 deletions
|
@@ -95,7 +95,8 @@ class Scraper:
|
||||||
content = self.scrape_with_httpx(url)
|
content = self.scrape_with_httpx(url)
|
||||||
|
|
||||||
if content:
|
if content:
|
||||||
content = html_to_text(content)
|
content = html_to_markdown(content)
|
||||||
|
#content = html_to_text(content)
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
@@ -118,8 +119,12 @@ def html_to_text(page_source: str) -> str:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def slimdown_html(page_source: str) -> str:
|
||||||
|
soup = BeautifulSoup(page_source, "html.parser")
|
||||||
|
# ...
|
||||||
|
|
||||||
def html_to_markdown(page_source: str) -> str:
|
def html_to_markdown(page_source: str) -> str:
|
||||||
return pypandoc.convert_text(page_source, 'md', format='html')
|
return pypandoc.convert_text(page_source, 'markdown', format='html')
|
||||||
|
|
||||||
def main(url):
|
def main(url):
|
||||||
scraper = Scraper()
|
scraper = Scraper()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue