mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-31 17:55:01 +00:00
feat: Modify scrape method to only convert HTML to markdown
This commit is contained in:
parent
f896d93b28
commit
c0982af02c
1 changed file with 7 additions and 5 deletions
|
@@ -87,9 +87,10 @@ class Scraper:
|
||||||
|
|
||||||
def scrape(self, url):
|
def scrape(self, url):
|
||||||
"""
|
"""
|
||||||
Scrape a url and turn it into readable markdown.
|
Scrape a url and turn it into readable markdown if it's HTML.
|
||||||
|
If it's plain text or non-HTML, return it as-is.
|
||||||
|
|
||||||
`url` - the URLto scrape.
|
`url` - the URL to scrape.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.playwright_available:
|
if self.playwright_available:
|
||||||
|
@@ -101,9 +102,10 @@ class Scraper:
|
||||||
self.print_error(f"Failed to retrieve content from {url}")
|
self.print_error(f"Failed to retrieve content from {url}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self.try_pandoc()
|
# Check if the content is HTML
|
||||||
|
if content.strip().startswith(('<html', '<!DOCTYPE html')):
|
||||||
content = self.html_to_markdown(content)
|
self.try_pandoc()
|
||||||
|
content = self.html_to_markdown(content)
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue