mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-28 08:14:59 +00:00
added docstrings
This commit is contained in:
parent
e922732f0a
commit
b8313c5343
1 changed files with 30 additions and 18 deletions
|
@ -12,6 +12,8 @@ from aider import __version__
|
|||
|
||||
# Client identification string: product name, package version, and project URL.
# NOTE(review): presumably sent as the HTTP User-Agent header by the scraper —
# confirm at the call sites, which are not visible here.
aider_user_agent = f"Aider/{__version__} +https://aider.chat"
|
||||
|
||||
# Playwright is nice because it has a simple way to install dependencies on most
|
||||
# platforms.
|
||||
PLAYWRIGHT_INFO = """
|
||||
For better web scraping, install Playwright chromium with this command in your terminal:
|
||||
|
||||
|
@ -26,12 +28,40 @@ class Scraper:
|
|||
playwright_available = None
|
||||
playwright_instructions_shown = False
|
||||
|
||||
# Public API...
|
||||
def __init__(self, print_error=None):
    """
    Set up the scraper's error reporting.

    `print_error` - a function to call to print error/debug info.
    Any falsy value (including the default `None`) selects the builtin `print`.
    """
    self.print_error = print_error if print_error else print
|
||||
|
||||
def scrape(self, url):
    """
    Fetch a url and turn the page into readable markdown.

    `url` - the URL to scrape.

    Returns the result of `html_to_markdown`, or `None` when the fetch
    produced no content.
    """
    self.try_playwright()

    # Use Playwright when it is flagged as available, otherwise fall back to httpx.
    if self.playwright_available:
        fetch = self.scrape_with_playwright
    else:
        fetch = self.scrape_with_httpx

    html = fetch(url)
    if not html:
        return None

    self.try_pandoc()
    return self.html_to_markdown(html)
|
||||
|
||||
# Internals...
|
||||
def scrape_with_playwright(self, url):
|
||||
with sync_playwright() as p:
|
||||
try:
|
||||
|
@ -88,24 +118,6 @@ class Scraper:
|
|||
self.print_error(f"An error occurred: {err}")
|
||||
return None
|
||||
|
||||
def scrape(self, url):
    """
    Scrape a url and turn it into readable markdown.

    `url` - the URL to scrape.

    Returns `None` if nothing could be fetched.
    """
    self.try_playwright()

    # httpx is the fallback when Playwright is not flagged as available.
    if not self.playwright_available:
        page_source = self.scrape_with_httpx(url)
    else:
        page_source = self.scrape_with_playwright(url)

    if page_source:
        self.try_pandoc()
        return self.html_to_markdown(page_source)

    return None
|
||||
|
||||
def try_pandoc(self):
|
||||
if self.pandoc_available:
|
||||
return
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue