mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-31 09:44:59 +00:00
added docstrings
This commit is contained in:
parent
e922732f0a
commit
b8313c5343
1 changed files with 30 additions and 18 deletions
|
@ -12,6 +12,8 @@ from aider import __version__
|
||||||
|
|
||||||
aider_user_agent = f"Aider/{__version__} +https://aider.chat"
|
aider_user_agent = f"Aider/{__version__} +https://aider.chat"
|
||||||
|
|
||||||
|
# Playwright is nice because it has a simple way to install dependencies on most
|
||||||
|
# platforms.
|
||||||
PLAYWRIGHT_INFO = """
|
PLAYWRIGHT_INFO = """
|
||||||
For better web scraping, install Playwright chromium with this command in your terminal:
|
For better web scraping, install Playwright chromium with this command in your terminal:
|
||||||
|
|
||||||
|
@ -26,12 +28,40 @@ class Scraper:
|
||||||
playwright_available = None
|
playwright_available = None
|
||||||
playwright_instructions_shown = False
|
playwright_instructions_shown = False
|
||||||
|
|
||||||
|
# Public API...
|
||||||
def __init__(self, print_error=None):
|
def __init__(self, print_error=None):
|
||||||
|
"""
|
||||||
|
`print_error` - a function to call to print error/debug info.
|
||||||
|
"""
|
||||||
if print_error:
|
if print_error:
|
||||||
self.print_error = print_error
|
self.print_error = print_error
|
||||||
else:
|
else:
|
||||||
self.print_error = print
|
self.print_error = print
|
||||||
|
|
||||||
|
def scrape(self, url):
|
||||||
|
"""
|
||||||
|
Scrape a url and turn it into readable markdown.
|
||||||
|
|
||||||
|
`url` - the URL to scrape.
|
||||||
|
"""
|
||||||
|
self.try_playwright()
|
||||||
|
|
||||||
|
if self.playwright_available:
|
||||||
|
content = self.scrape_with_playwright(url)
|
||||||
|
else:
|
||||||
|
content = self.scrape_with_httpx(url)
|
||||||
|
|
||||||
|
if not content:
|
||||||
|
return
|
||||||
|
|
||||||
|
self.try_pandoc()
|
||||||
|
|
||||||
|
content = self.html_to_markdown(content)
|
||||||
|
# content = html_to_text(content)
|
||||||
|
|
||||||
|
return content
|
||||||
|
|
||||||
|
# Internals...
|
||||||
def scrape_with_playwright(self, url):
|
def scrape_with_playwright(self, url):
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
try:
|
try:
|
||||||
|
@ -88,24 +118,6 @@ class Scraper:
|
||||||
self.print_error(f"An error occurred: {err}")
|
self.print_error(f"An error occurred: {err}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def scrape(self, url):
|
|
||||||
self.try_playwright()
|
|
||||||
|
|
||||||
if self.playwright_available:
|
|
||||||
content = self.scrape_with_playwright(url)
|
|
||||||
else:
|
|
||||||
content = self.scrape_with_httpx(url)
|
|
||||||
|
|
||||||
if not content:
|
|
||||||
return
|
|
||||||
|
|
||||||
self.try_pandoc()
|
|
||||||
|
|
||||||
content = self.html_to_markdown(content)
|
|
||||||
# content = html_to_text(content)
|
|
||||||
|
|
||||||
return content
|
|
||||||
|
|
||||||
def try_pandoc(self):
|
def try_pandoc(self):
|
||||||
if self.pandoc_available:
|
if self.pandoc_available:
|
||||||
return
|
return
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue