added [playwright] extra

This commit is contained in:
Paul Gauthier 2024-07-14 19:34:48 +01:00
parent bc1369c480
commit 4fbe3d295a
12 changed files with 91 additions and 140 deletions

View file

@ -9,7 +9,7 @@ import git
from aider import models, prompts, voice
from aider.help import Help, install_help_extra
from aider.llm import litellm
from aider.scrape import Scraper
from aider.scrape import Scraper, install_playwright
from aider.utils import is_image_file
from .dump import dump # noqa: F401
@ -65,17 +65,17 @@ class Commands:
return
if not self.scraper:
self.scraper = Scraper(print_error=self.io.tool_error)
res = install_playwright(self.io)
if not res:
self.io.tool_error("Unable to initialize playwright.")
self.scraper = Scraper(print_error=self.io.tool_error, playwright_available=res)
content = self.scraper.scrape(url) or ""
# if content:
# self.io.tool_output(content)
instructions = self.scraper.get_playwright_instructions()
if instructions:
self.io.tool_error(instructions)
content = f"{url}:\n\n" + content # noqa: E231
content = f"{url}:\n\n" + content
return content

View file

@ -12,14 +12,59 @@ aider_user_agent = f"Aider/{__version__} +{urls.website}"
# Playwright is nice because it has a simple way to install dependencies on most
# platforms.
PLAYWRIGHT_INFO = """
For better web scraping, install Playwright chromium:
def _check_playwright_env():
    """Probe for the playwright package and a launchable chromium.

    Returns a `(has_pip, has_chromium)` tuple of booleans.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        # Without the package there is nothing that could launch chromium,
        # so skip the launch probe instead of calling an unbound name.
        return False, False

    try:
        with sync_playwright() as p:
            p.chromium.launch()
    except Exception as err:
        # Best-effort probe: any launch failure means chromium is unusable.
        dump(err)
        return True, False

    return True, True


def install_playwright(io):
    """Make sure playwright and its chromium browser are usable, offering to install them.

    `io` - object providing `tool_error(text)` for output and
    `confirm_ask(question, default=...)` for the install prompt.

    Returns True when playwright and chromium are available (either already
    present or installed successfully here). Returns None when the user
    declines the install or an install command fails.
    """
    has_pip, has_chromium = _check_playwright_env()
    if has_pip and has_chromium:
        return True

    pip_cmd = utils.get_pip_install(["aider-chat[playwright]"])
    chromium_cmd = "playwright install --with-deps chromium".split()

    # Only list the commands for the pieces that are actually missing.
    cmds = ""
    if not has_pip:
        cmds += " ".join(pip_cmd) + "\n"
    if not has_chromium:
        cmds += " ".join(chromium_cmd) + "\n"

    text = f"""For the best web scraping, install Playwright:
{cmds}
See {urls.enable_playwright} for more info.
"""

    io.tool_error(text)
    if not io.confirm_ask("Install playwright?", default="y"):
        return None

    if not has_pip:
        success, output = utils.run_install(pip_cmd)
        if not success:
            io.tool_error(output)
            return None

    # Reaching this point means the chromium probe did not succeed (a missing
    # package implies a missing browser), so the browser install always runs.
    success, output = utils.run_install(chromium_cmd)
    if not success:
        io.tool_error(output)
        return None

    return True
class Scraper:
pandoc_available = None
@ -27,7 +72,7 @@ class Scraper:
playwright_instructions_shown = False
# Public API...
def __init__(self, print_error=None):
def __init__(self, print_error=None, playwright_available=None):
"""
`print_error` - a function to call to print error/debug info.
"""
@ -36,13 +81,14 @@ class Scraper:
else:
self.print_error = print
self.playwright_available = playwright_available
def scrape(self, url):
"""
Scrape a url and turn it into readable markdown.
`url` - the URLto scrape.
"""
self.try_playwright()
if self.playwright_available:
content = self.scrape_with_playwright(url)
@ -88,46 +134,8 @@ class Scraper:
return content
def try_playwright(self):
# Probe for the playwright package and a launchable chromium, and record a
# successful probe in self.playwright_available.
# Skip the probe once availability has already been decided (True or False).
if self.playwright_available is not None:
return
# Step 1: is the playwright Python package importable?
try:
from playwright.sync_api import sync_playwright
has_pip = True
except ImportError:
has_pip = False
# Step 2: can chromium actually be launched?
# If the import above failed, `sync_playwright` is unbound here; the error
# that raises is swallowed by the broad `except Exception` below.
try:
with sync_playwright() as p:
p.chromium.launch()
has_chromium = True
except Exception:
has_chromium = False
# NOTE(review): only the success case is stored. Nothing in this method
# assigns False, and execution continues past this point either way.
if has_pip and has_chromium:
self.playwright_available = True
# Build the install commands for whichever pieces are missing.
pip_cmd = utils.get_pip_cmd("playwright")
chromium_cmd = "playwright install --with-deps chromium".split()
cmds = ""
if not has_pip:
cmds += " ".join(pip_cmd) + "\n"
if not has_chromium:
cmds += " ".join(chromium_cmd) + "\n"
# NOTE(review): `text` is built but neither used nor returned in this method.
text = PLAYWRIGHT_INFO.format(cmds=cmds)
def get_playwright_instructions(self):
if self.playwright_available in (True, None):
return
if self.playwright_instructions_shown:
return
self.playwright_instructions_shown = True
return PLAYWRIGHT_INFO
return
def scrape_with_httpx(self, url):
import httpx

View file

@ -224,18 +224,21 @@ def run_install(cmd):
last_update = current_time
return_code = process.wait()
output = "".join(output)
dump(output)
if return_code == 0:
print("\rInstallation complete.")
print()
return True, ''.join(output)
return True, output
except subprocess.CalledProcessError as e:
print(f"\nError running pip install: {e}")
print("\nInstallation failed.\n")
return False, ''.join(output)
return False, output
def check_pip_install_extra(io, module, prompt, pip_install_cmd):
@ -263,8 +266,7 @@ def check_pip_install_extra(io, module, prompt, pip_install_cmd):
except (ImportError, ModuleNotFoundError):
pass
for line in output:
print(line)
io.tool_error(output)
print()
print(f"Failed to install {pip_install_cmd[0]}")