added [playwright] extra

This commit is contained in:
Paul Gauthier 2024-07-14 19:34:48 +01:00
parent bc1369c480
commit 4fbe3d295a
12 changed files with 91 additions and 140 deletions

View file

@ -9,7 +9,7 @@ import git
from aider import models, prompts, voice from aider import models, prompts, voice
from aider.help import Help, install_help_extra from aider.help import Help, install_help_extra
from aider.llm import litellm from aider.llm import litellm
from aider.scrape import Scraper from aider.scrape import Scraper, install_playwright
from aider.utils import is_image_file from aider.utils import is_image_file
from .dump import dump # noqa: F401 from .dump import dump # noqa: F401
@ -65,17 +65,17 @@ class Commands:
return return
if not self.scraper: if not self.scraper:
self.scraper = Scraper(print_error=self.io.tool_error) res = install_playwright(self.io)
if not res:
self.io.tool_error("Unable to initialize playwright.")
self.scraper = Scraper(print_error=self.io.tool_error, playwright_available=res)
content = self.scraper.scrape(url) or "" content = self.scraper.scrape(url) or ""
# if content: # if content:
# self.io.tool_output(content) # self.io.tool_output(content)
instructions = self.scraper.get_playwright_instructions() content = f"{url}:\n\n" + content
if instructions:
self.io.tool_error(instructions)
content = f"{url}:\n\n" + content # noqa: E231
return content return content

View file

@ -12,14 +12,59 @@ aider_user_agent = f"Aider/{__version__} +{urls.website}"
# Playwright is nice because it has a simple way to install dependencies on most # Playwright is nice because it has a simple way to install dependencies on most
# platforms. # platforms.
PLAYWRIGHT_INFO = """
For better web scraping, install Playwright chromium:
def install_playwright(io):
try:
from playwright.sync_api import sync_playwright
has_pip = True
except ImportError:
has_pip = False
try:
with sync_playwright() as p:
p.chromium.launch()
has_chromium = True
except Exception as err:
dump(err)
has_chromium = False
if has_pip and has_chromium:
return True
pip_cmd = utils.get_pip_install(["aider-chat[playwright]"])
chromium_cmd = "playwright install --with-deps chromium".split()
cmds = ""
if not has_pip:
cmds += " ".join(pip_cmd) + "\n"
if not has_chromium:
cmds += " ".join(chromium_cmd) + "\n"
text = f"""For the best web scraping, install Playwright:
{cmds} {cmds}
See {urls.enable_playwright} for more info. See {urls.enable_playwright} for more info.
""" """
io.tool_error(text)
if not io.confirm_ask("Install playwright?", default="y"):
return
if not has_pip:
success, output = utils.run_install(pip_cmd)
if not success:
io.tool_error(output)
return
success, output = utils.run_install(chromium_cmd)
if not success:
io.tool_error(output)
return
return True
class Scraper: class Scraper:
pandoc_available = None pandoc_available = None
@ -27,7 +72,7 @@ class Scraper:
playwright_instructions_shown = False playwright_instructions_shown = False
# Public API... # Public API...
def __init__(self, print_error=None): def __init__(self, print_error=None, playwright_available=None):
""" """
`print_error` - a function to call to print error/debug info. `print_error` - a function to call to print error/debug info.
""" """
@ -36,13 +81,14 @@ class Scraper:
else: else:
self.print_error = print self.print_error = print
self.playwright_available = playwright_available
def scrape(self, url): def scrape(self, url):
""" """
Scrape a url and turn it into readable markdown. Scrape a url and turn it into readable markdown.
`url` - the URLto scrape. `url` - the URLto scrape.
""" """
self.try_playwright()
if self.playwright_available: if self.playwright_available:
content = self.scrape_with_playwright(url) content = self.scrape_with_playwright(url)
@ -88,46 +134,8 @@ class Scraper:
return content return content
def try_playwright(self):
if self.playwright_available is not None:
return
try:
from playwright.sync_api import sync_playwright
has_pip = True
except ImportError:
has_pip = False
try:
with sync_playwright() as p:
p.chromium.launch()
has_chromium = True
except Exception:
has_chromium = False
if has_pip and has_chromium:
self.playwright_available = True
pip_cmd = utils.get_pip_cmd("playwright")
chromium_cmd = "playwright install --with-deps chromium".split()
cmds = ""
if not has_pip:
cmds += " ".join(pip_cmd) + "\n"
if not has_chromium:
cmds += " ".join(chromium_cmd) + "\n"
text = PLAYWRIGHT_INFO.format(cmds=cmds)
def get_playwright_instructions(self): def get_playwright_instructions(self):
if self.playwright_available in (True, None): return
return
if self.playwright_instructions_shown:
return
self.playwright_instructions_shown = True
return PLAYWRIGHT_INFO
def scrape_with_httpx(self, url): def scrape_with_httpx(self, url):
import httpx import httpx

View file

@ -224,18 +224,21 @@ def run_install(cmd):
last_update = current_time last_update = current_time
return_code = process.wait() return_code = process.wait()
output = "".join(output)
dump(output)
if return_code == 0: if return_code == 0:
print("\rInstallation complete.") print("\rInstallation complete.")
print() print()
return True, ''.join(output) return True, output
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"\nError running pip install: {e}") print(f"\nError running pip install: {e}")
print("\nInstallation failed.\n") print("\nInstallation failed.\n")
return False, ''.join(output) return False, output
def check_pip_install_extra(io, module, prompt, pip_install_cmd): def check_pip_install_extra(io, module, prompt, pip_install_cmd):
@ -263,8 +266,7 @@ def check_pip_install_extra(io, module, prompt, pip_install_cmd):
except (ImportError, ModuleNotFoundError): except (ImportError, ModuleNotFoundError):
pass pass
for line in output: io.tool_error(output)
print(line)
print() print()
print(f"Failed to install {pip_install_cmd[0]}") print(f"Failed to install {pip_install_cmd[0]}")

View file

@ -23,8 +23,6 @@ backoff==2.2.1
# via -r requirements/requirements.in # via -r requirements/requirements.in
beautifulsoup4==4.12.3 beautifulsoup4==4.12.3
# via -r requirements/requirements.in # via -r requirements/requirements.in
cachetools==5.3.3
# via google-auth
certifi==2024.7.4 certifi==2024.7.4
# via # via
# httpcore # httpcore
@ -60,48 +58,12 @@ gitdb==4.0.11
# via gitpython # via gitpython
gitpython==3.1.43 gitpython==3.1.43
# via -r requirements/requirements.in # via -r requirements/requirements.in
google-ai-generativelanguage==0.6.6
# via google-generativeai
google-api-core[grpc]==2.19.1
# via
# google-ai-generativelanguage
# google-api-python-client
# google-generativeai
google-api-python-client==2.136.0
# via google-generativeai
google-auth==2.31.0
# via
# google-ai-generativelanguage
# google-api-core
# google-api-python-client
# google-auth-httplib2
# google-generativeai
google-auth-httplib2==0.2.0
# via google-api-python-client
google-generativeai==0.7.1
# via -r requirements/requirements.in
googleapis-common-protos==1.63.2
# via
# google-api-core
# grpcio-status
greenlet==3.0.3
# via playwright
grep-ast==0.3.2 grep-ast==0.3.2
# via -r requirements/requirements.in # via -r requirements/requirements.in
grpcio==1.64.1
# via
# google-api-core
# grpcio-status
grpcio-status==1.62.2
# via google-api-core
h11==0.14.0 h11==0.14.0
# via httpcore # via httpcore
httpcore==1.0.5 httpcore==1.0.5
# via httpx # via httpx
httplib2==0.22.0
# via
# google-api-python-client
# google-auth-httplib2
httpx==0.27.0 httpx==0.27.0
# via openai # via openai
huggingface-hub==0.23.4 huggingface-hub==0.23.4
@ -158,49 +120,24 @@ pathspec==0.12.1
# grep-ast # grep-ast
pillow==10.4.0 pillow==10.4.0
# via -r requirements/requirements.in # via -r requirements/requirements.in
playwright==1.45.0
# via -r requirements/requirements.in
prompt-toolkit==3.0.47 prompt-toolkit==3.0.47
# via -r requirements/requirements.in # via -r requirements/requirements.in
proto-plus==1.24.0
# via
# google-ai-generativelanguage
# google-api-core
protobuf==4.25.3
# via
# google-ai-generativelanguage
# google-api-core
# google-generativeai
# googleapis-common-protos
# grpcio-status
# proto-plus
pyasn1==0.6.0
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.4.0
# via google-auth
pycodestyle==2.12.0 pycodestyle==2.12.0
# via flake8 # via flake8
pycparser==2.22 pycparser==2.22
# via cffi # via cffi
pydantic==2.8.2 pydantic==2.8.2
# via # via
# google-generativeai
# litellm # litellm
# openai # openai
pydantic-core==2.20.1 pydantic-core==2.20.1
# via pydantic # via pydantic
pyee==11.1.0
# via playwright
pyflakes==3.2.0 pyflakes==3.2.0
# via flake8 # via flake8
pygments==2.18.0 pygments==2.18.0
# via rich # via rich
pypandoc==1.13 pypandoc==1.13
# via -r requirements/requirements.in # via -r requirements/requirements.in
pyparsing==3.1.2
# via httplib2
python-dotenv==1.0.1 python-dotenv==1.0.1
# via litellm # via litellm
pyyaml==6.0.1 pyyaml==6.0.1
@ -215,7 +152,6 @@ regex==2024.5.15
# via tiktoken # via tiktoken
requests==2.32.3 requests==2.32.3
# via # via
# google-api-core
# huggingface-hub # huggingface-hub
# litellm # litellm
# tiktoken # tiktoken
@ -225,8 +161,6 @@ rpds-py==0.18.1
# via # via
# jsonschema # jsonschema
# referencing # referencing
rsa==4.9
# via google-auth
scipy==1.13.1 scipy==1.13.1
# via -r requirements/requirements.in # via -r requirements/requirements.in
smmap==5.0.1 smmap==5.0.1
@ -248,7 +182,6 @@ tokenizers==0.19.1
# via litellm # via litellm
tqdm==4.66.4 tqdm==4.66.4
# via # via
# google-generativeai
# huggingface-hub # huggingface-hub
# openai # openai
tree-sitter==0.21.3 tree-sitter==0.21.3
@ -259,14 +192,10 @@ tree-sitter-languages==1.10.2
# via grep-ast # via grep-ast
typing-extensions==4.12.2 typing-extensions==4.12.2
# via # via
# google-generativeai
# huggingface-hub # huggingface-hub
# openai # openai
# pydantic # pydantic
# pydantic-core # pydantic-core
# pyee
uritemplate==4.1.1
# via google-api-python-client
urllib3==2.2.2 urllib3==2.2.2
# via requests # via requests
wcwidth==0.2.13 wcwidth==0.2.13

View file

@ -14,9 +14,7 @@ attrs==23.2.0
blinker==1.8.2 blinker==1.8.2
# via streamlit # via streamlit
cachetools==5.3.3 cachetools==5.3.3
# via # via streamlit
# -c requirements/../requirements.txt
# streamlit
certifi==2024.7.4 certifi==2024.7.4
# via # via
# -c requirements/../requirements.txt # -c requirements/../requirements.txt
@ -88,9 +86,7 @@ pillow==10.4.0
# -c requirements/../requirements.txt # -c requirements/../requirements.txt
# streamlit # streamlit
protobuf==4.25.3 protobuf==4.25.3
# via # via streamlit
# -c requirements/../requirements.txt
# streamlit
pyarrow==16.1.0 pyarrow==16.1.0
# via streamlit # via streamlit
pydeck==0.9.1 pydeck==0.9.1

View file

@ -124,9 +124,7 @@ pygments==2.18.0
# rich # rich
# sphinx # sphinx
pyparsing==3.1.2 pyparsing==3.1.2
# via # via matplotlib
# -c requirements/../requirements.txt
# matplotlib
pyproject-hooks==1.1.0 pyproject-hooks==1.1.0
# via # via
# build # build

View file

@ -68,9 +68,7 @@ fsspec==2024.6.1
# llama-index-core # llama-index-core
# torch # torch
greenlet==3.0.3 greenlet==3.0.3
# via # via sqlalchemy
# -c requirements/../requirements.txt
# sqlalchemy
h11==0.14.0 h11==0.14.0
# via # via
# -c requirements/../requirements.txt # -c requirements/../requirements.txt

View file

@ -0,0 +1,3 @@
-c ../requirements.txt
playwright

View file

@ -0,0 +1,16 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile --output-file=requirements/requirements-playwright.txt requirements/requirements-playwright.in
#
greenlet==3.0.3
# via playwright
playwright==1.45.0
# via -r requirements/requirements-playwright.in
pyee==11.1.0
# via playwright
typing-extensions==4.12.2
# via
# -c requirements/../requirements.txt
# pyee

View file

@ -19,7 +19,6 @@ beautifulsoup4
PyYAML PyYAML
Pillow Pillow
diff-match-patch diff-match-patch
playwright
pypandoc pypandoc
litellm litellm
flake8 flake8

View file

@ -8,7 +8,7 @@ pip-compile \
--output-file=requirements.txt \ --output-file=requirements.txt \
$1 $1
for SUFFIX in dev hf-embed browser; do for SUFFIX in dev hf-embed browser playwright; do
pip-compile \ pip-compile \
requirements/requirements-${SUFFIX}.in \ requirements/requirements-${SUFFIX}.in \

View file

@ -23,6 +23,7 @@ requirements = get_requirements()
dev_requirements = get_requirements("dev") dev_requirements = get_requirements("dev")
hf_requirements = get_requirements("hf-embed") hf_requirements = get_requirements("hf-embed")
browser_requirements = get_requirements("browser") browser_requirements = get_requirements("browser")
playwright_requirements = get_requirements("playwright")
# README # README
with open("README.md", "r", encoding="utf-8") as f: with open("README.md", "r", encoding="utf-8") as f:
@ -49,6 +50,7 @@ setup(
"dev": dev_requirements, "dev": dev_requirements,
"hf-embed": hf_requirements, "hf-embed": hf_requirements,
"browser": browser_requirements, "browser": browser_requirements,
"playwright": playwright_requirements,
}, },
python_requires=">=3.9,<3.13", python_requires=">=3.9,<3.13",
entry_points={ entry_points={