diff --git a/aider/commands.py b/aider/commands.py index d964733e9..702deaf70 100644 --- a/aider/commands.py +++ b/aider/commands.py @@ -9,7 +9,7 @@ import git from aider import models, prompts, voice from aider.help import Help, install_help_extra from aider.llm import litellm -from aider.scrape import Scraper +from aider.scrape import Scraper, install_playwright from aider.utils import is_image_file from .dump import dump # noqa: F401 @@ -65,17 +65,17 @@ class Commands: return if not self.scraper: - self.scraper = Scraper(print_error=self.io.tool_error) + res = install_playwright(self.io) + if not res: + self.io.tool_error("Unable to initialize playwright.") + + self.scraper = Scraper(print_error=self.io.tool_error, playwright_available=res) content = self.scraper.scrape(url) or "" # if content: # self.io.tool_output(content) - instructions = self.scraper.get_playwright_instructions() - if instructions: - self.io.tool_error(instructions) - - content = f"{url}:\n\n" + content # noqa: E231 + content = f"{url}:\n\n" + content return content diff --git a/aider/scrape.py b/aider/scrape.py index da935611e..7eed88da9 100755 --- a/aider/scrape.py +++ b/aider/scrape.py @@ -12,14 +12,59 @@ aider_user_agent = f"Aider/{__version__} +{urls.website}" # Playwright is nice because it has a simple way to install dependencies on most # platforms. -PLAYWRIGHT_INFO = """ -For better web scraping, install Playwright chromium: + + +def install_playwright(io): + try: + from playwright.sync_api import sync_playwright + + has_pip = True + except ImportError: + has_pip = False + + try: + with sync_playwright() as p: + p.chromium.launch() + has_chromium = True + except Exception as err: + dump(err) + has_chromium = False + + if has_pip and has_chromium: + return True + + pip_cmd = utils.get_pip_install(["aider-chat[playwright]"]) + chromium_cmd = "playwright install --with-deps chromium".split() + + cmds = "" + if not has_pip: + cmds += " ".join(pip_cmd) + "\n" + if not has_chromium: + cmds += " ".join(chromium_cmd) + "\n" + + text = f"""For the best web scraping, install Playwright: {cmds} - See {urls.enable_playwright} for more info. """ + io.tool_error(text) + if not io.confirm_ask("Install playwright?", default="y"): + return + + if not has_pip: + success, output = utils.run_install(pip_cmd) + if not success: + io.tool_error(output) + return + + success, output = utils.run_install(chromium_cmd) + if not success: + io.tool_error(output) + return + + return True + class Scraper: pandoc_available = None @@ -27,7 +72,7 @@ class Scraper: playwright_instructions_shown = False # Public API... - def __init__(self, print_error=None): + def __init__(self, print_error=None, playwright_available=None): """ `print_error` - a function to call to print error/debug info. """ @@ -36,13 +81,14 @@ class Scraper: else: self.print_error = print + self.playwright_available = playwright_available + def scrape(self, url): """ Scrape a url and turn it into readable markdown. `url` - the URLto scrape. """ - self.try_playwright() if self.playwright_available: content = self.scrape_with_playwright(url) @@ -88,46 +134,8 @@ class Scraper: return content - def try_playwright(self): - if self.playwright_available is not None: - return - - try: - from playwright.sync_api import sync_playwright - - has_pip = True - except ImportError: - has_pip = False - - try: - with sync_playwright() as p: - p.chromium.launch() - has_chromium = True - except Exception: - has_chromium = False - - if has_pip and has_chromium: - self.playwright_available = True - - pip_cmd = utils.get_pip_cmd("playwright") - chromium_cmd = "playwright install --with-deps chromium".split() - - cmds = "" - if not has_pip: - cmds += " ".join(pip_cmd) + "\n" - if not has_chromium: - cmds += " ".join(chromium_cmd) + "\n" - - text = PLAYWRIGHT_INFO.format(cmds=cmds) - def get_playwright_instructions(self): - if self.playwright_available in (True, None): - return - if self.playwright_instructions_shown: - return - - self.playwright_instructions_shown = True - return PLAYWRIGHT_INFO + return def scrape_with_httpx(self, url): import httpx diff --git a/aider/utils.py b/aider/utils.py index 9fccac6ee..5e1827d3c 100644 --- a/aider/utils.py +++ b/aider/utils.py @@ -224,18 +224,21 @@ def run_install(cmd): last_update = current_time return_code = process.wait() + output = "".join(output) + + dump(output) if return_code == 0: print("\rInstallation complete.") print() - return True, ''.join(output) + return True, output except subprocess.CalledProcessError as e: print(f"\nError running pip install: {e}") print("\nInstallation failed.\n") - return False, ''.join(output) + return False, output def check_pip_install_extra(io, module, prompt, pip_install_cmd): @@ -263,8 +266,7 @@ def check_pip_install_extra(io, module, prompt, pip_install_cmd): except (ImportError, ModuleNotFoundError): pass - for line in output: - print(line) + io.tool_error(output) print() print(f"Failed to install {pip_install_cmd[0]}") diff --git a/requirements.txt b/requirements.txt index 836b244c3..f096f7b6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,8 +23,6 @@ backoff==2.2.1 # via -r requirements/requirements.in beautifulsoup4==4.12.3 # via -r requirements/requirements.in -cachetools==5.3.3 - # via google-auth certifi==2024.7.4 # via # httpcore @@ -60,48 +58,12 @@ gitdb==4.0.11 # via gitpython gitpython==3.1.43 # via -r requirements/requirements.in -google-ai-generativelanguage==0.6.6 - # via google-generativeai -google-api-core[grpc]==2.19.1 - # via - # google-ai-generativelanguage - # google-api-python-client - # google-generativeai -google-api-python-client==2.136.0 - # via google-generativeai -google-auth==2.31.0 - # via - # google-ai-generativelanguage - # google-api-core - # google-api-python-client - # google-auth-httplib2 - # google-generativeai -google-auth-httplib2==0.2.0 - # via google-api-python-client -google-generativeai==0.7.1 - # via -r requirements/requirements.in -googleapis-common-protos==1.63.2 - # via - # google-api-core - # grpcio-status -greenlet==3.0.3 - # via playwright grep-ast==0.3.2 # via -r requirements/requirements.in -grpcio==1.64.1 - # via - # google-api-core - # grpcio-status -grpcio-status==1.62.2 - # via google-api-core h11==0.14.0 # via httpcore httpcore==1.0.5 # via httpx -httplib2==0.22.0 - # via - # google-api-python-client - # google-auth-httplib2 httpx==0.27.0 # via openai huggingface-hub==0.23.4 @@ -158,49 +120,24 @@ pathspec==0.12.1 # grep-ast pillow==10.4.0 # via -r requirements/requirements.in -playwright==1.45.0 - # via -r requirements/requirements.in prompt-toolkit==3.0.47 # via -r requirements/requirements.in -proto-plus==1.24.0 - # via - # google-ai-generativelanguage - # google-api-core -protobuf==4.25.3 - # via - # google-ai-generativelanguage - # google-api-core - # google-generativeai - # googleapis-common-protos - # grpcio-status - # proto-plus -pyasn1==0.6.0 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.0 - # via google-auth pycodestyle==2.12.0 # via flake8 pycparser==2.22 # via cffi pydantic==2.8.2 # via - # google-generativeai # litellm # openai pydantic-core==2.20.1 # via pydantic -pyee==11.1.0 - # via playwright pyflakes==3.2.0 # via flake8 pygments==2.18.0 # via rich pypandoc==1.13 # via -r requirements/requirements.in -pyparsing==3.1.2 - # via httplib2 python-dotenv==1.0.1 # via litellm pyyaml==6.0.1 @@ -215,7 +152,6 @@ regex==2024.5.15 # via tiktoken requests==2.32.3 # via - # google-api-core # huggingface-hub # litellm # tiktoken @@ -225,8 +161,6 @@ rpds-py==0.18.1 # via # jsonschema # referencing -rsa==4.9 - # via google-auth scipy==1.13.1 # via -r requirements/requirements.in smmap==5.0.1 @@ -248,7 +182,6 @@ tokenizers==0.19.1 # via litellm tqdm==4.66.4 # via - # google-generativeai # huggingface-hub # openai tree-sitter==0.21.3 @@ -259,14 +192,10 @@ tree-sitter-languages==1.10.2 # via grep-ast typing-extensions==4.12.2 # via - # google-generativeai # huggingface-hub # openai # pydantic # pydantic-core - # pyee -uritemplate==4.1.1 - # via google-api-python-client urllib3==2.2.2 # via requests wcwidth==0.2.13 diff --git a/requirements/requirements-browser.txt b/requirements/requirements-browser.txt index d6d63717b..6fc4ee300 100644 --- a/requirements/requirements-browser.txt +++ b/requirements/requirements-browser.txt @@ -14,9 +14,7 @@ attrs==23.2.0 blinker==1.8.2 # via streamlit cachetools==5.3.3 - # via - # -c requirements/../requirements.txt - # streamlit + # via streamlit certifi==2024.7.4 # via # -c requirements/../requirements.txt @@ -88,9 +86,7 @@ pillow==10.4.0 # -c requirements/../requirements.txt # streamlit protobuf==4.25.3 - # via - # -c requirements/../requirements.txt - # streamlit + # via streamlit pyarrow==16.1.0 # via streamlit pydeck==0.9.1 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 96e927a91..17002d43c 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -124,9 +124,7 @@ pygments==2.18.0 # rich # sphinx pyparsing==3.1.2 - # via - # -c requirements/../requirements.txt - # matplotlib + # via matplotlib pyproject-hooks==1.1.0 # via # build diff --git a/requirements/requirements-hf-embed.txt b/requirements/requirements-hf-embed.txt index f592ff8a1..1063714d0 100644 --- a/requirements/requirements-hf-embed.txt +++ b/requirements/requirements-hf-embed.txt @@ -68,9 +68,7 @@ fsspec==2024.6.1 # llama-index-core # torch greenlet==3.0.3 - # via - # -c requirements/../requirements.txt - # sqlalchemy + # via sqlalchemy h11==0.14.0 # via # -c requirements/../requirements.txt diff --git a/requirements/requirements-playwright.in b/requirements/requirements-playwright.in new file mode 100644 index 000000000..09c9ecee6 --- /dev/null +++ b/requirements/requirements-playwright.in @@ -0,0 +1,3 @@ +-c ../requirements.txt + +playwright diff --git a/requirements/requirements-playwright.txt b/requirements/requirements-playwright.txt new file mode 100644 index 000000000..5b3faa7b0 --- /dev/null +++ b/requirements/requirements-playwright.txt @@ -0,0 +1,16 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --output-file=requirements/requirements-playwright.txt requirements/requirements-playwright.in +# +greenlet==3.0.3 + # via playwright +playwright==1.45.0 + # via -r requirements/requirements-playwright.in +pyee==11.1.0 + # via playwright +typing-extensions==4.12.2 + # via + # -c requirements/../requirements.txt + # pyee diff --git a/requirements/requirements.in b/requirements/requirements.in index 2a60f6a53..783c55acf 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -19,7 +19,6 @@ beautifulsoup4 PyYAML Pillow diff-match-patch -playwright pypandoc litellm flake8 diff --git a/scripts/pip-compile.sh b/scripts/pip-compile.sh index ffc0e9784..7efa35f95 100755 --- a/scripts/pip-compile.sh +++ b/scripts/pip-compile.sh @@ -8,7 +8,7 @@ pip-compile \ --output-file=requirements.txt \ $1 -for SUFFIX in dev hf-embed browser; do +for SUFFIX in dev hf-embed browser playwright; do pip-compile \ requirements/requirements-${SUFFIX}.in \ diff --git a/setup.py b/setup.py index 83dbf56d7..3f1568a3e 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ requirements = get_requirements() dev_requirements = get_requirements("dev") hf_requirements = get_requirements("hf-embed") browser_requirements = get_requirements("browser") +playwright_requirements = get_requirements("playwright") # README with open("README.md", "r", encoding="utf-8") as f: @@ -49,6 +50,7 @@ setup( "dev": dev_requirements, "hf-embed": hf_requirements, "browser": browser_requirements, + "playwright": playwright_requirements, }, python_requires=">=3.9,<3.13", entry_points={