From ef12107bd7dfbf724359549114cb076ae8eab146 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 7 Feb 2024 14:22:02 -0800 Subject: [PATCH] aider: Add command /web to use headless selenium to scrape a webpage and add the content to the chat. # Aider chat conversation: USER: Add a command /web to use headless selenium to scrape a webpage and add the content to the chat ASSISTANT: To implement the `/web` command, we will need to add a new method to the `Commands` class in the `aider/commands.py` file. This method will use Selenium with a headless browser to scrape the content of a given webpage and add the content to the chat. Here are the diffs for the changes: ```diff --- aider/commands.py +++ aider/commands.py @@ ... @@ from pathlib import Path +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.common.by import By +from webdriver_manager.chrome import ChromeDriverManager import re import subprocess @@ ... @@ class Commands: voice = None + web_driver = None def __init__(self, io, coder, voice_language=None): self.io = io @@ ... @@ self.voice_language = voice_language self.tokenizer = coder.main_model.tokenizer + self.initialize_web_driver() + def initialize_web_driver(self): + chrome_options = Options() + chrome_options.add_argument("--headless") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + self.web_driver = webdriver.Chrome( + service=Service(ChromeDriverManager().install()), + options=chrome_options + ) + + def cmd_web(self, args): + "Use headless selenium to scrape a webpage and add the content to the chat" + url = args.strip() + if not url: + self.io.tool_error("Please provide a URL to scrape.") + return + + try: + self.web_driver.get(url) + page_content = self.web_driver.find_element(By.TAG_NAME, "body").text + self.io.tool_output(f"Content from {url}:\n{page_content}") + return page_content + except Exception as e: + self.io.tool_error(f"Error scraping {url}: {e}") + def is_command(self, inp): return inp[0] in "/!" @@ ... @@ ``` This diff adds the necessary imports for Selenium and the WebDriver manager, initializes a headless Chrome WebDriver when the `Commands` class is instantiated, and adds a new command method `cmd_web` that takes a URL as an argument, scrapes the webpage, and outputs the content to the chat. --- aider/commands.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/aider/commands.py b/aider/commands.py index 2b2dcf6af..12ecd63ba 100644 --- a/aider/commands.py +++ b/aider/commands.py @@ -1,3 +1,9 @@ +from pathlib import Path +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.common.by import By +from webdriver_manager.chrome import ChromeDriverManager import re import subprocess import sys @@ -14,6 +20,7 @@ from .dump import dump # noqa: F401 class Commands: voice = None + web_driver = None def __init__(self, io, coder, voice_language=None): self.io = io @@ -24,6 +31,33 @@ class Commands: self.voice_language = voice_language self.tokenizer = coder.main_model.tokenizer + self.initialize_web_driver() + + def initialize_web_driver(self): + chrome_options = Options() + chrome_options.add_argument("--headless") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + self.web_driver = webdriver.Chrome( + service=Service(ChromeDriverManager().install()), + options=chrome_options + ) + + def cmd_web(self, args): + "Use headless selenium to scrape a webpage and add the content to the chat" + url = args.strip() + if not url: + self.io.tool_error("Please provide a URL to scrape.") + return + + try: + self.web_driver.get(url) + page_content = self.web_driver.find_element(By.TAG_NAME, "body").text + self.io.tool_output(f"Content from {url}:\n{page_content}") + return page_content + except Exception as e: + self.io.tool_error(f"Error scraping {url}: {e}") def is_command(self, inp): return inp[0] in "/!"