aider: Add command /web to use headless selenium to scrape a webpage and add the content to the chat.

# Aider chat conversation:

USER: Add a command /web to use headless selenium to scrape a webpage and add the content to the chat

ASSISTANT: To implement the `/web` command, we will need to add a new method to the `Commands` class in the `aider/commands.py` file. This method will use Selenium with a headless browser to scrape the content of a given webpage and add the content to the chat.

Here are the diffs for the changes:

```diff
--- aider/commands.py
+++ aider/commands.py
@@ ... @@
 from pathlib import Path
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
 import re
 import subprocess
@@ ... @@
 class Commands:
     voice = None
+    web_driver = None

     def __init__(self, io, coder, voice_language=None):
         self.io = io
@@ ... @@
         self.voice_language = voice_language
         self.tokenizer = coder.main_model.tokenizer
+        self.initialize_web_driver()

+    def initialize_web_driver(self):
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.web_driver = webdriver.Chrome(
+            service=Service(ChromeDriverManager().install()),
+            options=chrome_options
+        )
+
+    def cmd_web(self, args):
+        "Use headless selenium to scrape a webpage and add the content to the chat"
+        url = args.strip()
+        if not url:
+            self.io.tool_error("Please provide a URL to scrape.")
+            return
+
+        try:
+            self.web_driver.get(url)
+            page_content = self.web_driver.find_element(By.TAG_NAME, "body").text
+            self.io.tool_output(f"Content from {url}:\n{page_content}")
+            return page_content
+        except Exception as e:
+            self.io.tool_error(f"Error scraping {url}: {e}")
+
     def is_command(self, inp):
         return inp[0] in "/!"
@@ ... @@
```

This diff adds the necessary imports for Selenium and the WebDriver manager, initializes a headless Chrome WebDriver when the `Commands` class is instantiated, and adds a new command method `cmd_web` that takes a URL as an argument, scrapes the webpage, and outputs the content to the chat.
This commit is contained in:
Paul Gauthier 2024-02-07 14:22:02 -08:00
parent 8d9c2ba60b
commit ef12107bd7

View file

@@ -1,3 +1,9 @@
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import re
import subprocess
import sys
@@ -14,6 +20,7 @@ from .dump import dump # noqa: F401
class Commands:
voice = None
web_driver = None
def __init__(self, io, coder, voice_language=None):
self.io = io
@@ -24,6 +31,33 @@ class Commands:
self.voice_language = voice_language
self.tokenizer = coder.main_model.tokenizer
self.initialize_web_driver()
def initialize_web_driver(self):
    """Create a headless Chrome WebDriver and store it on ``self.web_driver``."""
    # Command-line switches passed to Chrome, applied in this order.
    chrome_flags = (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)

    # The value returned by ChromeDriverManager().install() is handed to
    # Service, which the Chrome driver is then started with.
    driver_service = Service(ChromeDriverManager().install())
    self.web_driver = webdriver.Chrome(service=driver_service, options=opts)
def cmd_web(self, args):
    "Use headless selenium to scrape a webpage and add the content to the chat"
    # `args` is the raw text following the command; surrounding whitespace
    # is ignored. Returns the text of the page's <body> element on success,
    # or None when no URL was given or the page could not be scraped.
    url = args.strip()
    if not url:
        self.io.tool_error("Please provide a URL to scrape.")
        return None

    try:
        # `web_driver` defaults to None at class level; start the browser on
        # demand so the command still works when no driver exists yet.
        if self.web_driver is None:
            self.initialize_web_driver()

        self.web_driver.get(url)
        page_content = self.web_driver.find_element(By.TAG_NAME, "body").text
    except Exception as e:
        # Command boundary: report any driver/browser failure to the user
        # rather than letting it propagate out of the command.
        self.io.tool_error(f"Error scraping {url}: {e}")
        return None

    # Kept outside the try so an output failure is not misreported as a
    # scraping error.
    self.io.tool_output(f"Content from {url}:\n{page_content}")
    return page_content
def is_command(self, inp):
return inp[0] in "/!"