From ef12107bd7dfbf724359549114cb076ae8eab146 Mon Sep 17 00:00:00 2001
From: Paul Gauthier <aider@paulg.org>
Date: Wed, 7 Feb 2024 14:22:02 -0800
Subject: [PATCH] aider: Add command /web to use headless selenium to scrape a
 webpage and add the content to the chat.

# Aider chat conversation:

USER: Add a command /web to use headless selenium to scrape a webpage and add the content to the chat

ASSISTANT: To implement the `/web` command, we will need to add a new method to the `Commands` class in the `aider/commands.py` file. This method will use Selenium with a headless browser to scrape the content of a given webpage and add the content to the chat.

Here are the diffs for the changes:

```diff
--- aider/commands.py
+++ aider/commands.py
@@ ... @@
 from pathlib import Path
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
 import re
 import subprocess
@@ ... @@
 class Commands:
     voice = None
+    web_driver = None

     def __init__(self, io, coder, voice_language=None):
         self.io = io
@@ ... @@
         self.voice_language = voice_language
         self.tokenizer = coder.main_model.tokenizer
+        self.initialize_web_driver()

+    def initialize_web_driver(self):
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.web_driver = webdriver.Chrome(
+            service=Service(ChromeDriverManager().install()),
+            options=chrome_options
+        )
+
+    def cmd_web(self, args):
+        "Use headless selenium to scrape a webpage and add the content to the chat"
+        url = args.strip()
+        if not url:
+            self.io.tool_error("Please provide a URL to scrape.")
+            return
+
+        try:
+            self.web_driver.get(url)
+            page_content = self.web_driver.find_element(By.TAG_NAME, "body").text
+            self.io.tool_output(f"Content from {url}:\n{page_content}")
+            return page_content
+        except Exception as e:
+            self.io.tool_error(f"Error scraping {url}: {e}")
+
     def is_command(self, inp):
         return inp[0] in "/!"
@@ ... @@
```

This diff adds the imports needed for Selenium and the WebDriver manager, initializes a headless Chrome WebDriver whenever a `Commands` instance is created, and adds a new command method, `cmd_web`, that takes a URL as its argument, loads the page, and outputs the text of its `<body>` element to the chat (also returning that text). A missing URL or a scraping failure is reported via `tool_error`.
---
 aider/commands.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/aider/commands.py b/aider/commands.py
index 2b2dcf6af..12ecd63ba 100644
--- a/aider/commands.py
+++ b/aider/commands.py
@@ -1,3 +1,9 @@
+from pathlib import Path
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
 import re
 import subprocess
 import sys
@@ -14,6 +20,7 @@ from .dump import dump  # noqa: F401
 
 class Commands:
     voice = None
+    web_driver = None
 
     def __init__(self, io, coder, voice_language=None):
         self.io = io
@@ -24,6 +31,33 @@ class Commands:
 
         self.voice_language = voice_language
         self.tokenizer = coder.main_model.tokenizer
+        self.initialize_web_driver()
+
+    def initialize_web_driver(self):
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.web_driver = webdriver.Chrome(
+            service=Service(ChromeDriverManager().install()),
+            options=chrome_options
+        )
+
+    def cmd_web(self, args):
+        "Use headless selenium to scrape a webpage and add the content to the chat"
+        url = args.strip()
+        if not url:
+            self.io.tool_error("Please provide a URL to scrape.")
+            return
+
+        try:
+            self.web_driver.get(url)
+            page_content = self.web_driver.find_element(By.TAG_NAME, "body").text
+            self.io.tool_output(f"Content from {url}:\n{page_content}")
+            return page_content
+        except Exception as e:
+            self.io.tool_error(f"Error scraping {url}: {e}")
 
     def is_command(self, inp):
         return inp[0] in "/!"