Print playwright instructions after the content is displayed, so they are not lost

This commit is contained in:
Paul Gauthier 2024-02-08 12:01:18 -08:00
parent e42005a78e
commit 681f26d010
2 changed files with 20 additions and 7 deletions

View file

@ -38,7 +38,9 @@ class Commands:
self.scraper = Scraper(print_error=self.io.tool_error)
content = self.scraper.scrape(url)
self.io.tool_output(content)
if content:
self.io.tool_output(content)
self.scraper.show_playwright_instructions()
return content

View file

@ -2,6 +2,7 @@
import sys
import httpx
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
@ -20,6 +21,7 @@ See https://aider.chat/docs/install.html#enable-playwright for more info.
class Scraper:
playwright_available = None
playwright_instructions_shown = False
def __init__(self, print_error=None):
if print_error:
@ -51,17 +53,26 @@ class Scraper:
return content
# Probe once whether Playwright can launch Chromium and record the outcome in
# self.playwright_available (True on success, False if launch() raises).
# NOTE(review): this listing is a rendered diff with indentation and +/-
# markers stripped, so block nesting below is not visible — confirm against
# the repository before relying on it.
def try_playwright(self):
# A non-None value means the probe already ran; do not repeat it.
if self.playwright_available is not None:
return
with sync_playwright() as p:
try:
p.chromium.launch()
self.playwright_available = True
except Exception:
self.playwright_available = False
# NOTE(review): with indentation lost it is unclear whether the call below
# sits in the except branch. Given the commit message ("Print playwright
# instructions after the content is displayed"), it is presumably the
# pre-change line this commit removes in favour of
# show_playwright_instructions() — TODO confirm.
self.print_error(PLAYWRIGHT_INFO)
def show_playwright_instructions(self):
    """Report PLAYWRIGHT_INFO through print_error, at most once.

    Nothing is printed while ``playwright_available`` is ``True`` or
    ``None``, nor when ``playwright_instructions_shown`` is already set.
    Otherwise the flag is set first and the instructions are then passed
    to ``self.print_error``.
    """
    # Only a value other than True/None qualifies for showing the instructions.
    needs_instructions = self.playwright_available not in (True, None)

    if needs_instructions and not self.playwright_instructions_shown:
        # Set the flag before printing so later calls stay silent.
        self.playwright_instructions_shown = True
        self.print_error(PLAYWRIGHT_INFO)
def scrape_with_httpx(self, url):
import httpx
headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
try:
with httpx.Client(headers=headers) as client:
@ -75,15 +86,15 @@ class Scraper:
return None
# Fetch `url` with Playwright when self.playwright_available is truthy,
# otherwise with httpx, pass the result through html_to_text, and return it.
# NOTE(review): this listing is a rendered diff with indentation and +/-
# markers stripped; some statements below appear in both their pre-change
# and post-change form — confirm against the repository.
def scrape(self, url):
# NOTE(review): the guarded call (next two lines) and the unconditional call
# after it look like the old and new variants of one statement, not two
# separate calls — TODO confirm.
if self.playwright_available is None:
self.try_playwright()
self.try_playwright()
# Pick the fetch backend based on the recorded probe result.
if self.playwright_available:
content = self.scrape_with_playwright(url)
else:
content = self.scrape_with_httpx(url)
# NOTE(review): html_to_text appears once unguarded and once behind
# `if content:`; presumably the guarded form is the post-change code, which
# skips conversion when the fetch produced nothing — TODO confirm.
content = html_to_text(content)
if content:
content = html_to_text(content)
return content