mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-29 08:44:59 +00:00
Print playwright instructions after the content is displayed, so they are not lost
This commit is contained in:
parent
e42005a78e
commit
681f26d010
2 changed files with 20 additions and 7 deletions
|
@ -38,7 +38,9 @@ class Commands:
|
||||||
self.scraper = Scraper(print_error=self.io.tool_error)
|
self.scraper = Scraper(print_error=self.io.tool_error)
|
||||||
|
|
||||||
content = self.scraper.scrape(url)
|
content = self.scraper.scrape(url)
|
||||||
self.io.tool_output(content)
|
if content:
|
||||||
|
self.io.tool_output(content)
|
||||||
|
self.scraper.show_playwright_instructions()
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
@ -20,6 +21,7 @@ See https://aider.chat/docs/install.html#enable-playwright for more info.
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
playwright_available = None
|
playwright_available = None
|
||||||
|
playwright_instructions_shown = False
|
||||||
|
|
||||||
def __init__(self, print_error=None):
|
def __init__(self, print_error=None):
|
||||||
if print_error:
|
if print_error:
|
||||||
|
@ -51,17 +53,26 @@ class Scraper:
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def try_playwright(self):
|
def try_playwright(self):
|
||||||
|
if self.playwright_available is not None:
|
||||||
|
return
|
||||||
|
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
try:
|
try:
|
||||||
p.chromium.launch()
|
p.chromium.launch()
|
||||||
self.playwright_available = True
|
self.playwright_available = True
|
||||||
except Exception:
|
except Exception:
|
||||||
self.playwright_available = False
|
self.playwright_available = False
|
||||||
self.print_error(PLAYWRIGHT_INFO)
|
|
||||||
|
def show_playwright_instructions(self):
|
||||||
|
if self.playwright_available in (True, None):
|
||||||
|
return
|
||||||
|
if self.playwright_instructions_shown:
|
||||||
|
return
|
||||||
|
|
||||||
|
self.playwright_instructions_shown = True
|
||||||
|
self.print_error(PLAYWRIGHT_INFO)
|
||||||
|
|
||||||
def scrape_with_httpx(self, url):
|
def scrape_with_httpx(self, url):
|
||||||
import httpx
|
|
||||||
|
|
||||||
headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
|
headers = {"User-Agent": f"Mozilla./5.0 ({aider_user_agent})"}
|
||||||
try:
|
try:
|
||||||
with httpx.Client(headers=headers) as client:
|
with httpx.Client(headers=headers) as client:
|
||||||
|
@ -75,15 +86,15 @@ class Scraper:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def scrape(self, url):
|
def scrape(self, url):
|
||||||
if self.playwright_available is None:
|
self.try_playwright()
|
||||||
self.try_playwright()
|
|
||||||
|
|
||||||
if self.playwright_available:
|
if self.playwright_available:
|
||||||
content = self.scrape_with_playwright(url)
|
content = self.scrape_with_playwright(url)
|
||||||
else:
|
else:
|
||||||
content = self.scrape_with_httpx(url)
|
content = self.scrape_with_httpx(url)
|
||||||
|
|
||||||
content = html_to_text(content)
|
if content:
|
||||||
|
content = html_to_text(content)
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue