From 47aa63cfb052bce8fb2e4123d23b6d4f7abca16f Mon Sep 17 00:00:00 2001 From: Daofeng Wu Date: Sun, 26 Jan 2025 13:48:04 +0900 Subject: [PATCH] feat(playwright): support get/load state --- npiai/core/__test__/captcha_detection.py | 7 +++-- npiai/core/browser/_playwright.py | 39 ++++++++++++++++++++---- npiai/core/tool/_browser.py | 12 ++++++-- npiai/tools/scrapers/web/app.py | 6 ++-- 4 files changed, 52 insertions(+), 12 deletions(-) diff --git a/npiai/core/__test__/captcha_detection.py b/npiai/core/__test__/captcha_detection.py index 4457b97..5274272 100644 --- a/npiai/core/__test__/captcha_detection.py +++ b/npiai/core/__test__/captcha_detection.py @@ -44,6 +44,7 @@ async def web_interaction( playwright: PlaywrightContext, ) -> str: print(f"[HITL] web_interaction: {message=}, {url=}, {action=}") + await playwright.restore_state(await playwright.get_state()) return "web_interaction" @@ -59,10 +60,12 @@ async def main(): ctx = DebugContext() ctx.use_hitl(TestHITL()) - async with BrowserTool() as tool: + async with BrowserTool(headless=False) as tool: for url in urls: await tool.load_page(ctx, url) - captcha_type = await tool.detect_captcha(ctx) + captcha_type = await tool.detect_captcha( + ctx, return_to="https://google.com" + ) print(f"{url}: {captcha_type}") diff --git a/npiai/core/browser/_playwright.py b/npiai/core/browser/_playwright.py index 8573a21..ebd5862 100644 --- a/npiai/core/browser/_playwright.py +++ b/npiai/core/browser/_playwright.py @@ -83,10 +83,32 @@ async def start(self): # args=["--disable-gpu", "--single-process"], ) + await self.restore_state(self.storage_state) + + self.ready = True + + async def get_state(self) -> StorageState: + return await self.context.storage_state() + + async def restore_state(self, state: str | pathlib.Path | StorageState): + """ + Restore the browser state from a previously saved state + + Args: + state: Previously saved state to use for the browser context + """ + # clean up the previous context + if self.page: + self.detach_events(self.page) + self.page = None + if self.context: + await self.context.close() + self.context = None + self.context = await self.browser.new_context( locale="en-US", bypass_csp=True, - storage_state=self.storage_state, + storage_state=state, **self.playwright.devices["Desktop Chrome"], ) # self.context.set_default_timeout(3000) @@ -118,8 +140,6 @@ def block_route(route): self.page = await self.context.new_page() self.attach_events(self.page) - self.ready = True - def attach_events(self, page: Page): page.on("dialog", self.on_dialog) page.on("download", self.on_download) @@ -127,6 +147,13 @@ def attach_events(self, page: Page): page.on("popup", self.on_popup) page.on("close", self.on_close) + def detach_events(self, page: Page): + page.remove_listener("dialog", self.on_dialog) + page.remove_listener("download", self.on_download) + page.remove_listener("filechooser", self.on_filechooser) + page.remove_listener("popup", self.on_popup) + page.remove_listener("close", self.on_close) + async def on_dialog(self, dialog: Dialog): """ Callback function invoked when a dialog is opened @@ -165,12 +192,12 @@ async def on_popup(self, popup: Page): self.page = popup self.attach_events(popup) - async def on_close(self, _): + async def on_close(self, page: Page): """ Callback function invoked when the page is closed """ - if self.context.pages: - self.page = self.context.pages[-1] + if page.context.pages: + self.page = page.context.pages[-1] self.attach_events(self.page) async def stop(self): diff --git a/npiai/core/tool/_browser.py b/npiai/core/tool/_browser.py index 1814013..328c8d3 100644 --- a/npiai/core/tool/_browser.py +++ b/npiai/core/tool/_browser.py @@ -70,7 +70,7 @@ async def load_page( # await self.playwright.page.wait_for_timeout(wait) if force_capcha_detection: - await self.detect_captcha(ctx) + await self.detect_captcha(ctx, return_to=url) @function async def get_text(self): @@ -288,7 +288,7 @@ async def back_to_top(self): return f"Successfully scrolled to top" - async def detect_captcha(self, ctx: Context): + async def detect_captcha(self, ctx: Context, return_to: str | None = None): url = await self.get_page_url() screenshot = await self.get_screenshot(full_page=True, max_size=(1280, 720)) @@ -317,6 +317,14 @@ async def handle_captcha(captcha_type: Literal["none", "captcha", "login"]): playwright=self.playwright, ) + if ( + captcha_type != "none" + and return_to + and self.playwright.page.url != return_to + ): + # TODO: do we need to check captcha again after returning to the original page? + await self.load_page(ctx, return_to) + return captcha_type res = await llm_tool_call( diff --git a/npiai/tools/scrapers/web/app.py b/npiai/tools/scrapers/web/app.py index 0692fa7..148c240 100644 --- a/npiai/tools/scrapers/web/app.py +++ b/npiai/tools/scrapers/web/app.py @@ -105,8 +105,10 @@ async def next_items( if not res: # if no items are found, check if there are any captcha - await self.detect_captcha(ctx) - self._all_items_loaded = True + captcha = await self.detect_captcha(ctx, return_to=self.url) + + if captcha == "none": + self._all_items_loaded = True return res