Skip to content

Commit

Permalink
feat(playwright): support get/load state
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Jan 26, 2025
1 parent 5cc479a commit 47aa63c
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 12 deletions.
7 changes: 5 additions & 2 deletions npiai/core/__test__/captcha_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ async def web_interaction(
playwright: PlaywrightContext,
) -> str:
print(f"[HITL] web_interaction: {message=}, {url=}, {action=}")
await playwright.restore_state(await playwright.get_state())
return "web_interaction"


Expand All @@ -59,10 +60,12 @@ async def main():
ctx = DebugContext()
ctx.use_hitl(TestHITL())

async with BrowserTool() as tool:
async with BrowserTool(headless=False) as tool:
for url in urls:
await tool.load_page(ctx, url)
captcha_type = await tool.detect_captcha(ctx)
captcha_type = await tool.detect_captcha(
ctx, return_to="https://google.com"
)
print(f"{url}: {captcha_type}")


Expand Down
39 changes: 33 additions & 6 deletions npiai/core/browser/_playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,32 @@ async def start(self):
# args=["--disable-gpu", "--single-process"],
)

await self.restore_state(self.storage_state)

self.ready = True

async def get_state(self) -> StorageState:
    """
    Capture the storage state of the current browser context
    Returns:
        Whatever the context's ``storage_state()`` call produces
    """
    snapshot = await self.context.storage_state()
    return snapshot

async def restore_state(self, state: str | pathlib.Path | StorageState):
"""
Restore the browser state from a previously saved state
Args:
state: Previously saved state to use for the browser context
"""
# clean up the previous context
if self.page:
self.detach_events(self.page)
self.page = None
if self.context:
await self.context.close()
self.context = None

self.context = await self.browser.new_context(
locale="en-US",
bypass_csp=True,
storage_state=self.storage_state,
storage_state=state,
**self.playwright.devices["Desktop Chrome"],
)
# self.context.set_default_timeout(3000)
Expand Down Expand Up @@ -118,15 +140,20 @@ def block_route(route):
self.page = await self.context.new_page()
self.attach_events(self.page)

self.ready = True

def attach_events(self, page: Page):
    """
    Subscribe this object's callbacks to the events of the given page
    Args:
        page: Page whose events should be observed
    """
    # registration order is kept: dialog, download, filechooser, popup, close
    subscriptions = (
        ("dialog", self.on_dialog),
        ("download", self.on_download),
        ("filechooser", self.on_filechooser),
        ("popup", self.on_popup),
        ("close", self.on_close),
    )

    for event_name, callback in subscriptions:
        page.on(event_name, callback)

def detach_events(self, page: Page):
    """
    Unsubscribe this object's callbacks from the events of the given page
    Args:
        page: Page whose listeners should be removed
    """
    # same events, in the same order, as those registered by attach_events
    subscriptions = (
        ("dialog", self.on_dialog),
        ("download", self.on_download),
        ("filechooser", self.on_filechooser),
        ("popup", self.on_popup),
        ("close", self.on_close),
    )

    for event_name, callback in subscriptions:
        page.remove_listener(event_name, callback)

async def on_dialog(self, dialog: Dialog):
"""
Callback function invoked when a dialog is opened
Expand Down Expand Up @@ -165,12 +192,12 @@ async def on_popup(self, popup: Page):
self.page = popup
self.attach_events(popup)

async def on_close(self, _):
async def on_close(self, page: Page):
"""
Callback function invoked when the page is closed
"""
if self.context.pages:
self.page = self.context.pages[-1]
if page.context.pages:
self.page = page.context.pages[-1]
self.attach_events(self.page)

async def stop(self):
Expand Down
12 changes: 10 additions & 2 deletions npiai/core/tool/_browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ async def load_page(
# await self.playwright.page.wait_for_timeout(wait)

if force_capcha_detection:
await self.detect_captcha(ctx)
await self.detect_captcha(ctx, return_to=url)

@function
async def get_text(self):
Expand Down Expand Up @@ -288,7 +288,7 @@ async def back_to_top(self):

return f"Successfully scrolled to top"

async def detect_captcha(self, ctx: Context):
async def detect_captcha(self, ctx: Context, return_to: str | None = None):
url = await self.get_page_url()
screenshot = await self.get_screenshot(full_page=True, max_size=(1280, 720))

Expand Down Expand Up @@ -317,6 +317,14 @@ async def handle_captcha(captcha_type: Literal["none", "captcha", "login"]):
playwright=self.playwright,
)

if (
captcha_type != "none"
and return_to
and self.playwright.page.url != return_to
):
# TODO: do we need to check captcha again after returning to the original page?
await self.load_page(ctx, return_to)

return captcha_type

res = await llm_tool_call(
Expand Down
6 changes: 4 additions & 2 deletions npiai/tools/scrapers/web/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,10 @@ async def next_items(

if not res:
# if no items were found, check whether the page is showing a captcha
await self.detect_captcha(ctx)
self._all_items_loaded = True
captcha = await self.detect_captcha(ctx, return_to=self.url)

if captcha == "none":
self._all_items_loaded = True

return res

Expand Down

0 comments on commit 47aa63c

Please sign in to comment.