Skip to content

Commit

Permalink
feat(BrowserTool): add captcha detection
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Jan 25, 2025
1 parent 57baf75 commit 0abd102
Show file tree
Hide file tree
Showing 9 changed files with 244 additions and 24 deletions.
Empty file added npiai/core/__test__/__init__.py
Empty file.
66 changes: 66 additions & 0 deletions npiai/core/__test__/captcha_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import asyncio

from npiai import BrowserTool, HITL
from npiai.utils.test_utils import DebugContext


class TestHITL(HITL):
    """Stub HITL implementation for tests: logs every request and answers
    with a fixed canned value instead of asking a real human."""

    async def confirm(self, tool_name: str, message: str, default=False) -> bool:
        """Log the confirmation request and auto-approve it."""
        print(f"[HITL] confirm: {message=}, {default=}")
        return True

    async def input(self, tool_name: str, message: str, default="") -> str:
        """Log the input prompt and reply with a placeholder string."""
        print(f"[HITL] input: {message=}, {default=}")
        return "input"

    async def select(self, tool_name: str, message: str, choices: list[str], default="") -> str:
        """Log the prompt and choices, then reply with a placeholder string."""
        print(f"[HITL] select: {message=}, {choices=}, {default=}")
        return "select"

    async def web_interaction(self, tool_name: str, message: str, url: str) -> str:
        """Log the web-interaction request and reply with a placeholder string."""
        print(f"[HITL] web_interaction: {message=}, {url=}")
        return "web_interaction"


# Pages used to exercise detect_captcha(): the first two are presumably
# captcha/challenge demo pages, the GitHub URL a login form, and
# google.com a page with neither — TODO confirm expected classifications.
urls = [
    "https://www.google.com/recaptcha/api2/demo",
    "https://nopecha.com/captcha/turnstile",
    "https://github.com/login",
    "https://google.com",
]


async def main():
    """Load each test URL in a browser tool and print the captcha type
    detected on it."""
    ctx = DebugContext()
    # Install the stub HITL so detect_captcha's escalation path is exercised
    # without a real human in the loop.
    ctx.use_hitl(TestHITL())

    async with BrowserTool() as tool:
        for url in urls:
            await tool.load_page(ctx, url)
            captcha_type = await tool.detect_captcha(ctx)
            print(f"{url}: {captcha_type}")


if __name__ == "__main__":
    # Script entry point: run the async test driver.
    asyncio.run(main())
10 changes: 9 additions & 1 deletion npiai/core/hitl.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,12 @@ async def select(
message: str,
choices: List[str],
default="",
): ...
) -> str: ...

@abstractmethod
async def web_interaction(
    self,
    tool_name: str,
    message: str,
    url: str,
) -> str:
    """
    Ask the human to manually interact with a web page (callers in this
    change use it for solving captchas and completing logins).

    Args:
        tool_name: name of the tool requesting help
        message: prompt describing what the human should do
        url: URL of the page that needs manual interaction

    Returns:
        the human's response as a string
    """
    ...
95 changes: 85 additions & 10 deletions npiai/core/tool/_browser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import base64
import io
from textwrap import dedent
from typing import Literal

from PIL import Image
from playwright.async_api import ElementHandle, TimeoutError

from litellm.types.completion import (
ChatCompletionSystemMessageParam,
ChatCompletionUserMessageParam,
)

from npiai.context import Context
from npiai.core.browser import PlaywrightContext
from npiai.utils import html_to_markdown
from npiai.utils import html_to_markdown, llm_tool_call

from ._function import FunctionTool, function

Expand Down Expand Up @@ -34,27 +42,39 @@ def __init__(

async def load_page(
self,
ctx: Context,
url: str,
timeout: int | None = None,
wait_for_selector: str = None,
network_idle_timeout: int | None = None,
force_capcha_detection: bool = False,
):
await self.playwright.page.goto(url)

try:
if wait_for_selector is not None:
if wait_for_selector is not None:
try:
locator = self.playwright.page.locator(wait_for_selector)
await locator.first.wait_for(state="attached", timeout=timeout)
# wait for the page to become stable
elif timeout is not None:
await locator.first.wait_for(
state="attached", timeout=network_idle_timeout
)
except TimeoutError:
await self.detect_captcha(ctx)
# wait for the page to become stable
elif network_idle_timeout is not None:
try:
await self.playwright.page.wait_for_load_state(
"networkidle",
timeout=timeout,
timeout=network_idle_timeout,
)
except TimeoutError:
pass
except TimeoutError:
pass

# await self.playwright.page.wait_for_timeout(wait)

# captcha detection will be done (or is unnecessary if elements matched) when a selector is provided,
# so we only run it here if no selector was given
if not wait_for_selector and force_capcha_detection:
await self.detect_captcha(ctx)

@function
async def get_text(self):
"""Get the text content (as markdown) of the current page"""
Expand Down Expand Up @@ -270,3 +290,58 @@ async def back_to_top(self):
await self.playwright.page.wait_for_timeout(300)

return f"Successfully scrolled to top"

async def detect_captcha(self, ctx: Context):
    """
    Classify the current page as containing a captcha, a login form, or
    neither, using a vision LLM call on a page screenshot, and escalate
    to the human-in-the-loop handler when an obstacle is found.

    Args:
        ctx: NPi context providing the LLM client (``ctx.llm``) and the
            HITL handler (``ctx.hitl``)

    Returns:
        "captcha", "login", or "none"
    """
    url = await self.get_page_url()
    # Full-page screenshot downscaled to at most 1280x720; presumably a
    # data URL usable as an image_url content part — TODO confirm.
    screenshot = await self.get_screenshot(full_page=True, max_size=(1280, 720))

    async def handle_captcha(captcha_type: Literal["none", "captcha", "login"]):
        """
        Handle the captcha detection result
        Args:
            captcha_type: "none" if no captcha is detected, "captcha" if a captcha is detected, "login" if a login form is detected
        """
        # NOTE: this function's name/signature/docstring are exposed to the
        # LLM as the tool schema via llm_tool_call — keep them consistent
        # with the system prompt below.
        match captcha_type:
            case "captcha":
                await ctx.hitl.web_interaction(
                    tool_name=self.name,
                    message="Would you please help me solve the captcha?",
                    url=url,
                )
            case "login":
                await ctx.hitl.web_interaction(
                    tool_name=self.name,
                    message="Would you please help me login to the website?",
                    url=url,
                )

        return captcha_type

    # Ask the model to classify the screenshot by invoking handle_captcha
    # with one of the three literal values.
    res = await llm_tool_call(
        llm=ctx.llm,
        tool=handle_captcha,
        messages=[
            ChatCompletionSystemMessageParam(
                role="system",
                content=dedent(
                    """
                    You are given a screenshot of a webpage. Determine if a captcha or login form is present in the screenshot. If a captcha is present, call the tool with the argument "captcha". If a login form is present, call the tool with the argument "login". If neither is present, call the tool with the argument "none".
                    """
                ),
            ),
            ChatCompletionUserMessageParam(
                role="user",
                content=[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": screenshot,
                        },
                    }
                ],
            ),
        ],
    )

    # llm_tool_call presumably returns the parsed tool arguments (a model
    # object); invoke the handler with them — NOTE(review): confirm
    # llm_tool_call does not already call the tool itself.
    return await handle_captcha(**res.model_dump())
35 changes: 29 additions & 6 deletions npiai/tools/scrapers/page_analyzer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ class PageAnalyzer(BrowserTool):
"""
)

# When True, every load_page() call made by this analyzer requests
# captcha detection even when a wait selector matched.
force_captcha_detection: bool

def __init__(self, force_captcha_detection: bool = False, **kwargs):
    """
    Args:
        force_captcha_detection: force captcha detection on each page
            load performed by this analyzer (default False)
        **kwargs: forwarded unchanged to the parent BrowserTool constructor
    """
    super().__init__(**kwargs)
    self.force_captcha_detection = force_captcha_detection

async def _validate_pagination(
self,
ctx: Context,
Expand Down Expand Up @@ -250,21 +256,25 @@ async def compute_common_selectors(
@function
async def support_infinite_scroll(
self,
ctx: Context,
url: str,
items_selector: str | None = None,
) -> bool:
"""
Open the given URL and determine whether the page supports infinite scroll.
Args:
ctx: NPi Context
url: URL of the page
items_selector: CSS selector of the items on the page
"""
# use long wait time for pages to be fully loaded
await self.load_page(
url,
timeout=3000,
ctx=ctx,
url=url,
network_idle_timeout=3000,
wait_for_selector=items_selector,
force_capcha_detection=self.force_captcha_detection,
)

return await self.playwright.page.evaluate(
Expand Down Expand Up @@ -340,7 +350,10 @@ async def support_infinite_scroll(

@function
async def get_pagination_button(
self, ctx: Context, url: str, items_selector: str | None = None
self,
ctx: Context,
url: str,
items_selector: str | None = None,
) -> str | None:
"""
Open the given URL and determine whether there is a pagination button. If there is, return the CSS selector of the pagination button. Otherwise, return None.
Expand All @@ -350,7 +363,9 @@ async def get_pagination_button(
url: URL of the page
items_selector: CSS selector of the items on the page
"""
await self.load_page(url)
await self.load_page(
ctx, url, force_capcha_detection=self.force_captcha_detection
)

# use latest page url in case of redirections
page_url = await self.get_page_url()
Expand Down Expand Up @@ -465,7 +480,10 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> ScrapingType:
ctx: NPi Context
url: URL of the page
"""
await self.load_page(url)
await self.load_page(
ctx, url, force_capcha_detection=self.force_captcha_detection
)

page_url = await self.get_page_url()
page_title = await self.get_page_title()
screenshot = await self.get_screenshot(
Expand Down Expand Up @@ -546,7 +564,12 @@ async def infer_similar_items_selector(
ctx: NPi Context
url: URL of the page
"""
await self.load_page(url, timeout=3000)
await self.load_page(
ctx,
url,
network_idle_timeout=3000,
force_capcha_detection=self.force_captcha_detection,
)

# use latest page url in case of redirections
page_url = await self.get_page_url()
Expand Down
45 changes: 44 additions & 1 deletion npiai/tools/scrapers/web/__test__/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,56 @@
# from npiai import Context
from utils import auto_scrape

from npiai import HITL


class TestHITL(HITL):
    """Non-interactive HITL stub for the interactive-scrape test: every
    request is echoed to stdout and answered with a fixed value."""

    async def confirm(self, tool_name: str, message: str, default=False) -> bool:
        """Echo the request; always confirm."""
        print(f"[HITL] confirm: {message=}, {default=}")
        return True

    async def input(self, tool_name: str, message: str, default="") -> str:
        """Echo the request; answer with a canned string."""
        print(f"[HITL] input: {message=}, {default=}")
        return "input"

    async def select(self, tool_name: str, message: str, choices: list[str], default="") -> str:
        """Echo the request and choices; answer with a canned string."""
        print(f"[HITL] select: {message=}, {choices=}, {default=}")
        return "select"

    async def web_interaction(self, tool_name: str, message: str, url: str) -> str:
        """Echo the request; answer with a canned string."""
        print(f"[HITL] web_interaction: {message=}, {url=}")
        return "web_interaction"


async def main():
    """Prompt for a URL and run the auto-scraper on it with the stub HITL
    installed (so captcha/login escalations are logged, not blocking)."""
    url = input("Enter the URL: ")
    ctx = DebugContext()
    ctx.use_hitl(TestHITL())
    # url = "https://www.bardeen.ai/playbooks"

    await auto_scrape(
        ctx=ctx,
        url=url,
    )

Expand Down
5 changes: 3 additions & 2 deletions npiai/tools/scrapers/web/__test__/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ async def auto_scrape(
ctx: Context,
url: str,
):
async with PageAnalyzer(headless=False) as analyzer:
async with PageAnalyzer(headless=False, force_captcha_detection=True) as analyzer:
ancestor_selector = None
items_selector = None

Expand Down Expand Up @@ -46,6 +46,7 @@ async def auto_scrape(
step_start_time = time.monotonic()

infinite_scroll = await analyzer.support_infinite_scroll(
ctx=ctx,
url=url,
items_selector=items_selector,
)
Expand Down Expand Up @@ -89,7 +90,7 @@ async def auto_scrape(
stream = scraper.summarize_stream(
ctx=ctx,
output_columns=columns,
limit=1 if scraping_type == "single" else 100,
limit=1,
batch_size=5,
concurrency=10,
)
Expand Down
Loading

0 comments on commit 0abd102

Please sign in to comment.