diff --git a/npiai/core/__test__/__init__.py b/npiai/core/__test__/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/npiai/core/__test__/captcha_detection.py b/npiai/core/__test__/captcha_detection.py new file mode 100644 index 00000000..960bf9f1 --- /dev/null +++ b/npiai/core/__test__/captcha_detection.py @@ -0,0 +1,66 @@ +import asyncio + +from npiai import BrowserTool, HITL +from npiai.utils.test_utils import DebugContext + + +class TestHITL(HITL): + async def confirm( + self, + tool_name: str, + message: str, + default=False, + ) -> bool: + print(f"[HITL] confirm: {message=}, {default=}") + return True + + async def input( + self, + tool_name: str, + message: str, + default="", + ) -> str: + print(f"[HITL] input: {message=}, {default=}") + return "input" + + async def select( + self, + tool_name: str, + message: str, + choices: list[str], + default="", + ) -> str: + print(f"[HITL] select: {message=}, {choices=}, {default=}") + return "select" + + async def web_interaction( + self, + tool_name: str, + message: str, + url: str, + ) -> str: + print(f"[HITL] web_interaction: {message=}, {url=}") + return "web_interaction" + + +urls = [ + "https://www.google.com/recaptcha/api2/demo", + "https://nopecha.com/captcha/turnstile", + "https://github.com/login", + "https://google.com", +] + + +async def main(): + ctx = DebugContext() + ctx.use_hitl(TestHITL()) + + async with BrowserTool() as tool: + for url in urls: + await tool.load_page(ctx, url) + captcha_type = await tool.detect_captcha(ctx) + print(f"{url}: {captcha_type}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/npiai/core/hitl.py b/npiai/core/hitl.py index b57d0f68..10785451 100644 --- a/npiai/core/hitl.py +++ b/npiai/core/hitl.py @@ -33,4 +33,12 @@ async def select( message: str, choices: List[str], default="", - ): ... + ) -> str: ... + + @abstractmethod + async def web_interaction( + self, + tool_name: str, + message: str, + url: str, + ) -> str: ... 
diff --git a/npiai/core/tool/_browser.py b/npiai/core/tool/_browser.py index 835c5c80..c24270d3 100644 --- a/npiai/core/tool/_browser.py +++ b/npiai/core/tool/_browser.py @@ -1,11 +1,19 @@ import base64 import io +from textwrap import dedent +from typing import Literal from PIL import Image from playwright.async_api import ElementHandle, TimeoutError +from litellm.types.completion import ( + ChatCompletionSystemMessageParam, + ChatCompletionUserMessageParam, +) + +from npiai.context import Context from npiai.core.browser import PlaywrightContext -from npiai.utils import html_to_markdown +from npiai.utils import html_to_markdown, llm_tool_call from ._function import FunctionTool, function @@ -34,27 +42,39 @@ def __init__( async def load_page( self, + ctx: Context, url: str, - timeout: int | None = None, wait_for_selector: str = None, + network_idle_timeout: int | None = None, + force_captcha_detection: bool = False, ): await self.playwright.page.goto(url) - try: - if wait_for_selector is not None: + if wait_for_selector is not None: + try: locator = self.playwright.page.locator(wait_for_selector) - await locator.first.wait_for(state="attached", timeout=timeout) - # wait for the page to become stable - elif timeout is not None: + await locator.first.wait_for( + state="attached", timeout=network_idle_timeout + ) + except TimeoutError: + await self.detect_captcha(ctx) + # wait for the page to become stable + elif network_idle_timeout is not None: + try: await self.playwright.page.wait_for_load_state( "networkidle", - timeout=timeout, + timeout=network_idle_timeout, ) - except TimeoutError: - pass + except TimeoutError: + pass # await self.playwright.page.wait_for_timeout(wait) + # captcha detection will be done (or unnecessary if elements matched) if selector is provided + # so we only do it if no selector is provided + if not wait_for_selector and force_captcha_detection: + await self.detect_captcha(ctx) + @function async def get_text(self): """Get the text content (as 
markdown) of the current page""" @@ -270,3 +290,58 @@ async def back_to_top(self): await self.playwright.page.wait_for_timeout(300) return f"Successfully scrolled to top" + + async def detect_captcha(self, ctx: Context): + url = await self.get_page_url() + screenshot = await self.get_screenshot(full_page=True, max_size=(1280, 720)) + + async def handle_captcha(captcha_type: Literal["none", "captcha", "login"]): + """ + Handle the captcha detection result + + Args: + captcha_type: "none" if no captcha is detected, "captcha" if a captcha is detected, "login" if a login form is detected + """ + match captcha_type: + case "captcha": + await ctx.hitl.web_interaction( + tool_name=self.name, + message="Would you please help me solve the captcha?", + url=url, + ) + case "login": + await ctx.hitl.web_interaction( + tool_name=self.name, + message="Would you please help me login to the website?", + url=url, + ) + + return captcha_type + + res = await llm_tool_call( + llm=ctx.llm, + tool=handle_captcha, + messages=[ + ChatCompletionSystemMessageParam( + role="system", + content=dedent( + """ + You are given a screenshot of a webpage. Determine if a captcha or login form is present in the screenshot. If a captcha is present, call the tool with the argument "captcha". If a login form is present, call the tool with the argument "login". If neither is present, call the tool with the argument "none". 
+ """ + ), + ), + ChatCompletionUserMessageParam( + role="user", + content=[ + { + "type": "image_url", + "image_url": { + "url": screenshot, + }, + } + ], + ), + ], + ) + + return await handle_captcha(**res.model_dump()) diff --git a/npiai/tools/scrapers/page_analyzer/app.py b/npiai/tools/scrapers/page_analyzer/app.py index 93e695e2..f4fcd371 100644 --- a/npiai/tools/scrapers/page_analyzer/app.py +++ b/npiai/tools/scrapers/page_analyzer/app.py @@ -36,6 +36,12 @@ class PageAnalyzer(BrowserTool): """ ) + force_captcha_detection: bool + + def __init__(self, force_captcha_detection: bool = False, **kwargs): + super().__init__(**kwargs) + self.force_captcha_detection = force_captcha_detection + async def _validate_pagination( self, ctx: Context, @@ -250,6 +256,7 @@ async def compute_common_selectors( @function async def support_infinite_scroll( self, + ctx: Context, url: str, items_selector: str | None = None, ) -> bool: @@ -257,14 +264,17 @@ async def support_infinite_scroll( Open the given URL and determine whether the page supports infinite scroll. Args: + ctx: NPi Context url: URL of the page items_selector: CSS selector of the items on the page """ # use long wait time for pages to be fully loaded await self.load_page( - url, - timeout=3000, + ctx=ctx, + url=url, + network_idle_timeout=3000, wait_for_selector=items_selector, + force_captcha_detection=self.force_captcha_detection, ) return await self.playwright.page.evaluate( @@ -340,7 +350,10 @@ async def support_infinite_scroll( @function async def get_pagination_button( - self, ctx: Context, url: str, items_selector: str | None = None + self, + ctx: Context, + url: str, + items_selector: str | None = None, ) -> str | None: """ Open the given URL and determine whether there is a pagination button. If there is, return the CSS selector of the pagination button. Otherwise, return None. 
@@ -350,7 +363,9 @@ async def get_pagination_button( url: URL of the page items_selector: CSS selector of the items on the page """ - await self.load_page(url) + await self.load_page( + ctx, url, force_captcha_detection=self.force_captcha_detection + ) # use latest page url in case of redirections page_url = await self.get_page_url() @@ -465,7 +480,10 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> ScrapingType: ctx: NPi Context url: URL of the page """ - await self.load_page(url) + await self.load_page( + ctx, url, force_captcha_detection=self.force_captcha_detection + ) + page_url = await self.get_page_url() page_title = await self.get_page_title() screenshot = await self.get_screenshot( @@ -546,7 +564,12 @@ async def infer_similar_items_selector( ctx: NPi Context url: URL of the page """ - await self.load_page(url, timeout=3000) + await self.load_page( + ctx, + url, + network_idle_timeout=3000, + force_captcha_detection=self.force_captcha_detection, + ) # use latest page url in case of redirections page_url = await self.get_page_url() diff --git a/npiai/tools/scrapers/web/__test__/interactive.py b/npiai/tools/scrapers/web/__test__/interactive.py index 6031002f..62f3e41d 100644 --- a/npiai/tools/scrapers/web/__test__/interactive.py +++ b/npiai/tools/scrapers/web/__test__/interactive.py @@ -5,13 +5,56 @@ # from npiai import Context from utils import auto_scrape +from npiai import HITL + + +class TestHITL(HITL): + async def confirm( + self, + tool_name: str, + message: str, + default=False, + ) -> bool: + print(f"[HITL] confirm: {message=}, {default=}") + return True + + async def input( + self, + tool_name: str, + message: str, + default="", + ) -> str: + print(f"[HITL] input: {message=}, {default=}") + return "input" + + async def select( + self, + tool_name: str, + message: str, + choices: list[str], + default="", + ) -> str: + print(f"[HITL] select: {message=}, {choices=}, {default=}") + return "select" + + async def web_interaction( + self, 
tool_name: str, + message: str, + url: str, + ) -> str: + print(f"[HITL] web_interaction: {message=}, {url=}") + return "web_interaction" + async def main(): url = input("Enter the URL: ") + ctx = DebugContext() + ctx.use_hitl(TestHITL()) # url = "https://www.bardeen.ai/playbooks" await auto_scrape( - ctx=DebugContext(), + ctx=ctx, url=url, ) diff --git a/npiai/tools/scrapers/web/__test__/utils.py b/npiai/tools/scrapers/web/__test__/utils.py index 36171ed3..7fb115a6 100644 --- a/npiai/tools/scrapers/web/__test__/utils.py +++ b/npiai/tools/scrapers/web/__test__/utils.py @@ -13,7 +13,7 @@ async def auto_scrape( ctx: Context, url: str, ): - async with PageAnalyzer(headless=False) as analyzer: + async with PageAnalyzer(headless=False, force_captcha_detection=True) as analyzer: ancestor_selector = None items_selector = None @@ -46,6 +46,7 @@ async def auto_scrape( step_start_time = time.monotonic() infinite_scroll = await analyzer.support_infinite_scroll( + ctx=ctx, url=url, items_selector=items_selector, ) @@ -89,7 +90,7 @@ async def auto_scrape( stream = scraper.summarize_stream( ctx=ctx, output_columns=columns, - limit=1 if scraping_type == "single" else 100, + limit=1, batch_size=5, concurrency=10, ) diff --git a/npiai/tools/scrapers/web/app.py b/npiai/tools/scrapers/web/app.py index 0910a007..b8ef56f7 100644 --- a/npiai/tools/scrapers/web/app.py +++ b/npiai/tools/scrapers/web/app.py @@ -77,8 +77,9 @@ def get_matched_hashes(self) -> List[str]: async def init_data(self, ctx: Context): self._matched_hashes = [] await self.load_page( - self.url, - timeout=3000, + ctx=ctx, + url=self.url, + network_idle_timeout=3000, wait_for_selector=self.items_selector, ) @@ -102,6 +103,8 @@ async def next_items( res = await self._convert_items(count) if not res: + # if no items are found, check if there are any captcha + await self.detect_captcha(ctx) self._all_items_loaded = True return res diff --git a/npiai/tools/web/chromium/app.py b/npiai/tools/web/chromium/app.py index 
f74dc3b9..0abd2f55 100644 --- a/npiai/tools/web/chromium/app.py +++ b/npiai/tools/web/chromium/app.py @@ -31,13 +31,14 @@ def from_context(cls, ctx: Context) -> "Chromium": return cls() @function - async def goto(self, url: str): + async def goto(self, ctx: Context, url: str): """ Open the given URL in chromium. Args: + ctx: NPi Context url: The URL to open. """ - await self.load_page(url) + await self.load_page(ctx=ctx, url=url) return f"Opened {await self.get_page_url()}, page title: {await self.get_page_title()}"