Skip to content

Commit

Permalink
feat(BrowserTool): add captcha detection
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Jan 25, 2025
1 parent 57baf75 commit 0abd102
Show file tree
Hide file tree
Showing 9 changed files with 244 additions and 24 deletions.
Empty file added npiai/core/__test__/__init__.py
Empty file.
66 changes: 66 additions & 0 deletions npiai/core/__test__/captcha_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import asyncio

from npiai import BrowserTool, HITL
from npiai.utils.test_utils import DebugContext


class TestHITL(HITL):
    """Stub HITL implementation for tests: logs every request and answers
    with a fixed canned value instead of asking a real human."""

    async def confirm(self, tool_name: str, message: str, default=False) -> bool:
        """Log the confirmation request and auto-approve it."""
        print(f"[HITL] confirm: {message=}, {default=}")
        return True

    async def input(self, tool_name: str, message: str, default="") -> str:
        """Log the input prompt and reply with a placeholder string."""
        print(f"[HITL] input: {message=}, {default=}")
        return "input"

    async def select(self, tool_name: str, message: str, choices: list[str], default="") -> str:
        """Log the prompt and choices, then reply with a placeholder string."""
        print(f"[HITL] select: {message=}, {choices=}, {default=}")
        return "select"

    async def web_interaction(self, tool_name: str, message: str, url: str) -> str:
        """Log the web-interaction request and reply with a placeholder string."""
        print(f"[HITL] web_interaction: {message=}, {url=}")
        return "web_interaction"


# Pages used to exercise detect_captcha(): the first two are presumably
# captcha/challenge demo pages, the GitHub URL a login form, and
# google.com a page with neither — TODO confirm expected classifications.
urls = [
    "https://www.google.com/recaptcha/api2/demo",
    "https://nopecha.com/captcha/turnstile",
    "https://github.com/login",
    "https://google.com",
]


async def main():
    """Load each test URL in a browser tool and print the captcha type
    detected on it."""
    ctx = DebugContext()
    # Install the stub HITL so detect_captcha's escalation path is exercised
    # without a real human in the loop.
    ctx.use_hitl(TestHITL())

    async with BrowserTool() as tool:
        for url in urls:
            await tool.load_page(ctx, url)
            captcha_type = await tool.detect_captcha(ctx)
            print(f"{url}: {captcha_type}")


if __name__ == "__main__":
    # Script entry point: run the async test driver.
    asyncio.run(main())
10 changes: 9 additions & 1 deletion npiai/core/hitl.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,12 @@ async def select(
message: str,
choices: List[str],
default="",
): ...
) -> str: ...

@abstractmethod
async def web_interaction(
    self,
    tool_name: str,
    message: str,
    url: str,
) -> str:
    """
    Ask the human to manually interact with a web page (callers in this
    change use it for solving captchas and completing logins).

    Args:
        tool_name: name of the tool requesting help
        message: prompt describing what the human should do
        url: URL of the page that needs manual interaction

    Returns:
        the human's response as a string
    """
    ...
95 changes: 85 additions & 10 deletions npiai/core/tool/_browser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import base64
import io
from textwrap import dedent
from typing import Literal

from PIL import Image
from playwright.async_api import ElementHandle, TimeoutError

from litellm.types.completion import (
ChatCompletionSystemMessageParam,
ChatCompletionUserMessageParam,
)

from npiai.context import Context
from npiai.core.browser import PlaywrightContext
from npiai.utils import html_to_markdown
from npiai.utils import html_to_markdown, llm_tool_call

from ._function import FunctionTool, function

Expand Down Expand Up @@ -34,27 +42,39 @@ def __init__(

async def load_page(
self,
ctx: Context,
url: str,
timeout: int | None = None,
wait_for_selector: str = None,
network_idle_timeout: int | None = None,
force_capcha_detection: bool = False,
):
await self.playwright.page.goto(url)

try:
if wait_for_selector is not None:
if wait_for_selector is not None:
try:
locator = self.playwright.page.locator(wait_for_selector)
await locator.first.wait_for(state="attached", timeout=timeout)
# wait for the page to become stable
elif timeout is not None:
await locator.first.wait_for(
state="attached", timeout=network_idle_timeout
)
except TimeoutError:
await self.detect_captcha(ctx)
# wait for the page to become stable
elif network_idle_timeout is not None:
try:
await self.playwright.page.wait_for_load_state(
"networkidle",
timeout=timeout,
timeout=network_idle_timeout,
)
except TimeoutError:
pass
except TimeoutError:
pass

# await self.playwright.page.wait_for_timeout(wait)

# captcha detection will be done (or is unnecessary if elements matched) when a selector is provided,
# so we only run it here if no selector was given
if not wait_for_selector and force_capcha_detection:
await self.detect_captcha(ctx)

@function
async def get_text(self):
"""Get the text content (as markdown) of the current page"""
Expand Down Expand Up @@ -270,3 +290,58 @@ async def back_to_top(self):
await self.playwright.page.wait_for_timeout(300)

return f"Successfully scrolled to top"

async def detect_captcha(self, ctx: Context):
    """
    Classify the current page as containing a captcha, a login form, or
    neither, using a vision LLM call on a page screenshot, and escalate
    to the human-in-the-loop handler when an obstacle is found.

    Args:
        ctx: NPi context providing the LLM client (``ctx.llm``) and the
            HITL handler (``ctx.hitl``)

    Returns:
        "captcha", "login", or "none"
    """
    url = await self.get_page_url()
    # Full-page screenshot downscaled to at most 1280x720; presumably a
    # data URL usable as an image_url content part — TODO confirm.
    screenshot = await self.get_screenshot(full_page=True, max_size=(1280, 720))

    async def handle_captcha(captcha_type: Literal["none", "captcha", "login"]):
        """
        Handle the captcha detection result
        Args:
            captcha_type: "none" if no captcha is detected, "captcha" if a captcha is detected, "login" if a login form is detected
        """
        # NOTE: this function's name/signature/docstring are exposed to the
        # LLM as the tool schema via llm_tool_call — keep them consistent
        # with the system prompt below.
        match captcha_type:
            case "captcha":
                await ctx.hitl.web_interaction(
                    tool_name=self.name,
                    message="Would you please help me solve the captcha?",
                    url=url,
                )
            case "login":
                await ctx.hitl.web_interaction(
                    tool_name=self.name,
                    message="Would you please help me login to the website?",
                    url=url,
                )

        return captcha_type

    # Ask the model to classify the screenshot by invoking handle_captcha
    # with one of the three literal values.
    res = await llm_tool_call(
        llm=ctx.llm,
        tool=handle_captcha,
        messages=[
            ChatCompletionSystemMessageParam(
                role="system",
                content=dedent(
                    """
                    You are given a screenshot of a webpage. Determine if a captcha or login form is present in the screenshot. If a captcha is present, call the tool with the argument "captcha". If a login form is present, call the tool with the argument "login". If neither is present, call the tool with the argument "none".
                    """
                ),
            ),
            ChatCompletionUserMessageParam(
                role="user",
                content=[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": screenshot,
                        },
                    }
                ],
            ),
        ],
    )

    # llm_tool_call presumably returns the parsed tool arguments (a model
    # object); invoke the handler with them — NOTE(review): confirm
    # llm_tool_call does not already call the tool itself.
    return await handle_captcha(**res.model_dump())
35 changes: 29 additions & 6 deletions npiai/tools/scrapers/page_analyzer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ class PageAnalyzer(BrowserTool):
"""
)

# When True, every load_page() call made by this analyzer requests
# captcha detection even when a wait selector matched.
force_captcha_detection: bool

def __init__(self, force_captcha_detection: bool = False, **kwargs):
    """
    Args:
        force_captcha_detection: force captcha detection on each page
            load performed by this analyzer (default False)
        **kwargs: forwarded unchanged to the parent BrowserTool constructor
    """
    super().__init__(**kwargs)
    self.force_captcha_detection = force_captcha_detection

async def _validate_pagination(
self,
ctx: Context,
Expand Down Expand Up @@ -250,21 +256,25 @@ async def compute_common_selectors(
@function
async def support_infinite_scroll(
self,
ctx: Context,
url: str,
items_selector: str | None = None,
) -> bool:
"""
Open the given URL and determine whether the page supports infinite scroll.
Args:
ctx: NPi Context
url: URL of the page
items_selector: CSS selector of the items on the page
"""
# use long wait time for pages to be fully loaded
await self.load_page(
url,
timeout=3000,
ctx=ctx,
url=url,
network_idle_timeout=3000,
wait_for_selector=items_selector,
force_capcha_detection=self.force_captcha_detection,
)

return await self.playwright.page.evaluate(
Expand Down Expand Up @@ -340,7 +350,10 @@ async def support_infinite_scroll(

@function
async def get_pagination_button(
self, ctx: Context, url: str, items_selector: str | None = None
self,
ctx: Context,
url: str,
items_selector: str | None = None,
) -> str | None:
"""
Open the given URL and determine whether there is a pagination button. If there is, return the CSS selector of the pagination button. Otherwise, return None.
Expand All @@ -350,7 +363,9 @@ async def get_pagination_button(
url: URL of the page
items_selector: CSS selector of the items on the page
"""
await self.load_page(url)
await self.load_page(
ctx, url, force_capcha_detection=self.force_captcha_detection
)

# use latest page url in case of redirections
page_url = await self.get_page_url()
Expand Down Expand Up @@ -465,7 +480,10 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> ScrapingType:
ctx: NPi Context
url: URL of the page
"""
await self.load_page(url)
await self.load_page(
ctx, url, force_capcha_detection=self.force_captcha_detection
)

page_url = await self.get_page_url()
page_title = await self.get_page_title()
screenshot = await self.get_screenshot(
Expand Down Expand Up @@ -546,7 +564,12 @@ async def infer_similar_items_selector(
ctx: NPi Context
url: URL of the page
"""
await self.load_page(url, timeout=3000)
await self.load_page(
ctx,
url,
network_idle_timeout=3000,
force_capcha_detection=self.force_captcha_detection,
)

# use latest page url in case of redirections
page_url = await self.get_page_url()
Expand Down
45 changes: 44 additions & 1 deletion npiai/tools/scrapers/web/__test__/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,56 @@
# from npiai import Context
from utils import auto_scrape

from npiai import HITL


class TestHITL(HITL):
    """Non-interactive HITL stub for the interactive-scrape test: every
    request is echoed to stdout and answered with a fixed value."""

    async def confirm(self, tool_name: str, message: str, default=False) -> bool:
        """Echo the request; always confirm."""
        print(f"[HITL] confirm: {message=}, {default=}")
        return True

    async def input(self, tool_name: str, message: str, default="") -> str:
        """Echo the request; answer with a canned string."""
        print(f"[HITL] input: {message=}, {default=}")
        return "input"

    async def select(self, tool_name: str, message: str, choices: list[str], default="") -> str:
        """Echo the request and choices; answer with a canned string."""
        print(f"[HITL] select: {message=}, {choices=}, {default=}")
        return "select"

    async def web_interaction(self, tool_name: str, message: str, url: str) -> str:
        """Echo the request; answer with a canned string."""
        print(f"[HITL] web_interaction: {message=}, {url=}")
        return "web_interaction"


async def main():
    """Prompt for a URL and run the auto-scraper on it with the stub HITL
    installed (so captcha/login escalations are logged, not blocking)."""
    url = input("Enter the URL: ")
    ctx = DebugContext()
    ctx.use_hitl(TestHITL())
    # url = "https://www.bardeen.ai/playbooks"

    await auto_scrape(
        ctx=ctx,
        url=url,
    )

Expand Down
5 changes: 3 additions & 2 deletions npiai/tools/scrapers/web/__test__/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ async def auto_scrape(
ctx: Context,
url: str,
):
async with PageAnalyzer(headless=False) as analyzer:
async with PageAnalyzer(headless=False, force_captcha_detection=True) as analyzer:
ancestor_selector = None
items_selector = None

Expand Down Expand Up @@ -46,6 +46,7 @@ async def auto_scrape(
step_start_time = time.monotonic()

infinite_scroll = await analyzer.support_infinite_scroll(
ctx=ctx,
url=url,
items_selector=items_selector,
)
Expand Down Expand Up @@ -89,7 +90,7 @@ async def auto_scrape(
stream = scraper.summarize_stream(
ctx=ctx,
output_columns=columns,
limit=1 if scraping_type == "single" else 100,
limit=1,
batch_size=5,
concurrency=10,
)
Expand Down
Loading

0 comments on commit 0abd102

Please sign in to comment.