refactor(page_analyzer): use most contentful elements to infer selectors
idiotWu committed Oct 18, 2024
1 parent 6b431c5 commit 9064abe
Showing 7 changed files with 83 additions and 91 deletions.
2 changes: 1 addition & 1 deletion npiai/core/browser/_playwright.py
@@ -12,7 +12,7 @@
     FileChooser,
 )

-__BROWSER_UTILS_VERSION__ = "0.0.4"
+__BROWSER_UTILS_VERSION__ = "0.0.8"


 def _prepare_browser_utils():
11 changes: 2 additions & 9 deletions npiai/core/tool/_browser.py
@@ -1,20 +1,13 @@
 import base64

-from markdownify import MarkdownConverter
 from playwright.async_api import ElementHandle, Error

 from npiai.core.browser import PlaywrightContext
-from npiai.utils import logger
+from npiai.utils import logger, html_to_markdown

 from ._function import FunctionTool, function


-class MdConverter(MarkdownConverter):
-    # skip <noscript> tags
-    def convert_noscript(self, _el, _text, _convert_as_inline):
-        return ""
-
-
 class BrowserTool(FunctionTool):
     use_screenshot: bool
     playwright: PlaywrightContext
@@ -41,7 +34,7 @@ def __init__(
     async def get_text(self):
         """Get the text content (as markdown) of the current page"""
        html = await self.playwright.page.evaluate("() => document.body.innerHTML")
-        return MdConverter().convert(html)
+        return html_to_markdown(html)

     async def start(self):
         """Start the Browser App"""
12 changes: 10 additions & 2 deletions npiai/tools/web/page_analyzer/__test__/full_tests.py
@@ -15,12 +15,20 @@
     "https://www.google.com/search?q=test&hl=ja",
     "https://www.amazon.com/s?k=test",
     "https://github.com/facebook/react/issues",
+    "https://github.com/facebook/react/issues/31207",
+    "https://www.amazon.co.jp/product-reviews/B0BX2C4WYX/",
+    "https://news.ycombinator.com/item?id=41853810",
+    "https://x.com/home",
 ]


 async def main():
     ctx = Context()
     async with PageAnalyzer(headless=False) as analyzer:
+        # with open(".cache/twitter_state.json") as f:
+        #     state = json.load(f)
+        #     await analyzer.playwright.context.add_cookies(state["cookies"])
+
         for url in urls:
             print(f"Analyzing {url}:")

@@ -45,11 +53,11 @@ async def main():
             print(" - Inferred scraping type:", scraping_type)

             if scraping_type == "list-like":
-                anchors = await analyzer.get_similar_items(ctx, url)
+                selectors = await analyzer.infer_similar_items_selector(ctx, url)

                 print(
                     " - Possible selectors:",
-                    indent(json.dumps(anchors, indent=2), " ").lstrip(),
+                    indent(json.dumps(selectors, indent=2), " ").lstrip(),
                 )

             print()
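For reference, a minimal standalone call of the renamed method might look like the sketch below; the `PageAnalyzer` import path and the target URL are illustrative assumptions, not taken from this commit:

import asyncio
import json

from npiai import Context
from npiai.tools.web.page_analyzer import PageAnalyzer


async def check_one_url():
    ctx = Context()

    async with PageAnalyzer(headless=True) as analyzer:
        # Returns the CommonSelectors dict (items / ancestor / anchors),
        # or None when no meaningful list of similar items is found.
        selectors = await analyzer.infer_similar_items_selector(
            ctx, "https://news.ycombinator.com/"
        )
        print(json.dumps(selectors, indent=2))


asyncio.run(check_one_url())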
88 changes: 37 additions & 51 deletions npiai/tools/web/page_analyzer/app.py
@@ -2,6 +2,7 @@
 from textwrap import dedent
 from typing import Literal, List
 from typing_extensions import TypedDict
+from playwright.async_api import TimeoutError


 from litellm.types.completion import (
@@ -11,7 +12,7 @@


 from npiai import BrowserTool, function, Context
-from npiai.utils import llm_tool_call
+from npiai.utils import llm_tool_call, html_to_markdown

 _ScrapingType = Literal["list-like", "single"]

@@ -33,7 +34,13 @@ class PageAnalyzer(BrowserTool):

     async def _load_page(self, url: str, wait: int = 1000):
         await self.playwright.page.goto(url)

+        # wait for the page to become stable
+        try:
+            await self.playwright.page.wait_for_load_state("networkidle", timeout=3000)
+        except TimeoutError:
+            pass
+
         await self.playwright.page.wait_for_timeout(wait)

     async def _validate_pagination(self, ctx: Context, selector: str) -> bool:
@@ -174,37 +181,26 @@ async def compute_common_selectors(
         Args:
             anchor_ids: An array of IDs of the elements that are similar to each other and represent a meaningful list of items.
         """
-        # print("anchor_ids:", anchor_ids)
+        print("anchor_ids:", anchor_ids)

         if not anchor_ids:
             return None

-        # extract the first 3 elements and expand their anchors
+        # extract the first 3 elements
         # to find common items and ancestor selector
         return await self.playwright.page.evaluate(
             """(anchorIds) => {
                 try {
                     const anchorElements = anchorIds.map(id => npi.getElement(id));
-                    const expandedAnchors = new Set(anchorElements.flatMap(el => {
-                        return npi.selectorUtils.expandAnchorFrom(el) || [];
-                    }));
-
-                    let selectors;
-
-                    if (expandedAnchors.size >= 2) {
-                        selectors = npi.selectorUtils.getCommonItemsAndAncestor(...expandedAnchors);
-                    } else {
-                        selectors = npi.selectorUtils.getCommonItemsAndAncestor(...anchorElements);
-                    }
+                    const selectors = npi.selectorUtils.getCommonItemsAndAncestor(...anchorElements);

                     if (!selectors) {
                         return null;
                     }

                     const splitSelectors = selectors.items.split(' ');
                     const lastSelector = splitSelectors.at(-1);
                     const isDirectChildren = splitSelectors.at(-2) === '>';

                     if (!lastSelector) {
                         return null;
@@ -219,20 +215,9 @@
                         return null;
                     }

                     const matches = [...document.querySelectorAll(selectors.items)];
-
-                    if (matches.length < 3 || matches.length > 1000) {
-                        return null;
-                    }
-
-                    const anchorsSelector = matches
-                        .slice(0, 3)
-                        .map(el => npi.getUniqueSelector(el))
-                        .join(", ");
-
                     return {
                         ...selectors,
-                        anchors: anchorsSelector,
+                        anchors: anchorElements.map(el => npi.getUniqueSelector(el)).join(', '),
                     }
                 } catch {
                     return null;
@@ -461,7 +446,11 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> _ScrapingType:
         return await self.set_scraping_type(**res.model_dump())

     @function
-    async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
+    async def infer_similar_items_selector(
+        self,
+        ctx: Context,
+        url: str,
+    ) -> CommonSelectors | None:
         """
         Open the given URL and determine whether there are similar elements representing a meaningful list of items. If there are, return the common selector of the similar elements, the ancestor selector, and the selectors of the anchor elements. Otherwise, return None.
@@ -475,22 +464,25 @@ async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
         page_url = await self.get_page_url()
         page_title = await self.get_page_title()
         raw_screenshot = await self.get_screenshot(full_page=True)
-        elements, _ = await self.get_interactive_elements(
-            screenshot=raw_screenshot,
-            full_page=True,
-        )
-        annotated_screenshot = await self.get_screenshot(full_page=True)

-        filtered_elements = []
+        contentful_elements = await self.playwright.page.evaluate(
+            """
+            (screenshot) => npi.getMostContentfulElements(screenshot)
+            """,
+            raw_screenshot,
+        )

-        for elem in elements:
-            if elem["role"] != "button" and (
-                len(elem["accessibleName"]) > 10
-                or len(elem["accessibleDescription"]) > 10
-            ):
-                filtered_elements.append(elem)
+        annotated_screenshot = await self.get_screenshot(full_page=True)

-        # print("filtered_elements:", filtered_elements)
+        elements_as_markdown = []
+
+        for el in contentful_elements:
+            elements_as_markdown.append(
+                {
+                    "id": el["id"],
+                    "content": html_to_markdown(el["html"]),
+                }
+            )

         res = await llm_tool_call(
             llm=ctx.llm,
@@ -507,28 +499,22 @@ async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
                 - An annotated screenshot of the target page where the interactive elements are surrounded with rectangular bounding boxes in different colors. At the top left of each bounding box is a small rectangle in the same color as the bounding box. This is the label and it contains a number indicating the ID of that box. The label number starts from 0.
                 - The URL of the page.
                 - The title of the page.
-                - An array of the interactive elements on the page. The elements are described as JSON objects defined in the Element Object section. Some irrelevant elements are filtered out.
+                - An array of the most contentful elements on the page. The elements are described as JSON objects defined in the Element Object section. Some irrelevant elements are filtered out.

                 ## Element Object

                 The original HTML elements are described as the following JSON objects:

                 type Element = {
                     id: string; // The Marker ID of the element
-                    tag: string; // The tag of the element
-                    role: string | null; // The WAI-ARIA accessible role of the element
-                    accessibleName: string; // The WAI-ARIA accessible name of the element
-                    accessibleDescription: string; // The WAI-ARIA accessible description of the element
-                    attributes: Record<string, string>; // Some helpful attributes of the element
-                    options?: string[]; // Available options of an <select> element. This property is only provided when the element is a <select> element.
+                    content: string; // The content of the element in Markdown format
                 }

                 ## Instructions

                 Follow the instructions to determine whether there is a pagination button on the current page for navigating to the next page:
                 1. Examine the screenshots, the URL, and the title of the page to understand the context, and then think about what the current page is.
-                2. Go through the elements array, pay attention to the `role`, `accessibleName`, and `accessibleDescription` properties to grab semantic information of the elements.
-                3. Check if there are similar elements representing **the most meaningful list** of items. Typically, these elements link to the detail pages of the items. Note that these elements should not be the pagination buttons and should contain enough meaningful information, not just some short phrases.
+                2. Go through the elements array, check if there are similar elements representing **the most meaningful list** of items. Typically, these elements link to the detail pages of the items. Note that these elements should not be the pagination buttons and should contain enough meaningful information, not just some short phrases.
                 4. If you find meaningful similar elements, call the tool with a list of the IDs of the elements to compute the common selectors. Otherwise, call the tool with an empty list.
                 """
             ),
@@ -542,7 +528,7 @@ async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
                         {
                             "url": page_url,
                             "title": page_title,
-                            "elements": filtered_elements,
+                            "elements": elements_as_markdown,
                         },
                         ensure_ascii=False,
                     ),
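Judging from how `contentful_elements` is consumed above, each entry returned by `npi.getMostContentfulElements` carries at least an `id` and the element's raw `html`. A hedged sketch of the markdown payload that now reaches the LLM (the sample HTML and IDs are invented for illustration):

from npiai.utils import html_to_markdown

# Hypothetical return value of npi.getMostContentfulElements;
# only the "id" and "html" fields are relied upon by the new code.
contentful_elements = [
    {"id": "0", "html": "<article><a href='/item/1'>First post</a></article>"},
    {"id": "1", "html": "<article><a href='/item/2'>Second post</a></article>"},
]

elements_as_markdown = [
    {"id": el["id"], "content": html_to_markdown(el["html"])}
    for el in contentful_elements
]
# e.g. -> [{'id': '0', 'content': '[First post](/item/1)'}, ...]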
29 changes: 1 addition & 28 deletions npiai/tools/web/scraper/app.py
@@ -5,48 +5,21 @@
 from typing_extensions import TypedDict, Annotated
 from textwrap import dedent

-from markdownify import MarkdownConverter
 from litellm.types.completion import (
     ChatCompletionSystemMessageParam,
     ChatCompletionUserMessageParam,
 )

 from npiai import function, BrowserTool, Context
 from npiai.core import NavigatorAgent
-from npiai.utils import is_cloud_env, llm_tool_call
+from npiai.utils import is_cloud_env, llm_tool_call, html_to_markdown


 class Column(TypedDict):
     name: Annotated[str, "Name of the column"]
     description: Annotated[str | None, "Brief description of the column"]


-class NonBase64ImageConverter(MarkdownConverter):
-    def convert_img(self, el, text, convert_as_inline):
-        src = el.attrs.get("src", "")
-
-        if not src:
-            return ""
-
-        if src.startswith("data:image"):
-            el.attrs["src"] = "<base64_image>"
-
-        return super().convert_img(el, text, convert_as_inline)
-
-    # def convert_div(self, el, text, convert_as_inline):
-    #     if text:
-    #         text = text.strip("\n")
-    #
-    #     if convert_as_inline or not text:
-    #         return text
-    #
-    #     return f"{text}\n"
-
-
-def html_to_markdown(html: str, **options) -> str:
-    return NonBase64ImageConverter(**options).convert(html).strip()
-
-
 class Scraper(BrowserTool):
     name = "scraper"
     description = (
2 changes: 2 additions & 0 deletions npiai/utils/__init__.py
@@ -8,6 +8,7 @@
 from .parse_json_response import parse_json_response
 from .llm_tool_call import llm_tool_call
 from .parse_npi_function import parse_npi_function
+from .html_to_markdown import html_to_markdown

 __all__ = [
     "logger",
@@ -20,4 +21,5 @@
     "parse_json_response",
     "llm_tool_call",
     "parse_npi_function",
+    "html_to_markdown",
 ]
30 changes: 30 additions & 0 deletions npiai/utils/html_to_markdown.py
@@ -0,0 +1,30 @@
+from markdownify import MarkdownConverter
+
+
+class CompactConverter(MarkdownConverter):
+    def convert_img(self, el, text, convert_as_inline):
+        src = el.attrs.get("src", "")
+
+        if not src:
+            return ""
+
+        if src.startswith("data:image"):
+            el.attrs["src"] = "<base64_image>"
+
+        return super().convert_img(el, text, convert_as_inline)
+
+    def convert_noscript(self, _el, _text, _convert_as_inline):
+        return ""
+
+    # def convert_div(self, el, text, convert_as_inline):
+    #     if text:
+    #         text = text.strip("\n")
+    #
+    #     if convert_as_inline or not text:
+    #         return text
+    #
+    #     return f"{text}\n"
+
+
+def html_to_markdown(html: str, **options) -> str:
+    return CompactConverter(**options).convert(html).strip()
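As a quick sanity check on the new utility, a hedged usage example; the exact markdown output may vary with the markdownify version and options:

from npiai.utils import html_to_markdown

html = (
    '<p>Logo: <img src="data:image/png;base64,iVBORw0KGgo=" alt="logo"></p>'
    "<noscript>Please enable JavaScript.</noscript>"
)

# Base64 image sources collapse to a placeholder, <noscript>
# blocks are dropped, and surrounding whitespace is stripped.
print(html_to_markdown(html))
# e.g. -> Logo: ![logo](<base64_image>)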
