refactor(page_analyzer): use most contentful elements to infer selectors
idiotWu committed Oct 18, 2024
1 parent 6b431c5 commit 9064abe
Showing 7 changed files with 83 additions and 91 deletions.
2 changes: 1 addition & 1 deletion npiai/core/browser/_playwright.py
@@ -12,7 +12,7 @@
     FileChooser,
 )

-__BROWSER_UTILS_VERSION__ = "0.0.4"
+__BROWSER_UTILS_VERSION__ = "0.0.8"


 def _prepare_browser_utils():
11 changes: 2 additions & 9 deletions npiai/core/tool/_browser.py
@@ -1,20 +1,13 @@
 import base64

-from markdownify import MarkdownConverter
 from playwright.async_api import ElementHandle, Error

 from npiai.core.browser import PlaywrightContext
-from npiai.utils import logger
+from npiai.utils import logger, html_to_markdown

 from ._function import FunctionTool, function


-class MdConverter(MarkdownConverter):
-    # skip <noscript> tags
-    def convert_noscript(self, _el, _text, _convert_as_inline):
-        return ""
-
-
 class BrowserTool(FunctionTool):
     use_screenshot: bool
     playwright: PlaywrightContext
@@ -41,7 +34,7 @@ def __init__(
     async def get_text(self):
         """Get the text content (as markdown) of the current page"""
        html = await self.playwright.page.evaluate("() => document.body.innerHTML")
-        return MdConverter().convert(html)
+        return html_to_markdown(html)

     async def start(self):
         """Start the Browser App"""
12 changes: 10 additions & 2 deletions npiai/tools/web/page_analyzer/__test__/full_tests.py
@@ -15,12 +15,20 @@
     "https://www.google.com/search?q=test&hl=ja",
     "https://www.amazon.com/s?k=test",
     "https://github.com/facebook/react/issues",
+    "https://github.com/facebook/react/issues/31207",
+    "https://www.amazon.co.jp/product-reviews/B0BX2C4WYX/",
+    "https://news.ycombinator.com/item?id=41853810",
+    "https://x.com/home",
 ]


 async def main():
     ctx = Context()
     async with PageAnalyzer(headless=False) as analyzer:
+        # with open(".cache/twitter_state.json") as f:
+        #     state = json.load(f)
+        #     await analyzer.playwright.context.add_cookies(state["cookies"])
+
         for url in urls:
             print(f"Analyzing {url}:")

@@ -45,11 +53,11 @@ async def main():
             print(" - Inferred scraping type:", scraping_type)

             if scraping_type == "list-like":
-                anchors = await analyzer.get_similar_items(ctx, url)
+                selectors = await analyzer.infer_similar_items_selector(ctx, url)

                 print(
                     " - Possible selectors:",
-                    indent(json.dumps(anchors, indent=2), " ").lstrip(),
+                    indent(json.dumps(selectors, indent=2), " ").lstrip(),
                 )

             print()
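For reference, a minimal standalone call of the renamed method might look like the sketch below; the `PageAnalyzer` import path and the target URL are illustrative assumptions, not taken from this commit:

import asyncio
import json

from npiai import Context
from npiai.tools.web.page_analyzer import PageAnalyzer


async def check_one_url():
    ctx = Context()

    async with PageAnalyzer(headless=True) as analyzer:
        # Returns the CommonSelectors dict (items / ancestor / anchors),
        # or None when no meaningful list of similar items is found.
        selectors = await analyzer.infer_similar_items_selector(
            ctx, "https://news.ycombinator.com/"
        )
        print(json.dumps(selectors, indent=2))


asyncio.run(check_one_url())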
88 changes: 37 additions & 51 deletions npiai/tools/web/page_analyzer/app.py
@@ -2,6 +2,7 @@
 from textwrap import dedent
 from typing import Literal, List
 from typing_extensions import TypedDict
+from playwright.async_api import TimeoutError


 from litellm.types.completion import (
@@ -11,7 +12,7 @@


 from npiai import BrowserTool, function, Context
-from npiai.utils import llm_tool_call
+from npiai.utils import llm_tool_call, html_to_markdown

 _ScrapingType = Literal["list-like", "single"]

@@ -33,7 +34,13 @@ class PageAnalyzer(BrowserTool):

     async def _load_page(self, url: str, wait: int = 1000):
         await self.playwright.page.goto(url)

+        # wait for the page to become stable
+        try:
+            await self.playwright.page.wait_for_load_state("networkidle", timeout=3000)
+        except TimeoutError:
+            pass
+
         await self.playwright.page.wait_for_timeout(wait)

     async def _validate_pagination(self, ctx: Context, selector: str) -> bool:
@@ -174,37 +181,26 @@ async def compute_common_selectors(
         Args:
             anchor_ids: An array of IDs of the elements that are similar to each other and represent a meaningful list of items.
         """
-        # print("anchor_ids:", anchor_ids)
+        print("anchor_ids:", anchor_ids)

         if not anchor_ids:
             return None

-        # extract the first 3 elements and expand their anchors
+        # extract the first 3 elements
         # to find common items and ancestor selector
         return await self.playwright.page.evaluate(
             """(anchorIds) => {
                 try {
                     const anchorElements = anchorIds.map(id => npi.getElement(id));
-                    const expandedAnchors = new Set(anchorElements.flatMap(el => {
-                        return npi.selectorUtils.expandAnchorFrom(el) || [];
-                    }));
-
-                    let selectors;
-
-                    if (expandedAnchors.size >= 2) {
-                        selectors = npi.selectorUtils.getCommonItemsAndAncestor(...expandedAnchors);
-                    } else {
-                        selectors = npi.selectorUtils.getCommonItemsAndAncestor(...anchorElements);
-                    }
+                    const selectors = npi.selectorUtils.getCommonItemsAndAncestor(...anchorElements);

                     if (!selectors) {
                         return null;
                     }

                     const splitSelectors = selectors.items.split(' ');
                     const lastSelector = splitSelectors.at(-1);
                     const isDirectChildren = splitSelectors.at(-2) === '>';

                     if (!lastSelector) {
                         return null;
@@ -219,20 +215,9 @@
                         return null;
                     }

                     const matches = [...document.querySelectorAll(selectors.items)];
-
-                    if (matches.length < 3 || matches.length > 1000) {
-                        return null;
-                    }
-
-                    const anchorsSelector = matches
-                        .slice(0, 3)
-                        .map(el => npi.getUniqueSelector(el))
-                        .join(", ");
-
                     return {
                         ...selectors,
-                        anchors: anchorsSelector,
+                        anchors: anchorElements.map(el => npi.getUniqueSelector(el)).join(', '),
                     }
                 } catch {
                     return null;
@@ -461,7 +446,11 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> _ScrapingType:
         return await self.set_scraping_type(**res.model_dump())

     @function
-    async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
+    async def infer_similar_items_selector(
+        self,
+        ctx: Context,
+        url: str,
+    ) -> CommonSelectors | None:
         """
         Open the given URL and determine whether there are similar elements representing a meaningful list of items. If there are, return the common selector of the similar elements, the ancestor selector, and the selectors of the anchor elements. Otherwise, return None.
@@ -475,22 +464,25 @@ async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
         page_url = await self.get_page_url()
         page_title = await self.get_page_title()
         raw_screenshot = await self.get_screenshot(full_page=True)
-        elements, _ = await self.get_interactive_elements(
-            screenshot=raw_screenshot,
-            full_page=True,
-        )
-        annotated_screenshot = await self.get_screenshot(full_page=True)

-        filtered_elements = []
+        contentful_elements = await self.playwright.page.evaluate(
+            """
+            (screenshot) => npi.getMostContentfulElements(screenshot)
+            """,
+            raw_screenshot,
+        )

-        for elem in elements:
-            if elem["role"] != "button" and (
-                len(elem["accessibleName"]) > 10
-                or len(elem["accessibleDescription"]) > 10
-            ):
-                filtered_elements.append(elem)
+        annotated_screenshot = await self.get_screenshot(full_page=True)

-        # print("filtered_elements:", filtered_elements)
+        elements_as_markdown = []
+
+        for el in contentful_elements:
+            elements_as_markdown.append(
+                {
+                    "id": el["id"],
+                    "content": html_to_markdown(el["html"]),
+                }
+            )

         res = await llm_tool_call(
             llm=ctx.llm,
@@ -507,28 +499,22 @@ async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
                 - An annotated screenshot of the target page where the interactive elements are surrounded with rectangular bounding boxes in different colors. At the top left of each bounding box is a small rectangle in the same color as the bounding box. This is the label and it contains a number indicating the ID of that box. The label number starts from 0.
                 - The URL of the page.
                 - The title of the page.
-                - An array of the interactive elements on the page. The elements are described as JSON objects defined in the Element Object section. Some irrelevant elements are filtered out.
+                - An array of the most contentful elements on the page. The elements are described as JSON objects defined in the Element Object section. Some irrelevant elements are filtered out.

                 ## Element Object

                 The original HTML elements are described as the following JSON objects:

                 type Element = {
                     id: string; // The Marker ID of the element
-                    tag: string; // The tag of the element
-                    role: string | null; // The WAI-ARIA accessible role of the element
-                    accessibleName: string; // The WAI-ARIA accessible name of the element
-                    accessibleDescription: string; // The WAI-ARIA accessible description of the element
-                    attributes: Record<string, string>; // Some helpful attributes of the element
-                    options?: string[]; // Available options of an <select> element. This property is only provided when the element is a <select> element.
+                    content: string; // The content of the element in Markdown format
                 }

                 ## Instructions

                 Follow the instructions to determine whether there is a pagination button on the current page for navigating to the next page:
                 1. Examine the screenshots, the URL, and the title of the page to understand the context, and then think about what the current page is.
-                2. Go through the elements array, pay attention to the `role`, `accessibleName`, and `accessibleDescription` properties to grab semantic information of the elements.
-                3. Check if there are similar elements representing **the most meaningful list** of items. Typically, these elements link to the detail pages of the items. Note that these elements should not be the pagination buttons and should contain enough meaningful information, not just some short phrases.
+                2. Go through the elements array, check if there are similar elements representing **the most meaningful list** of items. Typically, these elements link to the detail pages of the items. Note that these elements should not be the pagination buttons and should contain enough meaningful information, not just some short phrases.
                 4. If you find meaningful similar elements, call the tool with a list of the IDs of the elements to compute the common selectors. Otherwise, call the tool with an empty list.
                 """
             ),
@@ -542,7 +528,7 @@ async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
                         {
                             "url": page_url,
                             "title": page_title,
-                            "elements": filtered_elements,
+                            "elements": elements_as_markdown,
                         },
                         ensure_ascii=False,
                     ),
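Judging from how `contentful_elements` is consumed above, each entry returned by `npi.getMostContentfulElements` carries at least an `id` and the element's raw `html`. A hedged sketch of the markdown payload that now reaches the LLM (the sample HTML and IDs are invented for illustration):

from npiai.utils import html_to_markdown

# Hypothetical return value of npi.getMostContentfulElements;
# only the "id" and "html" fields are relied upon by the new code.
contentful_elements = [
    {"id": "0", "html": "<article><a href='/item/1'>First post</a></article>"},
    {"id": "1", "html": "<article><a href='/item/2'>Second post</a></article>"},
]

elements_as_markdown = [
    {"id": el["id"], "content": html_to_markdown(el["html"])}
    for el in contentful_elements
]
# e.g. -> [{'id': '0', 'content': '[First post](/item/1)'}, ...]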
29 changes: 1 addition & 28 deletions npiai/tools/web/scraper/app.py
@@ -5,48 +5,21 @@
 from typing_extensions import TypedDict, Annotated
 from textwrap import dedent

-from markdownify import MarkdownConverter
 from litellm.types.completion import (
     ChatCompletionSystemMessageParam,
     ChatCompletionUserMessageParam,
 )

 from npiai import function, BrowserTool, Context
 from npiai.core import NavigatorAgent
-from npiai.utils import is_cloud_env, llm_tool_call
+from npiai.utils import is_cloud_env, llm_tool_call, html_to_markdown


 class Column(TypedDict):
     name: Annotated[str, "Name of the column"]
     description: Annotated[str | None, "Brief description of the column"]


-class NonBase64ImageConverter(MarkdownConverter):
-    def convert_img(self, el, text, convert_as_inline):
-        src = el.attrs.get("src", "")
-
-        if not src:
-            return ""
-
-        if src.startswith("data:image"):
-            el.attrs["src"] = "<base64_image>"
-
-        return super().convert_img(el, text, convert_as_inline)
-
-    # def convert_div(self, el, text, convert_as_inline):
-    #     if text:
-    #         text = text.strip("\n")
-    #
-    #     if convert_as_inline or not text:
-    #         return text
-    #
-    #     return f"{text}\n"
-
-
-def html_to_markdown(html: str, **options) -> str:
-    return NonBase64ImageConverter(**options).convert(html).strip()
-
-
 class Scraper(BrowserTool):
     name = "scraper"
     description = (
2 changes: 2 additions & 0 deletions npiai/utils/__init__.py
@@ -8,6 +8,7 @@
 from .parse_json_response import parse_json_response
 from .llm_tool_call import llm_tool_call
 from .parse_npi_function import parse_npi_function
+from .html_to_markdown import html_to_markdown

 __all__ = [
     "logger",
@@ -20,4 +21,5 @@
     "parse_json_response",
     "llm_tool_call",
     "parse_npi_function",
+    "html_to_markdown",
 ]
30 changes: 30 additions & 0 deletions npiai/utils/html_to_markdown.py
@@ -0,0 +1,30 @@
+from markdownify import MarkdownConverter
+
+
+class CompactConverter(MarkdownConverter):
+    def convert_img(self, el, text, convert_as_inline):
+        src = el.attrs.get("src", "")
+
+        if not src:
+            return ""
+
+        if src.startswith("data:image"):
+            el.attrs["src"] = "<base64_image>"
+
+        return super().convert_img(el, text, convert_as_inline)
+
+    def convert_noscript(self, _el, _text, _convert_as_inline):
+        return ""
+
+    # def convert_div(self, el, text, convert_as_inline):
+    #     if text:
+    #         text = text.strip("\n")
+    #
+    #     if convert_as_inline or not text:
+    #         return text
+    #
+    #     return f"{text}\n"
+
+
+def html_to_markdown(html: str, **options) -> str:
+    return CompactConverter(**options).convert(html).strip()
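As a quick sanity check on the new utility, a hedged usage example; the exact markdown output may vary with the markdownify version and options:

from npiai.utils import html_to_markdown

html = (
    '<p>Logo: <img src="data:image/png;base64,iVBORw0KGgo=" alt="logo"></p>'
    "<noscript>Please enable JavaScript.</noscript>"
)

# Base64 image sources collapse to a placeholder, <noscript>
# blocks are dropped, and surrounding whitespace is stripped.
print(html_to_markdown(html))
# e.g. -> Logo: ![logo](<base64_image>)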
