feat(analyzer): support items_selector in infinite scroll detection
idiotWu committed Oct 23, 2024
1 parent 444c805 commit 267e013
Showing 4 changed files with 122 additions and 139 deletions.
33 changes: 26 additions & 7 deletions npiai/tools/web/page_analyzer/app.py
@@ -2,6 +2,7 @@
from textwrap import dedent
from typing import Literal, List
from typing_extensions import TypedDict
from playwright.async_api import Error as PlaywrightError


from litellm.types.completion import (
@@ -51,7 +52,12 @@ async def _validate_pagination(self, ctx: Context, selector: str) -> bool:
old_title = await self.get_page_title()

await self.clear_bboxes()
await self.click(elem)

try:
await self.click(elem)
except PlaywrightError:
return False

await self.playwright.page.wait_for_timeout(3000)

new_screenshot = await self.get_screenshot(full_page=True)
@@ -92,7 +98,7 @@ def callback(is_next_page: bool):
2. Compare the old URL and the new URL to see if the page has navigated to the next page.
3. Compare the old title and the new title to see if the two pages are related.
4. Compare the first screenshot (the screenshot before clicking the pagination button) with the second screenshot (the screenshot after clicking the pagination button) to see if there are any differences.
5. Check if the previous page and the next page have the same structure but different content. If so, the pagination button is working.
5. Check if the previous page and the next page have the same structure but different content. If so, the pagination button is working. Note that opening or closing a popup/modal on the same page is not considered pagination.
6. If the pagination button is working, call the tool with `true`. Otherwise, call the tool with `false`.
"""
),
@@ -161,7 +167,8 @@ async def get_selector_of_marker(self, marker_id: int = -1) -> str | None:
)

async def compute_common_selectors(
self, anchor_ids: List[int]
self,
anchor_ids: List[int],
) -> CommonSelectors | None:
"""
Expand the anchors with the given IDs and compute the common items and ancestor selector.
@@ -216,24 +223,35 @@ async def compute_common_selectors(
)

@function
async def support_infinite_scroll(self, url: str) -> bool:
async def support_infinite_scroll(
self,
url: str,
items_selector: str | None = None,
) -> bool:
"""
Open the given URL and determine whether the page supports infinite scroll.
Args:
url: URL of the page
items_selector: CSS selector of the items on the page; defaults to matching any element ("*")
"""
# use long wait time for pages to be fully loaded
await self.load_page(url, wait=3000)

return await self.playwright.page.evaluate(
"""
() => {
(items_selector) => {
let mutateElementsCount = 0;
const threshold = 10;
const threshold = items_selector === '*' ? 10 : 3;
const npiScrollObserver = new MutationObserver((records) => {
mutateElementsCount += records.length;
for (const record of records) {
for (const node of record.addedNodes) {
if (node.nodeType === Node.ELEMENT_NODE && node.matches(items_selector)) {
mutateElementsCount++;
}
}
}
});
npiScrollObserver.observe(
@@ -272,6 +290,7 @@ async def support_infinite_scroll(self, url: str) -> bool:
});
}
""",
items_selector or "*",
)

@function
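Editor's note: the updated detection script counts only added element nodes that match items_selector, and lowers the mutation threshold from 10 to 3 when a specific selector is supplied (the generic "*" probe keeps the old threshold). Below is a minimal standalone sketch of the same idea using Playwright's async API directly; the URL, selector, and function name are illustrative assumptions, not part of the tool's API.

import asyncio

from playwright.async_api import async_playwright

# JS mirrors the logic added in this commit: count added elements matching
# the selector, then resolve true once the threshold is reached.
DETECT_JS = """
(itemsSelector) => new Promise((resolve) => {
    let count = 0;
    // Fewer matching items are required when a specific selector is given.
    const threshold = itemsSelector === '*' ? 10 : 3;
    const observer = new MutationObserver((records) => {
        for (const record of records) {
            for (const node of record.addedNodes) {
                if (node.nodeType === Node.ELEMENT_NODE && node.matches(itemsSelector)) {
                    count++;
                }
            }
        }
    });
    observer.observe(document.body, { childList: true, subtree: true });
    // Trigger lazy loading, then check the counter after a short wait.
    window.scrollTo(0, document.body.scrollHeight);
    setTimeout(() => {
        observer.disconnect();
        resolve(count >= threshold);
    }, 3000);
})
"""


async def detects_infinite_scroll(url: str, items_selector: str | None = None) -> bool:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url, wait_until="networkidle")
        result = await page.evaluate(DETECT_JS, items_selector or "*")
        await browser.close()
        return result


if __name__ == "__main__":
    # Placeholder URL and selector for illustration only.
    print(asyncio.run(detects_infinite_scroll("https://example.com/feed", ".feed-item")))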
69 changes: 7 additions & 62 deletions npiai/tools/web/scraper/__test__/interactive.py
@@ -1,80 +1,25 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
from npiai import Context
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.scraper import Scraper
from utils import autos_scrape


async def main():
url = input("Enter the URL: ")
ctx = Context()

async with PageAnalyzer(headless=False) as analyzer:
scraper = Scraper(batch_size=10, playwright=analyzer.playwright)

print(f"Analyzing {url}:")

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
)

print(" - Pagination button:", pagination)

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
await autos_scrape(
ctx=Context(),
analyzer=analyzer,
scraper=scraper,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if not selectors:
return

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
output_columns=columns,
limit=10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))


if __name__ == "__main__":
asyncio.run(main())
79 changes: 9 additions & 70 deletions npiai/tools/web/scraper/__test__/twitter.py
@@ -1,83 +1,22 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.twitter import Twitter

# from npiai.utils.test_utils import DebugContext
from npiai import Context

url = "https://x.com/home"
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.scraper import Scraper
from npiai.tools.web.twitter import Twitter
from utils import autos_scrape


async def main():
ctx = Context()

async with Twitter(headless=False) as twitter:
analyzer = PageAnalyzer(playwright=twitter.playwright)
scraper = Scraper(batch_size=10, playwright=twitter.playwright)

print(f"Analyzing {url}:")

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
await autos_scrape(
ctx=Context(),
analyzer=PageAnalyzer(playwright=twitter.playwright),
scraper=Scraper(batch_size=10, playwright=twitter.playwright),
url="https://x.com/home",
)

print(" - Pagination button:", pagination)

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if not selectors:
return

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
output_columns=columns,
limit=10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))


if __name__ == "__main__":
asyncio.run(main())
80 changes: 80 additions & 0 deletions npiai/tools/web/scraper/__test__/utils.py
@@ -0,0 +1,80 @@
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
from npiai import Context


async def autos_scrape(
ctx: Context,
analyzer: PageAnalyzer,
scraper: Scraper,
url: str,
):
ancestor_selector = None
items_selector = None

print(f"Analyzing {url}:")

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if selectors:
ancestor_selector = selectors["ancestor"]
items_selector = selectors["items"]

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
items_selector=items_selector,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination_button_selector = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
)

print(" - Pagination button:", pagination_button_selector)

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type=scraping_type,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type=scraping_type,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
pagination_button_selector=pagination_button_selector,
output_columns=columns,
limit=1 if scraping_type == "single" else 10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))
