feat(analyzer): support items_selector in infinite scroll detection
idiotWu committed Oct 21, 2024
1 parent 5600f7a commit 8c4976a
Showing 4 changed files with 112 additions and 136 deletions.
20 changes: 16 additions & 4 deletions npiai/tools/web/page_analyzer/app.py
@@ -92,7 +92,7 @@ def callback(is_next_page: bool):
2. Compare the old URL and the new URL to see if the page is navigated to the next page.
3. Compare the old title and the new title to see if the two pages are related.
4. Compare the first screenshot (the screenshot before clicking the pagination button) with the second screenshot (the screenshot after clicking the pagination button) to see if there are any differences.
5. Check if the previous page and the next page have the same structure but different content. If so, the pagination button is working.
5. Check if the previous page and the next page have the same structure but different content. If so, the pagination button is working. Note that opening or closing a popup/modal on the same page is not considered pagination.
6. If the pagination button is working, call the tool with `true`. Otherwise, call the tool with `false`.
"""
),
@@ -216,24 +216,35 @@ async def compute_common_selectors(
)

@function
async def support_infinite_scroll(self, url: str) -> bool:
async def support_infinite_scroll(
self,
url: str,
items_selector: str = None,
) -> bool:
"""
Open the given URL and determine whether the page supports infinite scroll.
Args:
url: URL of the page
items_selector: CSS selector of the items on the page
"""
# use long wait time for pages to be fully loaded
await self.load_page(url, wait=3000)

return await self.playwright.page.evaluate(
"""
() => {
(items_selector) => {
let mutateElementsCount = 0;
const threshold = 10;
const npiScrollObserver = new MutationObserver((records) => {
mutateElementsCount += records.length;
for (const record of records) {
for (const node of record.addedNodes) {
if (node.nodeType === Node.ELEMENT_NODE && node.matches(items_selector)) {
mutateElementsCount++;
}
}
}
});
npiScrollObserver.observe(
@@ -272,6 +283,7 @@ async def support_infinite_scroll(self, url: str) -> bool:
});
}
""",
items_selector or "*",
)

@function
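The net effect of the change above is that support_infinite_scroll now counts only newly added DOM nodes matching items_selector toward the mutation threshold, falling back to "*" (any added element) when no selector is given. A minimal usage sketch, assuming the PageAnalyzer API shown in this diff; the URL and the ".post-card" selector are illustrative only:

import asyncio

from npiai.tools.web.page_analyzer import PageAnalyzer


async def check_infinite_scroll():
    async with PageAnalyzer(headless=False) as analyzer:
        # Pass the selector of the repeated list items so that only matching
        # nodes added while scrolling count toward the mutation threshold.
        # Omitting items_selector falls back to "*" (any added element).
        has_infinite_scroll = await analyzer.support_infinite_scroll(
            url="https://example.com/blog",  # illustrative URL
            items_selector=".post-card",  # hypothetical item selector
        )
        print("Supports infinite scroll:", has_infinite_scroll)


if __name__ == "__main__":
    asyncio.run(check_infinite_scroll())
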
69 changes: 7 additions & 62 deletions npiai/tools/web/scraper/__test__/interactive.py
@@ -1,80 +1,25 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
from npiai import Context
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.scraper import Scraper
from utils import autos_scrape


async def main():
url = input("Enter the URL: ")
ctx = Context()

async with PageAnalyzer(headless=False) as analyzer:
scraper = Scraper(batch_size=10, playwright=analyzer.playwright)

print(f"Analyzing {url}:")

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
)

print(" - Pagination button:", pagination)

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
await autos_scrape(
ctx=Context(),
analyzer=analyzer,
scraper=scraper,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if not selectors:
return

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
output_columns=columns,
limit=10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))


if __name__ == "__main__":
asyncio.run(main())
79 changes: 9 additions & 70 deletions npiai/tools/web/scraper/__test__/twitter.py
@@ -1,83 +1,22 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.twitter import Twitter

# from npiai.utils.test_utils import DebugContext
from npiai import Context

url = "https://x.com/home"
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.scraper import Scraper
from npiai.tools.web.twitter import Twitter
from utils import autos_scrape


async def main():
ctx = Context()

async with Twitter(headless=False) as twitter:
analyzer = PageAnalyzer(playwright=twitter.playwright)
scraper = Scraper(batch_size=10, playwright=twitter.playwright)

print(f"Analyzing {url}:")

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
await autos_scrape(
ctx=Context(),
analyzer=PageAnalyzer(playwright=twitter.playwright),
scraper=Scraper(batch_size=10, playwright=twitter.playwright),
url="https://x.com/home",
)

print(" - Pagination button:", pagination)

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if not selectors:
return

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
output_columns=columns,
limit=10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))


if __name__ == "__main__":
asyncio.run(main())
80 changes: 80 additions & 0 deletions npiai/tools/web/scraper/__test__/utils.py
@@ -0,0 +1,80 @@
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
from npiai import Context


async def autos_scrape(
ctx: Context,
analyzer: PageAnalyzer,
scraper: Scraper,
url: str,
):
ancestor_selector = None
items_selector = None

print(f"Analyzing {url}:")

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if selectors:
ancestor_selector = selectors["ancestor"]
items_selector = selectors["items"]

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
items_selector=items_selector,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination_button_selector = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
)

print(" - Pagination button:", pagination_button_selector)

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type=scraping_type,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type=scraping_type,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
pagination_button_selector=pagination_button_selector,
output_columns=columns,
limit=1 if scraping_type == "single" else 10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))
