feat(analyzer): support items_selector in infinite scroll detection
idiotWu committed Oct 23, 2024
1 parent 444c805 commit 267e013
Showing 4 changed files with 122 additions and 139 deletions.
33 changes: 26 additions & 7 deletions npiai/tools/web/page_analyzer/app.py
@@ -2,6 +2,7 @@
from textwrap import dedent
from typing import Literal, List
from typing_extensions import TypedDict
from playwright.async_api import Error as PlaywrightError


from litellm.types.completion import (
@@ -51,7 +52,12 @@ async def _validate_pagination(self, ctx: Context, selector: str) -> bool:
old_title = await self.get_page_title()

await self.clear_bboxes()
await self.click(elem)

try:
await self.click(elem)
except PlaywrightError:
return False

await self.playwright.page.wait_for_timeout(3000)

new_screenshot = await self.get_screenshot(full_page=True)
@@ -92,7 +98,7 @@ def callback(is_next_page: bool):
2. Compare the old URL and the new URL to see if the page has navigated to the next page.
3. Compare the old title and the new title to see if the two pages are related.
4. Compare the first screenshot (the screenshot before clicking the pagination button) with the second screenshot (the screenshot after clicking the pagination button) to see if there are any differences.
5. Check if the previous page and the next page have the same structure but different content. If so, the pagination button is working.
5. Check if the previous page and the next page have the same structure but different content. If so, the pagination button is working. Note that opening or closing a popup/modal on the same page is not considered pagination.
6. If the pagination button is working, call the tool with `true`. Otherwise, call the tool with `false`.
"""
),
@@ -161,7 +167,8 @@ async def get_selector_of_marker(self, marker_id: int = -1) -> str | None:
)

async def compute_common_selectors(
self, anchor_ids: List[int]
self,
anchor_ids: List[int],
) -> CommonSelectors | None:
"""
Expand the anchors with the given IDs and compute the common items and ancestor selector.
@@ -216,24 +223,35 @@ async def compute_common_selectors(
)

@function
async def support_infinite_scroll(self, url: str) -> bool:
async def support_infinite_scroll(
self,
url: str,
items_selector: str | None = None,
) -> bool:
"""
Open the given URL and determine whether the page supports infinite scroll.
Args:
url: URL of the page
items_selector: CSS selector of the items on the page; defaults to matching any element ("*")
"""
# use long wait time for pages to be fully loaded
await self.load_page(url, wait=3000)

return await self.playwright.page.evaluate(
"""
() => {
(items_selector) => {
let mutateElementsCount = 0;
const threshold = 10;
const threshold = items_selector === '*' ? 10 : 3;
const npiScrollObserver = new MutationObserver((records) => {
mutateElementsCount += records.length;
for (const record of records) {
for (const node of record.addedNodes) {
if (node.nodeType === Node.ELEMENT_NODE && node.matches(items_selector)) {
mutateElementsCount++;
}
}
}
});
npiScrollObserver.observe(
@@ -272,6 +290,7 @@ async def support_infinite_scroll(self, url: str) -> bool:
});
}
""",
items_selector or "*",
)

@function
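Editor's note: the updated detection script counts only added element nodes that match items_selector, and lowers the mutation threshold from 10 to 3 when a specific selector is supplied (the generic "*" probe keeps the old threshold). Below is a minimal standalone sketch of the same idea using Playwright's async API directly; the URL, selector, and function name are illustrative assumptions, not part of the tool's API.

import asyncio

from playwright.async_api import async_playwright

# JS mirrors the logic added in this commit: count added elements matching
# the selector, then resolve true once the threshold is reached.
DETECT_JS = """
(itemsSelector) => new Promise((resolve) => {
    let count = 0;
    // Fewer matching items are required when a specific selector is given.
    const threshold = itemsSelector === '*' ? 10 : 3;
    const observer = new MutationObserver((records) => {
        for (const record of records) {
            for (const node of record.addedNodes) {
                if (node.nodeType === Node.ELEMENT_NODE && node.matches(itemsSelector)) {
                    count++;
                }
            }
        }
    });
    observer.observe(document.body, { childList: true, subtree: true });
    // Trigger lazy loading, then check the counter after a short wait.
    window.scrollTo(0, document.body.scrollHeight);
    setTimeout(() => {
        observer.disconnect();
        resolve(count >= threshold);
    }, 3000);
})
"""


async def detects_infinite_scroll(url: str, items_selector: str | None = None) -> bool:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url, wait_until="networkidle")
        result = await page.evaluate(DETECT_JS, items_selector or "*")
        await browser.close()
        return result


if __name__ == "__main__":
    # Placeholder URL and selector for illustration only.
    print(asyncio.run(detects_infinite_scroll("https://example.com/feed", ".feed-item")))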
69 changes: 7 additions & 62 deletions npiai/tools/web/scraper/__test__/interactive.py
@@ -1,80 +1,25 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
from npiai import Context
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.scraper import Scraper
from utils import autos_scrape


async def main():
url = input("Enter the URL: ")
ctx = Context()

async with PageAnalyzer(headless=False) as analyzer:
scraper = Scraper(batch_size=10, playwright=analyzer.playwright)

print(f"Analyzing {url}:")

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
)

print(" - Pagination button:", pagination)

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
await autos_scrape(
ctx=Context(),
analyzer=analyzer,
scraper=scraper,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if not selectors:
return

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
output_columns=columns,
limit=10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))


if __name__ == "__main__":
asyncio.run(main())
79 changes: 9 additions & 70 deletions npiai/tools/web/scraper/__test__/twitter.py
@@ -1,83 +1,22 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.twitter import Twitter

# from npiai.utils.test_utils import DebugContext
from npiai import Context

url = "https://x.com/home"
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.scraper import Scraper
from npiai.tools.web.twitter import Twitter
from utils import autos_scrape


async def main():
ctx = Context()

async with Twitter(headless=False) as twitter:
analyzer = PageAnalyzer(playwright=twitter.playwright)
scraper = Scraper(batch_size=10, playwright=twitter.playwright)

print(f"Analyzing {url}:")

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
await autos_scrape(
ctx=Context(),
analyzer=PageAnalyzer(playwright=twitter.playwright),
scraper=Scraper(batch_size=10, playwright=twitter.playwright),
url="https://x.com/home",
)

print(" - Pagination button:", pagination)

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if not selectors:
return

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
output_columns=columns,
limit=10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))


if __name__ == "__main__":
asyncio.run(main())
80 changes: 80 additions & 0 deletions npiai/tools/web/scraper/__test__/utils.py
@@ -0,0 +1,80 @@
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
from npiai import Context


async def autos_scrape(
ctx: Context,
analyzer: PageAnalyzer,
scraper: Scraper,
url: str,
):
ancestor_selector = None
items_selector = None

print(f"Analyzing {url}:")

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if selectors:
ancestor_selector = selectors["ancestor"]
items_selector = selectors["items"]

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
items_selector=items_selector,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination_button_selector = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
)

print(" - Pagination button:", pagination_button_selector)

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type=scraping_type,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type=scraping_type,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
pagination_button_selector=pagination_button_selector,
output_columns=columns,
limit=1 if scraping_type == "single" else 10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))
