feat(analyzer): support items_selector in infinite scroll detection
idiotWu committed Oct 21, 2024
1 parent 5600f7a commit 8c4976a
Showing 4 changed files with 112 additions and 136 deletions.
20 changes: 16 additions & 4 deletions npiai/tools/web/page_analyzer/app.py
@@ -92,7 +92,7 @@ def callback(is_next_page: bool):
2. Compare the old URL and the new URL to see if the page is navigated to the next page.
3. Compare the old title and the new title to see if the two pages are related.
4. Compare the first screenshot (the screenshot before clicking the pagination button) with the second screenshot (the screenshot after clicking the pagination button) to see if there are any differences.
5. Check if the previous page and the next page have the same structure but different content. If so, the pagination button is working.
5. Check if the previous page and the next page have the same structure but different content. If so, the pagination button is working. Note that opening or closing a popup/modal on the same page is not considered pagination.
6. If the pagination button is working, call the tool with `true`. Otherwise, call the tool with `false`.
"""
),
@@ -216,24 +216,35 @@ async def compute_common_selectors(
)

@function
async def support_infinite_scroll(self, url: str) -> bool:
async def support_infinite_scroll(
self,
url: str,
items_selector: str = None,
) -> bool:
"""
Open the given URL and determine whether the page supports infinite scroll.
Args:
url: URL of the page
items_selector: CSS selector of the items on the page
"""
# use long wait time for pages to be fully loaded
await self.load_page(url, wait=3000)

return await self.playwright.page.evaluate(
"""
() => {
(items_selector) => {
let mutateElementsCount = 0;
const threshold = 10;
const npiScrollObserver = new MutationObserver((records) => {
mutateElementsCount += records.length;
for (const record of records) {
for (const node of record.addedNodes) {
if (node.nodeType === Node.ELEMENT_NODE && node.matches(items_selector)) {
mutateElementsCount++;
}
}
}
});
npiScrollObserver.observe(
@@ -272,6 +283,7 @@ async def support_infinite_scroll(self, url: str) -> bool:
});
}
""",
items_selector or "*",
)

@function
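The net effect of the change above is that support_infinite_scroll now counts only newly added DOM nodes matching items_selector toward the mutation threshold, falling back to "*" (any added element) when no selector is given. A minimal usage sketch, assuming the PageAnalyzer API shown in this diff; the URL and the ".post-card" selector are illustrative only:

import asyncio

from npiai.tools.web.page_analyzer import PageAnalyzer


async def check_infinite_scroll():
    async with PageAnalyzer(headless=False) as analyzer:
        # Pass the selector of the repeated list items so that only matching
        # nodes added while scrolling count toward the mutation threshold.
        # Omitting items_selector falls back to "*" (any added element).
        has_infinite_scroll = await analyzer.support_infinite_scroll(
            url="https://example.com/blog",  # illustrative URL
            items_selector=".post-card",  # hypothetical item selector
        )
        print("Supports infinite scroll:", has_infinite_scroll)


if __name__ == "__main__":
    asyncio.run(check_infinite_scroll())
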
69 changes: 7 additions & 62 deletions npiai/tools/web/scraper/__test__/interactive.py
@@ -1,80 +1,25 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
from npiai import Context
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.scraper import Scraper
from utils import autos_scrape


async def main():
url = input("Enter the URL: ")
ctx = Context()

async with PageAnalyzer(headless=False) as analyzer:
scraper = Scraper(batch_size=10, playwright=analyzer.playwright)

print(f"Analyzing {url}:")

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
)

print(" - Pagination button:", pagination)

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
await autos_scrape(
ctx=Context(),
analyzer=analyzer,
scraper=scraper,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if not selectors:
return

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
output_columns=columns,
limit=10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))


if __name__ == "__main__":
asyncio.run(main())
79 changes: 9 additions & 70 deletions npiai/tools/web/scraper/__test__/twitter.py
@@ -1,83 +1,22 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.twitter import Twitter

# from npiai.utils.test_utils import DebugContext
from npiai import Context

url = "https://x.com/home"
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.scraper import Scraper
from npiai.tools.web.twitter import Twitter
from utils import autos_scrape


async def main():
ctx = Context()

async with Twitter(headless=False) as twitter:
analyzer = PageAnalyzer(playwright=twitter.playwright)
scraper = Scraper(batch_size=10, playwright=twitter.playwright)

print(f"Analyzing {url}:")

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
await autos_scrape(
ctx=Context(),
analyzer=PageAnalyzer(playwright=twitter.playwright),
scraper=Scraper(batch_size=10, playwright=twitter.playwright),
url="https://x.com/home",
)

print(" - Pagination button:", pagination)

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if not selectors:
return

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type="list-like",
ancestor_selector=selectors["ancestor"],
items_selector=selectors["items"],
output_columns=columns,
limit=10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))


if __name__ == "__main__":
asyncio.run(main())
80 changes: 80 additions & 0 deletions npiai/tools/web/scraper/__test__/utils.py
@@ -0,0 +1,80 @@
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
from npiai import Context


async def autos_scrape(
ctx: Context,
analyzer: PageAnalyzer,
scraper: Scraper,
url: str,
):
ancestor_selector = None
items_selector = None

print(f"Analyzing {url}:")

scraping_type = await analyzer.infer_scraping_type(
ctx=ctx,
url=url,
)

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
selectors = await analyzer.infer_similar_items_selector(ctx, url)

print(
" - Possible selectors:",
indent(json.dumps(selectors, indent=2), prefix=" ").lstrip(),
)

if selectors:
ancestor_selector = selectors["ancestor"]
items_selector = selectors["items"]

infinite_scroll = await analyzer.support_infinite_scroll(
url=url,
items_selector=items_selector,
)

print(" - Support infinite scroll:", infinite_scroll)

pagination_button_selector = await analyzer.get_pagination_button(
ctx=ctx,
url=url,
)

print(" - Pagination button:", pagination_button_selector)

columns = await scraper.infer_columns(
ctx=ctx,
url=url,
scraping_type=scraping_type,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
)

print(
" - Inferred columns:",
indent(json.dumps(columns, indent=2), prefix=" ").lstrip(),
)

stream = scraper.summarize_stream(
ctx=ctx,
url=url,
scraping_type=scraping_type,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
pagination_button_selector=pagination_button_selector,
output_columns=columns,
limit=1 if scraping_type == "single" else 10,
)

async for items in stream:
print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))
