Skip to content

Commit

Permalink
test(scraper): full-auto twitter scraping demo
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Oct 18, 2024
1 parent eeef547 commit 6084985
Showing 1 changed file with 78 additions and 0 deletions.
78 changes: 78 additions & 0 deletions npiai/tools/web/scraper/__test__/twitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.scraper import Scraper
from npiai.tools.web.page_analyzer import PageAnalyzer
from npiai.tools.web.twitter import Twitter

# from npiai.utils.test_utils import DebugContext
from npiai import Context

# Target page that main() passes to every analyzer and scraper call below.
url = "https://x.com/home"


async def main():
    """Run the full-auto Twitter scraping demo against ``url``.

    Opens a non-headless ``Twitter`` session and hands its Playwright handle
    to a ``PageAnalyzer`` and a ``Scraper``. The analyzer's findings for the
    page (infinite-scroll support, pagination button, inferred scraping type)
    are printed first. When the page is classified as ``"list-like"``, the
    item selectors and output columns are inferred and printed, and the
    summarized chunks streamed by the scraper are printed one by one.

    Returns early (printing nothing further) when the page is not list-like
    or when no selectors could be inferred.
    """
    # NOTE(review): nesting reconstructed on the assumption that everything
    # after the "list-like" check belongs to that branch, since ``selectors``
    # is only bound there -- confirm against the upstream file.
    context = Context()

    async with Twitter(headless=False) as twitter:
        page_analyzer = PageAnalyzer(playwright=twitter.playwright)
        page_scraper = Scraper(batch_size=10, playwright=twitter.playwright)

        print(f"Analyzing {url}:")

        supports_scroll = await page_analyzer.support_infinite_scroll(url=url)
        print(" - Support infinite scroll:", supports_scroll)

        next_button = await page_analyzer.get_pagination_button(
            ctx=context, url=url
        )
        print(" - Pagination button:", next_button)

        page_kind = await page_analyzer.infer_scraping_type(ctx=context, url=url)
        print(" - Inferred scraping type:", page_kind)

        # Only list-like pages go through selector/column inference.
        if page_kind != "list-like":
            return

        selectors = await page_analyzer.infer_similar_items_selector(context, url)
        selectors_json = json.dumps(selectors, indent=2)
        print(" - Possible selectors:", indent(selectors_json, " ").lstrip())

        # Nothing to scrape when the analyzer found no repeating items.
        if not selectors:
            return

        # Both scraper calls address the same items on the same page.
        item_location = {
            "ctx": context,
            "url": url,
            "ancestor_selector": selectors["ancestor"],
            "items_selector": selectors["items"],
        }

        columns = await page_scraper.infer_columns(**item_location)
        print("Inferred columns:", json.dumps(columns, indent=2))

        chunks = page_scraper.summarize_stream(
            **item_location,
            output_columns=columns,
            limit=10,
        )

        async for chunk in chunks:
            print("Chunk:", json.dumps(chunk, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    # Start the demo only when this file is executed directly as a script.
    demo = main()
    asyncio.run(demo)

0 comments on commit 6084985

Please sign in to comment.