From 6084985de8817711830965c8a81fd4bb1a8a5450 Mon Sep 17 00:00:00 2001
From: Daofeng Wu
Date: Fri, 18 Oct 2024 16:31:25 +0900
Subject: [PATCH] test(scraper): full-auto twitter scraping demo

---
 npiai/tools/web/scraper/__test__/twitter.py | 78 +++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 npiai/tools/web/scraper/__test__/twitter.py

diff --git a/npiai/tools/web/scraper/__test__/twitter.py b/npiai/tools/web/scraper/__test__/twitter.py
new file mode 100644
index 00000000..26ec0e6a
--- /dev/null
+++ b/npiai/tools/web/scraper/__test__/twitter.py
@@ -0,0 +1,78 @@
+import asyncio
+import json
+from textwrap import indent
+
+from npiai.tools.web.scraper import Scraper
+from npiai.tools.web.page_analyzer import PageAnalyzer
+from npiai.tools.web.twitter import Twitter
+
+# from npiai.utils.test_utils import DebugContext
+from npiai import Context
+
+url = "https://x.com/home"
+
+
+async def main():
+    ctx = Context()
+
+    async with Twitter(headless=False) as twitter:
+        analyzer = PageAnalyzer(playwright=twitter.playwright)
+        scraper = Scraper(batch_size=10, playwright=twitter.playwright)
+
+        print(f"Analyzing {url}:")
+
+        infinite_scroll = await analyzer.support_infinite_scroll(
+            url=url,
+        )
+
+        print(" - Support infinite scroll:", infinite_scroll)
+
+        pagination = await analyzer.get_pagination_button(
+            ctx=ctx,
+            url=url,
+        )
+
+        print(" - Pagination button:", pagination)
+
+        scraping_type = await analyzer.infer_scraping_type(
+            ctx=ctx,
+            url=url,
+        )
+
+        print(" - Inferred scraping type:", scraping_type)
+
+        if scraping_type == "list-like":
+            selectors = await analyzer.infer_similar_items_selector(ctx, url)
+
+            print(
+                " - Possible selectors:",
+                indent(json.dumps(selectors, indent=2), " ").lstrip(),
+            )
+
+            if not selectors:
+                return
+
+            columns = await scraper.infer_columns(
+                ctx=ctx,
+                url=url,
+                ancestor_selector=selectors["ancestor"],
+                items_selector=selectors["items"],
+            )
+
+            print("Inferred columns:", json.dumps(columns, indent=2))
+
+            stream = scraper.summarize_stream(
+                ctx=ctx,
+                url=url,
+                ancestor_selector=selectors["ancestor"],
+                items_selector=selectors["items"],
+                output_columns=columns,
+                limit=10,
+            )
+
+            async for items in stream:
+                print("Chunk:", json.dumps(items, indent=2, ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
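
Note on consuming the stream: the demo prints each chunk as JSON. Below is a
minimal sketch of collecting the same chunks into a CSV file instead. It assumes
only what the patch shows: that summarize_stream yields lists of dict rows and
that infer_columns produces the column names those rows are keyed by. The
save_stream_to_csv helper and the flat-list-of-names assumption about `columns`
are illustrative, not part of the npiai API.

    import csv

    async def save_stream_to_csv(stream, columns, path="tweets.csv"):
        # `stream` and `columns` are the objects produced in main() above.
        # Assumption: `columns` is a flat list of column-name strings; if
        # infer_columns() returns richer objects, extract the name field first.
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction="ignore")
            writer.writeheader()
            async for items in stream:
                # Each chunk is assumed to be a list of dicts, matching the
                # json.dumps(items, ...) usage in the demo.
                writer.writerows(items)

This would be called in place of the final `async for items in stream` loop,
e.g. `await save_stream_to_csv(stream, columns)`.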