diff --git a/npiai/tools/web/scraper/__test__/bardeen.py b/npiai/tools/web/scraper/__test__/bardeen.py
index 9a61119c..2c1e9f87 100644
--- a/npiai/tools/web/scraper/__test__/bardeen.py
+++ b/npiai/tools/web/scraper/__test__/bardeen.py
@@ -10,7 +10,24 @@ async def main():
             url="https://www.bardeen.ai/playbooks",
             ancestor_selector=".playbook_list",
             items_selector=".playbook_list .playbook_item-link",
-            output_columns=["Apps Name", "Description", "Category", "Time Saved"],
+            output_columns=[
+                {
+                    "name": "Apps Name",
+                    "description": "The name of the app",
+                },
+                {
+                    "name": "Description",
+                    "description": "The description of the app",
+                },
+                {
+                    "name": "Category",
+                    "description": "The category of the app",
+                },
+                {
+                    "name": "Time Saved",
+                    "description": "The time saved by using the app",
+                },
+            ],
             limit=42,
         )
 
diff --git a/npiai/tools/web/scraper/__test__/column_inference.py b/npiai/tools/web/scraper/__test__/column_inference.py
index ede2527f..aeee140b 100644
--- a/npiai/tools/web/scraper/__test__/column_inference.py
+++ b/npiai/tools/web/scraper/__test__/column_inference.py
@@ -1,24 +1,30 @@
 import asyncio
+import json
+
 from npiai.tools.web.scraper import Scraper
 from npiai.utils.test_utils import DebugContext
 
+url = "https://www.bardeen.ai/playbooks"
+ancestor_selector = ".playbook_list"
+items_selector = ".playbook_list .playbook_item-link"
+
 
 async def main():
     async with Scraper(headless=False, batch_size=10) as scraper:
         columns = await scraper.infer_columns(
             ctx=DebugContext(),
-            url="https://www.bardeen.ai/playbooks",
-            ancestor_selector=".playbook_list",
-            items_selector=".playbook_list .playbook_item-link",
+            url=url,
+            ancestor_selector=ancestor_selector,
+            items_selector=items_selector,
         )
 
-        print("Inferred columns:", columns)
+        print("Inferred columns:", json.dumps(columns, indent=2))
 
         await scraper.summarize(
             ctx=DebugContext(),
-            url="https://www.bardeen.ai/playbooks",
-            ancestor_selector=".playbook_list",
-            items_selector=".playbook_list .playbook_item-link",
+            url=url,
+            ancestor_selector=ancestor_selector,
+            items_selector=items_selector,
             output_columns=columns,
             limit=10,
         )
diff --git a/npiai/tools/web/scraper/app.py b/npiai/tools/web/scraper/app.py
index 074a9511..8d301fa4 100644
--- a/npiai/tools/web/scraper/app.py
+++ b/npiai/tools/web/scraper/app.py
@@ -2,6 +2,7 @@
 import re
 import json
 from typing import List, Dict
+from typing_extensions import TypedDict
 from textwrap import dedent
 
 from markdownify import MarkdownConverter
@@ -15,6 +16,11 @@
 from npiai.utils import is_cloud_env, llm_tool_call
 
 
+class Column(TypedDict):
+    name: str
+    description: str | None
+
+
 class NonBase64ImageConverter(MarkdownConverter):
     def convert_img(self, el, text, convert_as_inline):
         src = el.attrs.get("src", "")
@@ -72,7 +78,7 @@ async def summarize(
         self,
         ctx: Context,
         url: str,
-        output_columns: List[str],
+        output_columns: List[Column],
         ancestor_selector: str | None = None,
         items_selector: str | None = None,
         pagination_button_selector: str | None = None,
@@ -157,7 +163,7 @@ async def infer_columns(
         url: str,
         ancestor_selector: str | None,
         items_selector: str | None,
-    ) -> List[str]:
+    ) -> List[Column] | None:
        """
        Infer the columns of the output table by finding the common nature of the
        items to summarize.
@@ -180,12 +186,15 @@
             limit=10,
         )
 
-        def callback(columns: List[str]):
+        if not md:
+            return None
+
+        def callback(columns: List[Column]):
             """
             Callback with the inferred columns.
 
             Args:
-                columns: The inferred columns.
+                columns: The inferred columns. Each column is a dictionary with 'name' and 'description' keys, where 'description' is optional.
             """
             return columns
 
@@ -197,7 +206,7 @@ def callback(columns: List[str]):
                 role="system",
                 content=dedent(
                     """
-                    Imagine you are summarizing the content of a webpage into a table. Find the common nature of the provided items and suggest the columns for the output table. Respond with the columns in a list format: ['column1', 'column2', ...]
+                    Imagine you are summarizing the content of a webpage into a table. Find the common nature of the provided items and suggest the columns for the output table.
                     """
                 ),
             ),
@@ -324,15 +333,27 @@ async def _llm_summarize(
         self,
         ctx: Context,
         md: str,
-        output_columns: List[str],
+        output_columns: List[Column],
     ) -> List[Dict[str, str]]:
+        column_defs = ""
+
+        for column in output_columns:
+            column_defs += (
+                f"{column['name']}: {column['description'] or 'No description'}\n"
+            )
+
         messages = [
             ChatCompletionSystemMessageParam(
                 role="system",
                 content=dedent(
                     f"""
                     You are a web scraper agent helping user summarize the content of a webpage into a table.
-                    For the given markdown content, summarize the content into a table with the following columns: {json.dumps(output_columns, ensure_ascii=False)}.
+                    For the given markdown content, summarize the content into a table with the following columns:
+
+                    # Column Definitions
+                    {column_defs}
+
+                    # Response Format
                     Respond with the table in CSV format.
                     """
                 ),