refactor(scraper): support structured column definitions
idiotWu committed Oct 16, 2024
1 parent 9759093 commit f51070f
Showing 3 changed files with 64 additions and 15 deletions.
20 changes: 19 additions & 1 deletion npiai/tools/web/scraper/__test__/bardeen.py
@@ -10,7 +10,25 @@ async def main():
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
output_columns=["Apps Name", "Description", "Category", "Time Saved"],
output_file=".cache/bardeen.json",
output_columns=[
{
"name": "Apps Name",
"description": "The name of the app",
},
{
"name": "Description",
"description": "The description of the app",
},
{
"name": "Category",
"description": "The category of the app",
},
{
"name": "Time Saved",
"description": "The time saved by using the app",
},
],
limit=42,
)
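The structured entries above conform to the new `Column` TypedDict introduced in `app.py` (see the third file below). As a minimal sketch of the relationship between the old and new formats, a hypothetical helper like `as_columns` — illustrative only, not part of this commit — could wrap the legacy list of strings into the structured shape:

```python
from typing import List

from typing_extensions import TypedDict, Annotated


class Column(TypedDict):
    name: Annotated[str, "Name of the column"]
    description: Annotated[str | None, "Brief description of the column"]


def as_columns(names: List[str]) -> List[Column]:
    # Hypothetical helper (not part of this commit): wrap each legacy
    # column name in the structured format with no description.
    return [{"name": name, "description": None} for name in names]


# as_columns(["Apps Name", "Category"]) ==
# [{"name": "Apps Name", "description": None},
#  {"name": "Category", "description": None}]
```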

21 changes: 14 additions & 7 deletions npiai/tools/web/scraper/__test__/column_inference.py
@@ -1,25 +1,32 @@
import asyncio
import json

from npiai.tools.web.scraper import Scraper
from npiai.utils.test_utils import DebugContext

url = "https://www.bardeen.ai/playbooks"
ancestor_selector = ".playbook_list"
items_selector = ".playbook_list .playbook_item-link"


async def main():
async with Scraper(headless=False, batch_size=10) as scraper:
columns = await scraper.infer_columns(
ctx=DebugContext(),
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
url=url,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
)

print("Inferred columns:", columns)
print("Inferred columns:", json.dumps(columns, indent=2))

await scraper.summarize(
ctx=DebugContext(),
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
url=url,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
output_columns=columns,
output_file=".cache/bardeen.json",
limit=10,
)
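With structured columns, `json.dumps(columns, indent=2)` now prints full column definitions rather than bare names. An illustrative result for the Bardeen playbooks page — hypothetical, assuming the LLM infers the same columns hard-coded in `bardeen.py`:

```python
# Hypothetical inference result -- the actual output depends on the LLM run:
columns = [
    {"name": "Apps Name", "description": "The name of the app"},
    {"name": "Description", "description": "The description of the app"},
    {"name": "Category", "description": "The category of the app"},
    {"name": "Time Saved", "description": "The time saved by using the app"},
]
```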

38 changes: 31 additions & 7 deletions npiai/tools/web/scraper/app.py
@@ -1,7 +1,9 @@
import csv
import re
import json
import os
from typing import List, Dict
from typing_extensions import TypedDict, Annotated
from textwrap import dedent

from markdownify import MarkdownConverter
@@ -15,6 +17,11 @@
from npiai.utils import is_cloud_env, llm_tool_call


class Column(TypedDict):
name: Annotated[str, "Name of the column"]
description: Annotated[str | None, "Brief description of the column"]


class NonBase64ImageConverter(MarkdownConverter):
def convert_img(self, el, text, convert_as_inline):
src = el.attrs.get("src", "")
@@ -72,7 +79,7 @@ async def summarize(
self,
ctx: Context,
url: str,
output_columns: List[str],
output_columns: List[Column],
ancestor_selector: str | None = None,
items_selector: str | None = None,
pagination_button_selector: str | None = None,
@@ -145,6 +152,8 @@ async def summarize(

final_results = results[:limit] if limit != -1 else results

os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, "w") as f:
f.write(json.dumps(final_results, indent=4, ensure_ascii=False))

@@ -157,7 +166,7 @@ async def infer_columns(
url: str,
ancestor_selector: str | None,
items_selector: str | None,
) -> List[str]:
) -> List[Column] | None:
"""
Infer the columns of the output table by finding the common nature of the items to summarize.
@@ -180,12 +189,15 @@
limit=10,
)

def callback(columns: List[str]):
if not md:
return None

def callback(columns: List[Column]):
"""
Callback with the inferred columns.
Args:
columns: The inferred columns.
columns: The inferred columns. Each column is a dictionary with 'name' and 'description' keys, where 'description' is optional.
"""
return columns

@@ -197,7 +209,7 @@ def callback(columns: List[str]):
role="system",
content=dedent(
"""
Imagine you are summarizing the content of a webpage into a table. Find the common nature of the provided items and suggest the columns for the output table. Respond with the columns in a list format: ['column1', 'column2', ...]
Imagine you are summarizing the content of a webpage into a table. Find the common nature of the provided items and suggest the columns for the output table.
"""
),
),
@@ -324,15 +336,27 @@ async def _llm_summarize(
self,
ctx: Context,
md: str,
output_columns: List[str],
output_columns: List[Column],
) -> List[Dict[str, str]]:
column_defs = ""

for column in output_columns:
column_defs += (
f"{column['name']}: {column['description'] or 'No description'}\n"
)

messages = [
ChatCompletionSystemMessageParam(
role="system",
content=dedent(
f"""
You are a web scraper agent helping the user summarize the content of a webpage into a table.
For the given markdown content, summarize the content into a table with the following columns: {json.dumps(output_columns, ensure_ascii=False)}.
For the given markdown content, summarize the content into a table with the following columns:
# Column Definitions
{column_defs}
# Response Format
Respond with the table in CSV format.
"""
),
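To see what `_llm_summarize` now feeds the model, here is a minimal standalone sketch of the `column_defs` string built from structured columns, using values from the `bardeen.py` test above (the `None` description exercises the `or 'No description'` fallback):

```python
from typing import List

from typing_extensions import TypedDict, Annotated


class Column(TypedDict):
    name: Annotated[str, "Name of the column"]
    description: Annotated[str | None, "Brief description of the column"]


output_columns: List[Column] = [
    {"name": "Apps Name", "description": "The name of the app"},
    {"name": "Time Saved", "description": None},  # exercises the fallback
]

column_defs = ""
for column in output_columns:
    # Same loop as in _llm_summarize: fall back when no description is given.
    column_defs += f"{column['name']}: {column['description'] or 'No description'}\n"

print(column_defs)
# Apps Name: The name of the app
# Time Saved: No description
```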
