Skip to content

Commit

Permalink
refactor(scraper): support structured column definitions
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Oct 16, 2024
1 parent 9759093 commit f10bb2c
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 15 deletions.
19 changes: 18 additions & 1 deletion npiai/tools/web/scraper/__test__/bardeen.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,24 @@ async def main():
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
output_columns=["Apps Name", "Description", "Category", "Time Saved"],
output_columns=[
{
"name": "Apps Name",
"description": "The name of the app",
},
{
"name": "Description",
"description": "The description of the app",
},
{
"name": "Category",
"description": "The category of the app",
},
{
"name": "Time Saved",
"description": "The time saved by using the app",
},
],
limit=42,
)

Expand Down
20 changes: 13 additions & 7 deletions npiai/tools/web/scraper/__test__/column_inference.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
import asyncio
import json

from npiai.tools.web.scraper import Scraper
from npiai.utils.test_utils import DebugContext

url = "https://www.bardeen.ai/playbooks"
ancestor_selector = ".playbook_list"
items_selector = ".playbook_list .playbook_item-link"


async def main():
async with Scraper(headless=False, batch_size=10) as scraper:
columns = await scraper.infer_columns(
ctx=DebugContext(),
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
url=url,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
)

print("Inferred columns:", columns)
print("Inferred columns:", json.dumps(columns, indent=2))

await scraper.summarize(
ctx=DebugContext(),
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
url=url,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
output_columns=columns,
limit=10,
)
Expand Down
35 changes: 28 additions & 7 deletions npiai/tools/web/scraper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import json
from typing import List, Dict
from typing_extensions import TypedDict
from textwrap import dedent

from markdownify import MarkdownConverter
Expand All @@ -15,6 +16,11 @@
from npiai.utils import is_cloud_env, llm_tool_call


class Column(TypedDict):
    """A structured definition of one output-table column.

    Passed to ``summarize`` via ``output_columns`` and returned by
    ``infer_columns``.
    """

    # Column header used in the summarized table.
    name: str
    # Human-readable description of the column's content; may be None,
    # in which case a "No description" placeholder is substituted when
    # building the LLM prompt (see ``_llm_summarize``).
    description: str | None


class NonBase64ImageConverter(MarkdownConverter):
def convert_img(self, el, text, convert_as_inline):
src = el.attrs.get("src", "")
Expand Down Expand Up @@ -72,7 +78,7 @@ async def summarize(
self,
ctx: Context,
url: str,
output_columns: List[str],
output_columns: List[Column],
ancestor_selector: str | None = None,
items_selector: str | None = None,
pagination_button_selector: str | None = None,
Expand Down Expand Up @@ -157,7 +163,7 @@ async def infer_columns(
url: str,
ancestor_selector: str | None,
items_selector: str | None,
) -> List[str]:
) -> List[Column] | None:
"""
Infer the columns of the output table by finding the common nature of the items to summarize.
Expand All @@ -180,12 +186,15 @@ async def infer_columns(
limit=10,
)

def callback(columns: List[str]):
if not md:
return None

def callback(columns: List[Column]):
"""
Callback with the inferred columns.
Args:
columns: The inferred columns.
columns: The inferred columns. Each column is a dictionary with 'name' and 'description' keys, where 'description' is optional.
"""
return columns

Expand All @@ -197,7 +206,7 @@ def callback(columns: List[str]):
role="system",
content=dedent(
"""
Imagine you are summarizing the content of a webpage into a table. Find the common nature of the provided items and suggest the columns for the output table. Respond with the columns in a list format: ['column1', 'column2', ...]
Imagine you are summarizing the content of a webpage into a table. Find the common nature of the provided items and suggest the columns for the output table.
"""
),
),
Expand Down Expand Up @@ -324,15 +333,27 @@ async def _llm_summarize(
self,
ctx: Context,
md: str,
output_columns: List[str],
output_columns: List[Column],
) -> List[Dict[str, str]]:
column_defs = ""

for column in output_columns:
column_defs += (
f"{column['name']}: {column['description'] or 'No description'}\n"
)

messages = [
ChatCompletionSystemMessageParam(
role="system",
content=dedent(
f"""
You are a web scraper agent helping user summarize the content of a webpage into a table.
For the given markdown content, summarize the content into a table with the following columns: {json.dumps(output_columns, ensure_ascii=False)}.
For the given markdown content, summarize the content into a table with the following columns:
# Column Definitions
{column_defs}
# Response Format
Respond with the table in CSV format.
"""
),
Expand Down

0 comments on commit f10bb2c

Please sign in to comment.