refactor(scraper): support structured column definitions
idiotWu committed Oct 16, 2024
1 parent 9759093 commit f51070f
Showing 3 changed files with 64 additions and 15 deletions.
20 changes: 19 additions & 1 deletion npiai/tools/web/scraper/__test__/bardeen.py
@@ -10,7 +10,25 @@ async def main():
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
output_columns=["Apps Name", "Description", "Category", "Time Saved"],
output_file=".cache/bardeen.json",
output_columns=[
{
"name": "Apps Name",
"description": "The name of the app",
},
{
"name": "Description",
"description": "The description of the app",
},
{
"name": "Category",
"description": "The category of the app",
},
{
"name": "Time Saved",
"description": "The time saved by using the app",
},
],
limit=42,
)
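The structured entries above conform to the new `Column` TypedDict introduced in `app.py` (see the third file below). As a minimal sketch of the relationship between the old and new formats, a hypothetical helper like `as_columns` — illustrative only, not part of this commit — could wrap the legacy list of strings into the structured shape:

```python
from typing import List

from typing_extensions import TypedDict, Annotated


class Column(TypedDict):
    name: Annotated[str, "Name of the column"]
    description: Annotated[str | None, "Brief description of the column"]


def as_columns(names: List[str]) -> List[Column]:
    # Hypothetical helper (not part of this commit): wrap each legacy
    # column name in the structured format with no description.
    return [{"name": name, "description": None} for name in names]


# as_columns(["Apps Name", "Category"]) ==
# [{"name": "Apps Name", "description": None},
#  {"name": "Category", "description": None}]
```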

21 changes: 14 additions & 7 deletions npiai/tools/web/scraper/__test__/column_inference.py
@@ -1,25 +1,32 @@
import asyncio
import json

from npiai.tools.web.scraper import Scraper
from npiai.utils.test_utils import DebugContext

url = "https://www.bardeen.ai/playbooks"
ancestor_selector = ".playbook_list"
items_selector = ".playbook_list .playbook_item-link"


async def main():
async with Scraper(headless=False, batch_size=10) as scraper:
columns = await scraper.infer_columns(
ctx=DebugContext(),
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
url=url,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
)

print("Inferred columns:", columns)
print("Inferred columns:", json.dumps(columns, indent=2))

await scraper.summarize(
ctx=DebugContext(),
url="https://www.bardeen.ai/playbooks",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item-link",
url=url,
ancestor_selector=ancestor_selector,
items_selector=items_selector,
output_columns=columns,
output_file=".cache/bardeen.json",
limit=10,
)
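With structured columns, `json.dumps(columns, indent=2)` now prints full column definitions rather than bare names. An illustrative result for the Bardeen playbooks page — hypothetical, assuming the LLM infers the same columns hard-coded in `bardeen.py`:

```python
# Hypothetical inference result -- the actual output depends on the LLM run:
columns = [
    {"name": "Apps Name", "description": "The name of the app"},
    {"name": "Description", "description": "The description of the app"},
    {"name": "Category", "description": "The category of the app"},
    {"name": "Time Saved", "description": "The time saved by using the app"},
]
```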

38 changes: 31 additions & 7 deletions npiai/tools/web/scraper/app.py
@@ -1,7 +1,9 @@
import csv
import re
import json
import os
from typing import List, Dict
from typing_extensions import TypedDict, Annotated
from textwrap import dedent

from markdownify import MarkdownConverter
@@ -15,6 +17,11 @@
from npiai.utils import is_cloud_env, llm_tool_call


class Column(TypedDict):
name: Annotated[str, "Name of the column"]
description: Annotated[str | None, "Brief description of the column"]


class NonBase64ImageConverter(MarkdownConverter):
def convert_img(self, el, text, convert_as_inline):
src = el.attrs.get("src", "")
@@ -72,7 +79,7 @@ async def summarize(
self,
ctx: Context,
url: str,
output_columns: List[str],
output_columns: List[Column],
ancestor_selector: str | None = None,
items_selector: str | None = None,
pagination_button_selector: str | None = None,
@@ -145,6 +152,8 @@ async def summarize(

final_results = results[:limit] if limit != -1 else results

os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, "w") as f:
f.write(json.dumps(final_results, indent=4, ensure_ascii=False))

@@ -157,7 +166,7 @@ async def infer_columns(
url: str,
ancestor_selector: str | None,
items_selector: str | None,
) -> List[str]:
) -> List[Column] | None:
"""
Infer the columns of the output table by finding the common nature of the items to summarize.
@@ -180,12 +189,15 @@
limit=10,
)

def callback(columns: List[str]):
if not md:
return None

def callback(columns: List[Column]):
"""
Callback with the inferred columns.
Args:
columns: The inferred columns.
columns: The inferred columns. Each column is a dictionary with 'name' and 'description' keys, where 'description' is optional.
"""
return columns

@@ -197,7 +209,7 @@ def callback(columns: List[str]):
role="system",
content=dedent(
"""
Imagine you are summarizing the content of a webpage into a table. Find the common nature of the provided items and suggest the columns for the output table. Respond with the columns in a list format: ['column1', 'column2', ...]
Imagine you are summarizing the content of a webpage into a table. Find the common nature of the provided items and suggest the columns for the output table.
"""
),
),
@@ -324,15 +336,27 @@ async def _llm_summarize(
self,
ctx: Context,
md: str,
output_columns: List[str],
output_columns: List[Column],
) -> List[Dict[str, str]]:
column_defs = ""

for column in output_columns:
column_defs += (
f"{column['name']}: {column['description'] or 'No description'}\n"
)

messages = [
ChatCompletionSystemMessageParam(
role="system",
content=dedent(
f"""
You are a web scraper agent helping the user summarize the content of a webpage into a table.
For the given markdown content, summarize the content into a table with the following columns: {json.dumps(output_columns, ensure_ascii=False)}.
For the given markdown content, summarize the content into a table with the following columns:
# Column Definitions
{column_defs}
# Response Format
Respond with the table in CSV format.
"""
),
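To see what `_llm_summarize` now feeds the model, here is a minimal standalone sketch of the `column_defs` string built from structured columns, using values from the `bardeen.py` test above (the `None` description exercises the `or 'No description'` fallback):

```python
from typing import List

from typing_extensions import TypedDict, Annotated


class Column(TypedDict):
    name: Annotated[str, "Name of the column"]
    description: Annotated[str | None, "Brief description of the column"]


output_columns: List[Column] = [
    {"name": "Apps Name", "description": "The name of the app"},
    {"name": "Time Saved", "description": None},  # exercises the fallback
]

column_defs = ""
for column in output_columns:
    # Same loop as in _llm_summarize: fall back when no description is given.
    column_defs += f"{column['name']}: {column['description'] or 'No description'}\n"

print(column_defs)
# Apps Name: The name of the app
# Time Saved: No description
```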
