Skip to content

Commit

Permalink
refactor(scraper): try calling navigator if no more items loaded after scrolling
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Oct 1, 2024
1 parent 5b2b620 commit 968ecbf
Showing 1 changed file with 36 additions and 24 deletions.
60 changes: 36 additions & 24 deletions npiai/tools/web/scraper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ def convert_img(self, el, text, convert_as_inline):
el.attrs["src"] = "<base64_image>"
return super().convert_img(el, text, convert_as_inline)

def convert_div(self, el, text, convert_as_inline):
if convert_as_inline or not text:
return text

return f"{text}\n\n"
# def convert_div(self, el, text, convert_as_inline):
# if convert_as_inline or not text:
# return text
#
# return f"{text}\n\n"


def html_to_markdown(html: str, **options) -> str:
Expand Down Expand Up @@ -128,7 +128,7 @@ async def get_md():
if limit != -1 and len(results) >= limit:
break

await self._load_more(ctx, ancestor_selector)
await self._load_more(ctx, ancestor_selector, items_selector)

final_results = results[:limit] if limit != -1 else results

Expand Down Expand Up @@ -291,52 +291,64 @@ async def _infer_columns(self, ctx: Context, md: str) -> List[str]:

return parse_json_response(content)

async def _load_more(self, ctx: Context, ancestor_selector: str):
async def _load_more(
self,
ctx: Context,
ancestor_selector: str,
items_selector: str | None,
):
# attach mutation observer to the ancestor element
await self.playwright.page.evaluate(
"""
() => {
([ancestor_selector, items_selector]) => {
window.addedNodes = [];
window.npiObserver = new MutationObserver((records) => {
for (const record of records) {
for (const addedNode of record.addedNodes) {
window.addedNodes.push(addedNode);
if (addedNode.nodeType === Node.ELEMENT_NODE &&
(addedNode.matches(items_selector) || addedNode.querySelector(items_selector))
) {
window.addedNodes.push(addedNode);
}
}
}
});
}
"""
)

await self.playwright.page.evaluate(
f"""
() => {{
window.npiObserver.observe(
document.querySelector("{ancestor_selector}"),
{{ childList: true, subtree: true }}
document.querySelector(ancestor_selector),
{ childList: true, subtree: true }
);
}}
"""
}
""",
[ancestor_selector, items_selector or "*"],
)

more_content_loaded = False

# check if the page is scrollable
# if so, scroll to load more items
if await self.is_scrollable():
locator = self.playwright.page.locator(ancestor_selector)
await locator.evaluate("el => el.scrollIntoView({block: 'end'})")
await ctx.send_debug_message(f"[{self.name}] Scrolled to load more items")
else:
await self.playwright.page.wait_for_timeout(3000)
more_content_loaded = await self.playwright.page.evaluate(
"() => !!window.addedNodes?.length"
)

if not more_content_loaded:
# otherwise, check if there is a pagination element
# if so, navigate to the next page using navigator
await self.back_to_top()
await self._navigator.chat(
ctx=ctx,
# TODO: optimize the instruction
instruction="Check if there is a pagination element on the webpage. If the element exists, navigate to the next page. If you can't see a pagination element, continue scrolling down while the page allows it, in an attempt to locate one. If there's no pagination element after exhaustive scrolling, stop and take no further action.",
)
# return to the top of the page to start over scraping
await self.back_to_top()
await ctx.send_debug_message(f"[{self.name}] Navigated to the next page")

await self.playwright.page.wait_for_timeout(3000)
await self.playwright.page.wait_for_timeout(3000)

# clear the mutation observer
await self.playwright.page.evaluate(
Expand Down

0 comments on commit 968ecbf

Please sign in to comment.