Skip to content

Commit

Permalink
refactor(scraper): wait for items to be attached
Browse files — browse the repository at this point in the history
  • Loading branch information
idiotWu committed Oct 18, 2024
1 parent cdc6cc3 commit eeef547
Showing 1 changed file with 16 additions and 0 deletions.
16 changes: 16 additions & 0 deletions npiai/tools/web/scraper/app.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import List, Dict, AsyncGenerator
from typing_extensions import TypedDict, Annotated
from textwrap import dedent
from playwright.async_api import TimeoutError

from litellm.types.completion import (
ChatCompletionSystemMessageParam,
Expand Down Expand Up @@ -269,6 +270,15 @@ async def _get_items_md(

unvisited_selector = items_selector + ":not([data-npi-visited])"

# wait for the first unvisited item to be attached to the DOM
try:
await self.playwright.page.locator(unvisited_selector).first.wait_for(
state="attached",
timeout=30_000,
)
except TimeoutError:
return None

htmls = await self.playwright.page.evaluate(
"""
([unvisited_selector, limit]) => {
Expand Down Expand Up @@ -326,6 +336,12 @@ async def _get_ancestor_md(
if htmls is None:
locator = self.playwright.page.locator(ancestor_selector)

# wait for the ancestor element to be attached to the DOM
try:
await locator.first.wait_for(state="attached", timeout=30_000)
except TimeoutError:
return None

if not await locator.count():
return None

Expand Down

0 comments on commit eeef547

Please sign in to comment.