Skip to content

Commit

Permalink
fix(scraper): use element handle to snapshot items
Browse files (browse the repository at this point in the history)
  • Loading branch information
idiotWu committed Dec 16, 2024
1 parent bdf5989 commit 2c58bda
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions npiai/tools/web/scraper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,8 +435,9 @@ async def _parse_items(
count = 0
marking_tasks = []

- for item_locator in await locator.all():
- html = await item_locator.evaluate("elem => elem.outerHTML")
+ # use element handles here to snapshot the items
+ for elem in await locator.element_handles():
+ html = await elem.evaluate("elem => elem.outerHTML")
markdown, md5 = self._html_to_md_and_hash(html)

if skip_item_hashes and md5 in skip_item_hashes:
Expand All @@ -445,7 +446,7 @@ async def _parse_items(
# mark the item as visited
marking_tasks.append(
asyncio.create_task(
- item_locator.evaluate(
+ elem.evaluate(
"elem => elem.setAttribute('data-npi-visited', 'true')"
)
)
Expand Down

0 comments on commit 2c58bda

Please sign in to comment.