Merge pull request #3 from zytedata/redefined-request-proba
redefine how request priorities are computed from request probabilities
kmike authored Oct 27, 2023
2 parents 2abab5c + b6a103f commit 755dbd8
Showing 3 changed files with 28 additions and 14 deletions.
tests/test_ecommerce.py: 16 changes (12 additions, 4 deletions)
@@ -60,15 +60,15 @@ def test_crawl():
 
     subcategories = {
         "subCategories": [
-            {"url": subcategory_urls[0]},
-            {"url": subcategory_urls[1]},
+            {"url": subcategory_urls[0], "metadata": {"probability": 0.95}},
+            {"url": subcategory_urls[1], "metadata": {"probability": 0.78}},
         ],
     }
     nextpage = {"nextPage": {"url": nextpage_url}}
     items = {
         "items": [
-            {"url": item_urls[0]},
-            {"url": item_urls[1]},
+            {"url": item_urls[0], "metadata": {"probability": 0.99}},
+            {"url": item_urls[1], "metadata": {"probability": 0.83}},
         ],
     }
 
@@ -86,8 +86,10 @@ def test_crawl():
     assert len(requests) == 2
     assert requests[0].url == subcategory_urls[0]
     assert requests[0].callback == spider.parse_navigation
+    assert requests[0].priority == 95
     assert requests[1].url == subcategory_urls[1]
     assert requests[1].callback == spider.parse_navigation
+    assert requests[1].priority == 78
 
     # subcategories + nextpage
     navigation = ProductNavigation.from_dict(
@@ -102,6 +104,7 @@ def test_crawl():
     urls = {request.url for request in requests}
     assert urls == {*subcategory_urls, nextpage_url}
     assert all(request.callback == spider.parse_navigation for request in requests)
+    assert [request.priority for request in requests] == [100, 95, 78]
 
     # subcategories + nextpage + items
     navigation = ProductNavigation.from_dict(
@@ -120,6 +123,7 @@ def test_crawl():
             assert request.callback == spider.parse_product
         else:
             assert request.callback == spider.parse_navigation
+    assert [request.priority for request in requests] == [199, 183, 100, 95, 78]
 
     # nextpage + items
     navigation = ProductNavigation.from_dict(
@@ -137,6 +141,7 @@ def test_crawl():
     assert requests[1].callback == spider.parse_product
     assert requests[2].url == nextpage_url
     assert requests[2].callback == spider.parse_navigation
+    assert [request.priority for request in requests] == [199, 183, 100]
 
     # subcategories + items
     navigation = ProductNavigation.from_dict(
@@ -156,6 +161,7 @@ def test_crawl():
     assert requests[2].callback == spider.parse_navigation
     assert requests[3].url == subcategory_urls[1]
     assert requests[3].callback == spider.parse_navigation
+    assert [request.priority for request in requests] == [199, 183, 95, 78]
 
     # nextpage
     navigation = ProductNavigation.from_dict(
@@ -168,6 +174,7 @@ def test_crawl():
     assert len(requests) == 1
     assert requests[0].url == nextpage_url
     assert requests[0].callback == spider.parse_navigation
+    assert [request.priority for request in requests] == [100]
 
     # items
     navigation = ProductNavigation.from_dict(
@@ -182,6 +189,7 @@ def test_crawl():
     assert requests[0].callback == spider.parse_product
     assert requests[1].url == item_urls[1]
     assert requests[1].callback == spider.parse_product
+    assert [request.priority for request in requests] == [199, 183]
 
     # Test parse_navigation() behavior on pagination_only crawl strategy.
     spider = EcommerceSpider(
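
The expected priority values follow directly from the redefined scheme (see base.py below): navigation requests are scored at int(100 * probability), the next-page request gets a flat 100, and item requests get int(100 * probability) + 100. A quick plain-Python check of the fixture values above, for illustration only:

    # Items: probability scaled into the 100-200 band.
    assert int(100 * 0.99) + 100 == 199
    assert int(100 * 0.83) + 100 == 183

    # Subcategories: probability scaled into the 0-100 band.
    assert int(100 * 0.95) == 95
    assert int(100 * 0.78) == 78

    # Hence [199, 183, 100, 95, 78]: items first, then the next page
    # (flat 100), then subcategory navigation.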
zyte_spider_templates/spiders/base.py: 24 changes (15 additions, 9 deletions)
@@ -1,11 +1,11 @@
 from importlib.metadata import version
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Union
 
 import scrapy
 from pydantic import BaseModel, Field
 from scrapy.crawler import Crawler
 from scrapy.utils.url import parse_url
-from zyte_common_items import Request
+from zyte_common_items import ProbabilityRequest, Request
 
 from zyte_spider_templates._geolocations import (
     GEOLOCATION_OPTIONS_WITH_CODE,
@@ -57,7 +57,7 @@ class BaseSpider(scrapy.Spider):
         "description": "Base template.",
     }
 
-    ITEM_REQUEST_PRIORITY: int = 10
+    _NEXT_PAGE_PRIORITY: int = 100
 
     @classmethod
     def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
@@ -86,18 +86,20 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
         return spider
 
     @staticmethod
-    def get_parse_navigation_request_priority(request: Request) -> int:
+    def get_parse_navigation_request_priority(
+        request: Union[ProbabilityRequest, Request]
+    ) -> int:
         if (
             not hasattr(request, "metadata")
             or not request.metadata
             or request.metadata.probability is None
         ):
             return 0
-        return int(10 * request.metadata.probability)
+        return int(100 * request.metadata.probability)
 
     def get_parse_navigation_request(
         self,
-        request: Request,
+        request: Union[ProbabilityRequest, Request],
         callback: Optional[Callable] = None,
         page_params: Optional[Dict[str, Any]] = None,
         priority: Optional[int] = None,
@@ -109,11 +111,15 @@ def get_parse_navigation_request(
             meta={"page_params": page_params or {}},
         )
 
-    def get_parse_product_request_priority(self, request: Request) -> int:
-        return self.ITEM_REQUEST_PRIORITY
+    def get_parse_product_request_priority(self, request: ProbabilityRequest) -> int:
+        # TODO: Simplify when https://github.com/zytedata/zyte-common-items/pull/64 is released
+        probability = 0
+        if metadata := getattr(request, "metadata", None):
+            probability = metadata.probability
+        return int(100 * probability) + self._NEXT_PAGE_PRIORITY
 
     def get_parse_product_request(
-        self, request: Request, callback: Optional[Callable] = None
+        self, request: ProbabilityRequest, callback: Optional[Callable] = None
     ) -> scrapy.Request:
         callback = callback or self.parse_product
         return request.to_scrapy(
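
Taken together, the two helpers place every request on a single 0-200 priority scale. Here is a minimal self-contained sketch of the redefined logic; the SimpleNamespace objects merely stand in for zyte-common-items' ProbabilityRequest and its metadata, so this is an illustration rather than the library API:

    from types import SimpleNamespace

    NEXT_PAGE_PRIORITY = 100  # mirrors BaseSpider._NEXT_PAGE_PRIORITY

    def navigation_priority(request) -> int:
        # Navigation: probability maps into [0, 100]; missing metadata scores 0.
        metadata = getattr(request, "metadata", None)
        if not metadata or metadata.probability is None:
            return 0
        return int(100 * metadata.probability)

    def product_priority(request) -> int:
        # Items: probability maps into [100, 200], so even a low-probability
        # item outranks any navigation request.
        probability = 0
        if metadata := getattr(request, "metadata", None):
            probability = metadata.probability
        return int(100 * probability) + NEXT_PAGE_PRIORITY

    # Stand-ins for ProbabilityRequest objects (hypothetical test data):
    subcategory = SimpleNamespace(metadata=SimpleNamespace(probability=0.78))
    item = SimpleNamespace(metadata=SimpleNamespace(probability=0.83))
    assert navigation_priority(subcategory) == 78
    assert product_priority(item) == 183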
zyte_spider_templates/spiders/ecommerce.py: 2 changes (1 addition, 1 deletion)
@@ -141,7 +141,7 @@ def parse_navigation(
         if navigation.nextPage:
             yield self.get_parse_navigation_request(
                 navigation.nextPage,
-                priority=self.ITEM_REQUEST_PRIORITY - 1,
+                priority=self._NEXT_PAGE_PRIORITY,
             )
 
         if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only:
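
Note that this is more than a rename. Under the old constants the next-page request ran at ITEM_REQUEST_PRIORITY - 1 = 9, one notch below every item request (a flat 10), and a navigation request scored at int(10 * probability) could tie it at probability 0.9 or above. On the new scale the constant anchors the whole scheme: item requests land in the 100-200 band, the next page at exactly 100, and probability-scaled navigation requests in the 0-100 band, so likely items are fetched first, pagination next, and uncertain subcategory links last.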
