Skip to content

Commit 755dbd8

Browse files
authored
Merge pull request #3 from zytedata/redefined-request-proba
redefine how request probabilities are computed
2 parents 2abab5c + b6a103f commit 755dbd8

File tree

3 files changed: 28 additions and 14 deletions.

tests/test_ecommerce.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,15 @@ def test_crawl():
6060

6161
subcategories = {
6262
"subCategories": [
63-
{"url": subcategory_urls[0]},
64-
{"url": subcategory_urls[1]},
63+
{"url": subcategory_urls[0], "metadata": {"probability": 0.95}},
64+
{"url": subcategory_urls[1], "metadata": {"probability": 0.78}},
6565
],
6666
}
6767
nextpage = {"nextPage": {"url": nextpage_url}}
6868
items = {
6969
"items": [
70-
{"url": item_urls[0]},
71-
{"url": item_urls[1]},
70+
{"url": item_urls[0], "metadata": {"probability": 0.99}},
71+
{"url": item_urls[1], "metadata": {"probability": 0.83}},
7272
],
7373
}
7474

@@ -86,8 +86,10 @@ def test_crawl():
8686
assert len(requests) == 2
8787
assert requests[0].url == subcategory_urls[0]
8888
assert requests[0].callback == spider.parse_navigation
89+
assert requests[0].priority == 95
8990
assert requests[1].url == subcategory_urls[1]
9091
assert requests[1].callback == spider.parse_navigation
92+
assert requests[1].priority == 78
9193

9294
# subcategories + nextpage
9395
navigation = ProductNavigation.from_dict(
@@ -102,6 +104,7 @@ def test_crawl():
102104
urls = {request.url for request in requests}
103105
assert urls == {*subcategory_urls, nextpage_url}
104106
assert all(request.callback == spider.parse_navigation for request in requests)
107+
assert [request.priority for request in requests] == [100, 95, 78]
105108

106109
# subcategories + nextpage + items
107110
navigation = ProductNavigation.from_dict(
@@ -120,6 +123,7 @@ def test_crawl():
120123
assert request.callback == spider.parse_product
121124
else:
122125
assert request.callback == spider.parse_navigation
126+
assert [request.priority for request in requests] == [199, 183, 100, 95, 78]
123127

124128
# nextpage + items
125129
navigation = ProductNavigation.from_dict(
@@ -137,6 +141,7 @@ def test_crawl():
137141
assert requests[1].callback == spider.parse_product
138142
assert requests[2].url == nextpage_url
139143
assert requests[2].callback == spider.parse_navigation
144+
assert [request.priority for request in requests] == [199, 183, 100]
140145

141146
# subcategories + items
142147
navigation = ProductNavigation.from_dict(
@@ -156,6 +161,7 @@ def test_crawl():
156161
assert requests[2].callback == spider.parse_navigation
157162
assert requests[3].url == subcategory_urls[1]
158163
assert requests[3].callback == spider.parse_navigation
164+
assert [request.priority for request in requests] == [199, 183, 95, 78]
159165

160166
# nextpage
161167
navigation = ProductNavigation.from_dict(
@@ -168,6 +174,7 @@ def test_crawl():
168174
assert len(requests) == 1
169175
assert requests[0].url == nextpage_url
170176
assert requests[0].callback == spider.parse_navigation
177+
assert [request.priority for request in requests] == [100]
171178

172179
# items
173180
navigation = ProductNavigation.from_dict(
@@ -182,6 +189,7 @@ def test_crawl():
182189
assert requests[0].callback == spider.parse_product
183190
assert requests[1].url == item_urls[1]
184191
assert requests[1].callback == spider.parse_product
192+
assert [request.priority for request in requests] == [199, 183]
185193

186194
# Test parse_navigation() behavior on pagination_only crawl strategy.
187195
spider = EcommerceSpider(

zyte_spider_templates/spiders/base.py

+15-9
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from importlib.metadata import version
2-
from typing import Any, Callable, Dict, Optional
2+
from typing import Any, Callable, Dict, Optional, Union
33

44
import scrapy
55
from pydantic import BaseModel, Field
66
from scrapy.crawler import Crawler
77
from scrapy.utils.url import parse_url
8-
from zyte_common_items import Request
8+
from zyte_common_items import ProbabilityRequest, Request
99

1010
from zyte_spider_templates._geolocations import (
1111
GEOLOCATION_OPTIONS_WITH_CODE,
@@ -57,7 +57,7 @@ class BaseSpider(scrapy.Spider):
5757
"description": "Base template.",
5858
}
5959

60-
ITEM_REQUEST_PRIORITY: int = 10
60+
_NEXT_PAGE_PRIORITY: int = 100
6161

6262
@classmethod
6363
def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
@@ -86,18 +86,20 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
8686
return spider
8787

8888
@staticmethod
def get_parse_navigation_request_priority(
    request: Union[ProbabilityRequest, Request]
) -> int:
    """Return the scheduling priority for a navigation request.

    The priority is the request's ``metadata.probability`` scaled by 100
    and truncated to an integer. Requests with no ``metadata`` attribute,
    with empty metadata, or with a ``None`` probability get priority 0.
    """
    metadata = getattr(request, "metadata", None)
    if not metadata:
        # Covers both a missing attribute and a falsy metadata object.
        return 0

    probability = metadata.probability
    if probability is None:
        return 0

    return int(100 * probability)
9799

98100
def get_parse_navigation_request(
99101
self,
100-
request: Request,
102+
request: Union[ProbabilityRequest, Request],
101103
callback: Optional[Callable] = None,
102104
page_params: Optional[Dict[str, Any]] = None,
103105
priority: Optional[int] = None,
@@ -109,11 +111,15 @@ def get_parse_navigation_request(
109111
meta={"page_params": page_params or {}},
110112
)
111113

def get_parse_product_request_priority(self, request: ProbabilityRequest) -> int:
    """Return the scheduling priority for a product (item) request.

    The priority is ``int(100 * probability)`` plus
    ``self._NEXT_PAGE_PRIORITY``. The probability is read from
    ``request.metadata.probability`` and is treated as 0 when the request
    has no ``metadata`` attribute, when the metadata is empty, or when the
    probability itself is ``None``.
    """
    # TODO: Simplify when https://github.com/zytedata/zyte-common-items/pull/64 is released
    probability = 0
    metadata = getattr(request, "metadata", None)
    # A metadata object may be present while its probability is unset;
    # multiplying None would raise TypeError, so fall back to 0 instead.
    if metadata and metadata.probability is not None:
        probability = metadata.probability
    return int(100 * probability) + self._NEXT_PAGE_PRIORITY
114120

115121
def get_parse_product_request(
116-
self, request: Request, callback: Optional[Callable] = None
122+
self, request: ProbabilityRequest, callback: Optional[Callable] = None
117123
) -> scrapy.Request:
118124
callback = callback or self.parse_product
119125
return request.to_scrapy(

zyte_spider_templates/spiders/ecommerce.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def parse_navigation(
141141
if navigation.nextPage:
142142
yield self.get_parse_navigation_request(
143143
navigation.nextPage,
144-
priority=self.ITEM_REQUEST_PRIORITY - 1,
144+
priority=self._NEXT_PAGE_PRIORITY,
145145
)
146146

147147
if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only:

0 commit comments

Comments
 (0)