diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index fbcf2ac..8b1c908 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.9.0
+current_version = 0.10.0
 commit = True
 tag = True
 tag_name = {new_version}
diff --git a/CHANGES.rst b/CHANGES.rst
index 019e11c..50ff9d1 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,6 +1,47 @@
 Changes
 =======
 
+0.10.0 (2024-11-22)
+-------------------
+
+* Dropped Python 3.8 support, added Python 3.13 support.
+
+* Increased the minimum required versions of some dependencies:
+
+  * ``pydantic``: ``2`` → ``2.1``
+
+  * ``scrapy-poet``: ``0.21.0`` → ``0.24.0``
+
+  * ``scrapy-spider-metadata``: ``0.1.2`` → ``0.2.0``
+
+  * ``scrapy-zyte-api[provider]``: ``0.16.0`` → ``0.23.0``
+
+  * ``zyte-common-items``: ``0.22.0`` → ``0.23.0``
+
+* Added :ref:`custom attributes ` support to the
+  :ref:`e-commerce spider template ` through its new
+  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_input`
+  and
+  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_method`
+  parameters.
+
+* The
+  :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams.max_pages`
+  parameter of the :ref:`Google Search spider template ` can no
+  longer be 0 or lower.
+
+* The :ref:`Google Search spider template ` now follows
+  pagination for the results of each query page by page, instead of sending a
+  request for every page in parallel. It stops once it reaches a page without
+  organic results.
+
+* Improved the description of
+  :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy`
+  values.
+
+* Fixed type hint issues related to Scrapy.
+
+
 0.9.0 (2024-09-17)
 ------------------
 
diff --git a/docs/conf.py b/docs/conf.py
index 51c9a45..0d0a14e 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -4,7 +4,7 @@
 project = "zyte-spider-templates"
 copyright = "2023, Zyte Group Ltd"
 author = "Zyte Group Ltd"
-release = "0.9.0"
+release = "0.10.0"
 
 sys.path.insert(0, str(Path(__file__).parent.absolute()))  # _ext
 extensions = [
@@ -54,6 +54,10 @@
         "https://web-poet.readthedocs.io/en/stable",
         None,
     ),
+    "zyte": (
+        "https://docs.zyte.com",
+        None,
+    ),
     "zyte-common-items": (
         "https://zyte-common-items.readthedocs.io/en/latest",
         None,
@@ -65,6 +69,7 @@
 autodoc_pydantic_model_show_json = False
 autodoc_pydantic_model_show_validator_members = False
 autodoc_pydantic_model_show_validator_summary = False
+autodoc_pydantic_field_list_validators = False
 
 # sphinx-reredirects
 redirects = {
diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index dd368dd..d623779 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -23,6 +23,14 @@ Pages
 Parameter mixins
 ================
 
+.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsInputParam
+   :exclude-members: model_computed_fields
+
+.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsMethodParam
+   :exclude-members: model_computed_fields
+
+.. autoenum:: zyte_spider_templates.params.CustomAttrsMethod
+
 .. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam
    :exclude-members: model_computed_fields
 
diff --git a/setup.py b/setup.py
index cead135..cf48be0 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="zyte-spider-templates",
-    version="0.9.0",
+    version="0.10.0",
     description="Spider templates for automatic crawlers.",
     long_description=open("README.rst").read(),
     long_description_content_type="text/x-rst",
diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
index f79bdf7..21d9ee4 100644
--- a/tests/test_ecommerce.py
+++ b/tests/test_ecommerce.py
@@ -12,7 +12,6 @@
     ProbabilityRequest,
     Product,
     ProductNavigation,
-    Request,
     SearchRequestTemplate,
     SearchRequestTemplateMetadata,
 )
@@ -405,7 +404,7 @@ def test_arguments():
         spider = EcommerceSpider.from_crawler(crawler, **kwargs, **base_kwargs)
         getter = getattr(crawler.settings, getter_name)
         assert getter(setting) == new_setting_value
-        assert spider.allowed_domains == ["example.com"]
+        assert spider.allowed_domains == ["example.com"]  # type: ignore[attr-defined]
 
 
 def test_metadata():
@@ -666,7 +665,7 @@ def test_get_subcategory_request():
     url = "https://example.com"
 
     # Normal request but with mostly empty values
-    request = Request(url)
+    request = ProbabilityRequest(url=url)
     spider = EcommerceSpider(url="https://example.com")
     parse_navigation = lambda _: None
     spider.parse_navigation = parse_navigation  # type: ignore
@@ -737,7 +736,7 @@ def test_get_nextpage_request():
     url = "https://example.com"
 
     # Minimal Args
-    request = Request(url)
+    request = ProbabilityRequest(url=url)
     spider = EcommerceSpider(url="https://example.com")
     parse_navigation = lambda _: None
     spider.parse_navigation = parse_navigation  # type: ignore
@@ -756,7 +755,7 @@ def test_get_parse_navigation_request():
     url = "https://example.com"
 
     # Minimal args
-    request = Request(url)
+    request = ProbabilityRequest(url=url)
     spider = EcommerceSpider(url="https://example.com")
     parse_navigation = lambda _: None
     spider.parse_navigation = parse_navigation  # type: ignore
@@ -781,7 +780,7 @@ def test_set_allowed_domains(url, allowed_domain):
 
     kwargs = {"url": url}
     spider = EcommerceSpider.from_crawler(crawler, **kwargs)
-    assert spider.allowed_domains == [allowed_domain]
+    assert spider.allowed_domains == [allowed_domain]  # type: ignore[attr-defined]
 
 
 def test_input_none():
diff --git a/tests/test_serp.py b/tests/test_serp.py
index 0bef96e..699fee5 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -1,6 +1,9 @@
 import pytest
 from pydantic import ValidationError
+from scrapy import Request
 from scrapy_spider_metadata import get_spider_metadata
+from scrapy_zyte_api.responses import ZyteAPITextResponse
+from w3lib.url import add_or_replace_parameter
 
 from zyte_spider_templates.spiders.serp import GoogleSearchSpider
 
@@ -312,3 +315,145 @@ def test_search_queries():
     assert len(requests) == 2
     assert requests[0].url == "https://www.google.com/search?q=foo+bar"
     assert requests[1].url == "https://www.google.com/search?q=baz"
+
+
+def test_pagination():
+    crawler = get_crawler()
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=3
+    )
+
+    def run_parse_serp(total_results, page=1):
+        url = "https://www.google.com/search?q=foo+bar"
+        if page > 1:
+            url = add_or_replace_parameter(url, "start", (page - 1) * 10)
+        response = ZyteAPITextResponse.from_api_response(
+            api_response={
+                "serp": {
+                    "organicResults": [
+                        {
+                            "description": "…",
+                            "name": "…",
+                            "url": f"https://example.com/{rank}",
+                            "rank": rank,
+                        }
+                        for rank in range(1, 11)
+                    ],
+                    "metadata": {
+                        "dateDownloaded": "2024-10-25T08:59:45Z",
+                        "displayedQuery": "foo bar",
+                        "searchedQuery": "foo bar",
+                        "totalOrganicResults": total_results,
+                    },
+                    "pageNumber": page,
+                    "url": url,
+                },
+                "url": url,
+            },
+        )
+        items = []
+        requests = []
+        for item_or_request in spider.parse_serp(response, page_number=page):
+            if isinstance(item_or_request, Request):
+                requests.append(item_or_request)
+            else:
+                items.append(item_or_request)
+        return items, requests
+
+    items, requests = run_parse_serp(
+        total_results=10,
+    )
+    assert len(items) == 1
+    assert len(requests) == 0
+
+    items, requests = run_parse_serp(
+        total_results=11,
+    )
+    assert len(items) == 1
+    assert len(requests) == 1
+    assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=10"
+    assert requests[0].cb_kwargs["page_number"] == 2
+
+    items, requests = run_parse_serp(
+        total_results=20,
+        page=2,
+    )
+    assert len(items) == 1
+    assert len(requests) == 0
+
+    items, requests = run_parse_serp(
+        total_results=21,
+        page=2,
+    )
+    assert len(items) == 1
+    assert len(requests) == 1
+    assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20"
+    assert requests[0].cb_kwargs["page_number"] == 3
+
+    # Do not go over max_pages
+    items, requests = run_parse_serp(
+        total_results=31,
+        page=3,
+    )
+    assert len(items) == 1
+    assert len(requests) == 0
+
+
+def test_get_serp_request():
+    crawler = get_crawler()
+    spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
+    url = "https://www.google.com/search?q=foo+bar"
+
+    request = spider.get_serp_request(url, page_number=42)
+    assert request.cb_kwargs["page_number"] == 42
+
+    # The page_number parameter is required.
+    with pytest.raises(TypeError):
+        spider.get_serp_request(url)  # type: ignore[call-arg]
+
+
+def test_parse_serp():
+    crawler = get_crawler()
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=43
+    )
+    url = "https://www.google.com/search?q=foo+bar"
+    response = ZyteAPITextResponse.from_api_response(
+        api_response={
+            "serp": {
+                "organicResults": [
+                    {
+                        "description": "…",
+                        "name": "…",
+                        "url": f"https://example.com/{rank}",
+                        "rank": rank,
+                    }
+                    for rank in range(1, 11)
+                ],
+                "metadata": {
+                    "dateDownloaded": "2024-10-25T08:59:45Z",
+                    "displayedQuery": "foo bar",
+                    "searchedQuery": "foo bar",
+                    "totalOrganicResults": 99999,
+                },
+                "pageNumber": 1,
+                "url": url,
+            },
+            "url": url,
+        },
+    )
+    items = []
+    requests = []
+    for item_or_request in spider.parse_serp(response, page_number=42):
+        if isinstance(item_or_request, Request):
+            requests.append(item_or_request)
+        else:
+            items.append(item_or_request)
+    assert len(items) == 1
+    assert len(requests) == 1
+    assert requests[0].url == add_or_replace_parameter(url, "start", "420")
+    assert requests[0].cb_kwargs["page_number"] == 43
+
+    # The page_number parameter is required.
+    with pytest.raises(TypeError):
+        spider.parse_serp(response)  # type: ignore[call-arg]
diff --git a/zyte_spider_templates/pages/product_navigation_heuristics.py b/zyte_spider_templates/pages/product_navigation_heuristics.py
index bd012ff..fd2a8ae 100644
--- a/zyte_spider_templates/pages/product_navigation_heuristics.py
+++ b/zyte_spider_templates/pages/product_navigation_heuristics.py
@@ -45,7 +45,7 @@ def _probably_category_links(self) -> List[ProbabilityRequest]:
         default_probability = 0.1
 
         link_extractor = LinkExtractor(
-            allow_domains=self.page_params.get("full_domain")
+            allow_domains=self.page_params.get("full_domain", [])
         )
         ignore_urls = set(self._urls_for_category())
 
diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py
index 3491b61..0500ac8 100644
--- a/zyte_spider_templates/params.py
+++ b/zyte_spider_templates/params.py
@@ -189,7 +189,7 @@ def validate_input_group(model):
 
 
 class UrlsFileParam(BaseModel):
-    urls_file: str = Field(**URLS_FILE_FIELD_KWARGS)  # type: ignore[misc, arg-type]
+    urls_file: str = Field(**URLS_FILE_FIELD_KWARGS)  # type: ignore[call-overload, misc, arg-type]
 
     @model_validator(mode="after")
     def input_group(self):
@@ -227,7 +227,7 @@ def parse_input_params(spider):
 
 
 class UrlParam(BaseModel):
-    url: str = Field(**URL_FIELD_KWARGS)  # type: ignore[misc, arg-type]
+    url: str = Field(**URL_FIELD_KWARGS)  # type: ignore[call-overload, misc, arg-type]
 
 
 URLS_FIELD_KWARGS = {
@@ -281,7 +281,7 @@ def input_group(self):
 
 
 class UrlsParam(BaseModel):
-    urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS)  # type: ignore[misc, arg-type]
+    urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS)  # type: ignore[call-overload, misc, arg-type]
 
     @model_validator(mode="after")
     def input_group(self):
diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py
index 02e2510..b4de089 100644
--- a/zyte_spider_templates/spiders/base.py
+++ b/zyte_spider_templates/spiders/base.py
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 from importlib.metadata import version
-from typing import Annotated, Any, Dict
+from typing import TYPE_CHECKING, Annotated, Any, Dict
 from warnings import warn
 
 import scrapy
@@ -19,6 +21,11 @@
     UrlsParam,
 )
 
+if TYPE_CHECKING:
+    # typing.Self requires Python 3.11
+    from typing_extensions import Self
+
+
 # Higher priority than command-line-defined settings (40).
 ARG_SETTING_PRIORITY: int = 50
 
@@ -55,7 +62,7 @@
 
 
 class BaseSpider(scrapy.Spider):
-    custom_settings: Dict[str, Any] = {
+    custom_settings: Dict[str, Any] = {  # type: ignore[assignment]
         "ZYTE_API_TRANSPARENT_MODE": True,
         "_ZYTE_API_USER_AGENT": f"zyte-spider-templates/{version('zyte-spider-templates')}",
     }
@@ -71,9 +78,13 @@ class BaseSpider(scrapy.Spider):
     _custom_attrs_dep = None
 
     @classmethod
-    def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
+    def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self:
         spider = super().from_crawler(crawler, *args, **kwargs)
 
+        # all subclasses of this need to also have Args as a subclass
+        # this may be possible to express in type hints instead
+        assert hasattr(spider, "args")
+
         if geolocation := getattr(spider.args, "geolocation", None):
             # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected
             # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index dff285f..586c364 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -1,9 +1,10 @@
+from __future__ import annotations
+
 from enum import Enum
-from typing import Any, Callable, Dict, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Union, cast
 
 import scrapy
 from pydantic import BaseModel, ConfigDict, Field, model_validator
-from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
@@ -38,6 +39,10 @@
     UrlsParam,
 )
 
+if TYPE_CHECKING:
+    # typing.Self requires Python 3.11
+    from typing_extensions import Self
+
 
 @document_enum
 class EcommerceCrawlStrategy(str, Enum):
@@ -198,7 +203,7 @@ class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
     }
 
     @classmethod
-    def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
+    def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self:
         spider = super(EcommerceSpider, cls).from_crawler(crawler, *args, **kwargs)
         parse_input_params(spider)
         spider._init_extract_from()
@@ -222,7 +227,7 @@ def get_start_request(self, url):
             if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item
             else self.parse_navigation
         )
-        meta = {
+        meta: Dict[str, Any] = {
             "crawling_logs": {
                 "page_type": "product"
                 if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item
@@ -252,13 +257,13 @@ def get_start_request(self, url):
                 f"Heuristics won't be used to crawl other pages which might have products."
             )
 
-        return Request(
+        return scrapy.Request(
             url=url,
             callback=callback,
             meta=meta,
         )
 
-    def start_requests(self) -> Iterable[Request]:
+    def start_requests(self) -> Iterable[scrapy.Request]:
         if self.args.search_queries:
             for url in self.start_urls:
                 meta: Dict[str, Any] = {
@@ -266,7 +271,7 @@ def start_requests(self) -> Iterable[Request]:
                 }
                 if self.args.extract_from == ExtractFrom.browserHtml:
                     meta["inject"] = [BrowserResponse]
-                yield Request(
+                yield scrapy.Request(
                     url=url,
                     callback=self.parse_search_request_template,
                     meta=meta,
@@ -280,7 +285,7 @@ def parse_search_request_template(
         response: DummyResponse,
         search_request_template: SearchRequestTemplate,
         dynamic: DynamicDeps,
-    ) -> Iterable[Request]:
+    ) -> Iterable[scrapy.Request]:
         probability = search_request_template.get_probability()
         if probability is not None and probability <= 0:
             return
@@ -294,7 +299,7 @@ def parse_search_request_template(
 
     def parse_navigation(
         self, response: DummyResponse, navigation: ProductNavigation
-    ) -> Iterable[Request]:
+    ) -> Iterable[scrapy.Request]:
         page_params = self._modify_page_params_for_heuristics(
             response.meta.get("page_params")
         )
@@ -310,7 +315,9 @@ def parse_navigation(
                     f"are no product links found in {navigation.url}"
                 )
             else:
-                yield self.get_nextpage_request(navigation.nextPage)
+                yield self.get_nextpage_request(
+                    cast(ProbabilityRequest, navigation.nextPage)
+                )
 
         if (
             self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only
@@ -336,6 +343,7 @@ def parse_product(
             else:
                 yield product
         else:
+            assert self.crawler.stats
             self.crawler.stats.inc_value("drop_item/product/low_probability")
             self.logger.info(
                 f"Ignoring item from {response.url} since its probability is "
@@ -343,9 +351,7 @@
             )
 
     @staticmethod
-    def get_parse_navigation_request_priority(
-        request: Union[ProbabilityRequest, Request]
-    ) -> int:
+    def get_parse_navigation_request_priority(request: ProbabilityRequest) -> int:
         if (
             not hasattr(request, "metadata")
             or not request.metadata
@@ -356,7 +362,7 @@
 
     def get_parse_navigation_request(
         self,
-        request: Union[ProbabilityRequest, Request],
+        request: ProbabilityRequest,
         callback: Optional[Callable] = None,
         page_params: Optional[Dict[str, Any]] = None,
         priority: Optional[int] = None,
@@ -379,7 +385,7 @@
 
     def get_subcategory_request(
         self,
-        request: Union[ProbabilityRequest, Request],
+        request: ProbabilityRequest,
         callback: Optional[Callable] = None,
         page_params: Optional[Dict[str, Any]] = None,
         priority: Optional[int] = None,
@@ -401,7 +407,7 @@
 
     def get_nextpage_request(
         self,
-        request: Union[ProbabilityRequest, Request],
+        request: ProbabilityRequest,
         callback: Optional[Callable] = None,
         page_params: Optional[Dict[str, Any]] = None,
     ):
@@ -420,7 +426,7 @@ def get_parse_product_request(
         priority = self.get_parse_product_request_priority(request)
 
         probability = request.get_probability()
-        meta = {
+        meta: Dict[str, Any] = {
             "crawling_logs": {
                 "name": request.name,
                 "probability": probability,
diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index b30750c..ed0d1e7 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -76,6 +76,7 @@ class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider):
     """
 
     name = "google_search"
+    _results_per_page = 10
 
     metadata: Dict[str, Any] = {
         **BaseSpider.metadata,
@@ -97,10 +98,13 @@ def update_settings(cls, settings: BaseSettings) -> None:
             priority="spider",
         )
 
-    def get_start_request(self, url):
+    def get_serp_request(self, url: str, *, page_number: int):
        return Request(
             url=url,
             callback=self.parse_serp,
+            cb_kwargs={
+                "page_number": page_number,
+            },
             meta={
                 "crawling_logs": {"page_type": "serp"},
                 "zyte_api": {
@@ -117,12 +121,15 @@ def start_requests(self) -> Iterable[Request]:
         url = f"https://www.{self.args.domain.value}/search"
         for search_query in search_queries:
             search_url = add_or_replace_parameter(url, "q", search_query)
-            for start in range(0, self.args.max_pages * 10, 10):
-                if start:
-                    search_url = add_or_replace_parameter(
-                        search_url, "start", str(start)
-                    )
-                yield self.get_start_request(search_url)
-
-    def parse_serp(self, response) -> Iterable[Serp]:
-        yield Serp.from_dict(response.raw_api_response["serp"])
+            yield self.get_serp_request(search_url, page_number=1)
+
+    def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
+        serp = Serp.from_dict(response.raw_api_response["serp"])
+
+        if page_number < self.args.max_pages:
+            next_start = page_number * self._results_per_page
+            if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
+                next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
+                yield self.get_serp_request(next_url, page_number=page_number + 1)
+
+        yield serp
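
Note on the new SERP pagination above: the follow-up-request condition in parse_serp() can be restated as a small standalone helper. This is only an illustrative sketch, not part of the package; the helper name next_serp_url and its flattened arguments are hypothetical, and it assumes the default of 10 organic results per page that _results_per_page encodes.

from typing import Optional

from w3lib.url import add_or_replace_parameter


def next_serp_url(
    url: str,
    page_number: int,
    has_organic_results: bool,
    total_organic_results: int,
    max_pages: int,
    results_per_page: int = 10,
) -> Optional[str]:
    # Stop at max_pages, on a page without organic results, or once the
    # reported total number of organic results has been covered.
    if page_number >= max_pages:
        return None
    next_start = page_number * results_per_page
    if not has_organic_results or total_organic_results <= next_start:
        return None
    return add_or_replace_parameter(url, "start", str(next_start))


# Page 1 of a query with 25 reported results yields a request for start=10,
# matching the expectations in test_pagination() above.
assert (
    next_serp_url("https://www.google.com/search?q=foo+bar", 1, True, 25, max_pages=3)
    == "https://www.google.com/search?q=foo+bar&start=10"
)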
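
Similarly, the changelog entry about max_pages (values of 0 or lower are now rejected) can be exercised directly against the parameters model. A minimal sketch, assuming GoogleSearchSpiderParams can be instantiated on its own with just a list of search queries, as the spider arguments used in the new tests suggest:

from pydantic import ValidationError

from zyte_spider_templates.spiders.serp import GoogleSearchSpiderParams

# Accepted: at least one results page.
GoogleSearchSpiderParams(search_queries=["foo bar"], max_pages=3)

# Rejected since 0.10.0: max_pages must be a positive number of pages.
try:
    GoogleSearchSpiderParams(search_queries=["foo bar"], max_pages=0)
except ValidationError as error:
    print(error)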