From ea78545410a14c36964ac7da9f492d89233cc86e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 25 Oct 2024 10:50:09 +0200
Subject: [PATCH 01/17] Follow next pages one by one

---
 zyte_spider_templates/spiders/serp.py | 37 ++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index b30750c..1e3a5c7 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -1,4 +1,5 @@
 from typing import Any, Dict, Iterable, List, Optional, Union
+from warnings import warn

 from pydantic import BaseModel, Field, field_validator
 from scrapy import Request
@@ -98,6 +99,17 @@ def update_settings(cls, settings: BaseSettings) -> None:
         )

     def get_start_request(self, url):
+        warn(
+            (
+                "zyte_spider_templates.spiders.serp.GoogleSearchSpider.get_start_request "
+                "is deprecated, use get_serp_request instead."
+            ),
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.get_serp_request(url)
+
+    def get_serp_request(self, url):
         return Request(
             url=url,
             callback=self.parse_serp,
@@ -117,12 +129,23 @@ def start_requests(self) -> Iterable[Request]:
         url = f"https://www.{self.args.domain.value}/search"
         for search_query in search_queries:
             search_url = add_or_replace_parameter(url, "q", search_query)
-            for start in range(0, self.args.max_pages * 10, 10):
-                if start:
-                    search_url = add_or_replace_parameter(
-                        search_url, "start", str(start)
-                    )
-                yield self.get_start_request(search_url)
+            yield self.get_serp_request(search_url)

     def parse_serp(self, response) -> Iterable[Serp]:
-        yield Serp.from_dict(response.raw_api_response["serp"])
+        serp = Serp.from_dict(response.raw_api_response["serp"])
+
+        if serp.organicResults is not None:
+            # NOTE: We could skip the next page as long as there are fewer than
+            # 10 results in the current page, but it seems Google can be
+            # unreliable when it comes to the number of organic results it
+            # returns, so this approach is safer, at the risk of 1 extra
+            # request.
+            #
+            # In the future, Serp should indicate the next page URL, and this
+            # should become a non-issue.
+            next_url = add_or_replace_parameter(
+                serp.url, "start", str(serp.pageNumber * 10)
+            )
+            yield self.get_serp_request(next_url)
+
+        yield serp

From ffe781e049fe7a54c672779fbe84db2e84c31703 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 25 Oct 2024 11:09:48 +0200
Subject: [PATCH 02/17] Do not follow up if serp.organicResults is []

---
 zyte_spider_templates/spiders/serp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index 1e3a5c7..e675e8d 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -134,7 +134,7 @@ def parse_serp(self, response) -> Iterable[Serp]:
         serp = Serp.from_dict(response.raw_api_response["serp"])

-        if serp.organicResults is not None:
+        if serp.organicResults:
             # NOTE: We could skip the next page as long as there are fewer than
             # 10 results in the current page, but it seems Google can be
             # unreliable when it comes to the number of organic results it

From a0ff1daacfae1ffb92c4d07fd1877e67a119c5ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 25 Oct 2024 11:24:00 +0200
Subject: [PATCH 03/17] Minimize pagination based on totalOrganicResults

---
 zyte_spider_templates/spiders/serp.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index e675e8d..801a918 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -77,6 +77,7 @@ class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider):
     """

     name = "google_search"
+    _results_per_page = 10

     metadata: Dict[str, Any] = {
         **BaseSpider.metadata,
@@ -134,18 +135,9 @@ def start_requests(self) -> Iterable[Request]:
     def parse_serp(self, response) -> Iterable[Serp]:
         serp = Serp.from_dict(response.raw_api_response["serp"])

-        if serp.organicResults:
-            # NOTE: We could skip the next page as long as there are fewer than
-            # 10 results in the current page, but it seems Google can be
-            # unreliable when it comes to the number of organic results it
-            # returns, so this approach is safer, at the risk of 1 extra
-            # request.
-            #
-            # In the future, Serp should indicate the next page URL, and this
-            # should become a non-issue.
-            next_url = add_or_replace_parameter(
-                serp.url, "start", str(serp.pageNumber * 10)
-            )
+        next_start = serp.pageNumber * self._results_per_page
+        if serp.organicResults and serp.metadata.totalOrganicResults <= next_start:
+            next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
             yield self.get_serp_request(next_url)

         yield serp

From 977cfb44e0f586bbacd30126972be4d36149b95c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 25 Oct 2024 11:26:13 +0200
Subject: [PATCH 04/17] Fix if

---
 zyte_spider_templates/spiders/serp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index 801a918..b1eb1f1 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -136,7 +136,7 @@ def parse_serp(self, response) -> Iterable[Serp]:
         serp = Serp.from_dict(response.raw_api_response["serp"])

         next_start = serp.pageNumber * self._results_per_page
-        if serp.organicResults and serp.metadata.totalOrganicResults <= next_start:
+        if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
             next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
             yield self.get_serp_request(next_url)

From 952afd7acbf00445e25297725de6fbbd9a4e4092 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 25 Oct 2024 11:55:52 +0200
Subject: [PATCH 05/17] Add a test

---
 tests/test_serp.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/tests/test_serp.py b/tests/test_serp.py
index 0bef96e..813eef2 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -1,6 +1,9 @@
 import pytest
 from pydantic import ValidationError
+from scrapy import Request
 from scrapy_spider_metadata import get_spider_metadata
+from scrapy_zyte_api.responses import ZyteAPITextResponse
+from w3lib.url import add_or_replace_parameter

 from zyte_spider_templates.spiders.serp import GoogleSearchSpider
@@ -312,3 +315,73 @@ def test_search_queries():
     assert len(requests) == 2
     assert requests[0].url == "https://www.google.com/search?q=foo+bar"
     assert requests[1].url == "https://www.google.com/search?q=baz"
+
+
+def test_pagination():
+    crawler = get_crawler()
+    spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
+
+    def run_parse_serp(total_results, page=1):
+        url = "https://www.google.com/search?q=foo+bar"
+        if page > 1:
+            url = add_or_replace_parameter(url, "start", (page - 1) * 10)
+        response = ZyteAPITextResponse.from_api_response(
+            api_response={
+                "serp": {
+                    "organicResults": [
+                        {
+                            "description": "…",
+                            "name": "…",
+                            "url": f"https://example.com/{rank}",
+                            "rank": rank,
+                        }
+                        for rank in range(1, 11)
+                    ],
+                    "metadata": {
+                        "dateDownloaded": "2024-10-25T08:59:45Z",
+                        "displayedQuery": "foo bar",
+                        "searchedQuery": "foo bar",
+                        "totalOrganicResults": total_results,
+                    },
+                    "pageNumber": page,
+                    "url": url,
+                },
+                "url": url,
+            },
+        )
+        items = []
+        requests = []
+        for item_or_request in spider.parse_serp(response):
+            if isinstance(item_or_request, Request):
+                requests.append(item_or_request)
+            else:
+                items.append(item_or_request)
+        return items, requests
+
+    items, requests = run_parse_serp(
+        total_results=10,
+    )
+    assert len(items) == 1
+    assert len(requests) == 0
+
+    items, requests = run_parse_serp(
+        total_results=11,
+    )
+    assert len(items) == 1
+    assert len(requests) == 1
+    assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=10"
+
+    items,
requests = run_parse_serp( + total_results=20, + page=2, + ) + assert len(items) == 1 + assert len(requests) == 0 + + items, requests = run_parse_serp( + total_results=21, + page=2, + ) + assert len(items) == 1 + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20" From d3593fa81f27737269032b5ab037138b24cf2a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 30 Oct 2024 10:42:46 +0100 Subject: [PATCH 06/17] GoogleSearchSpider: pass the page number around --- tests/test_serp.py | 2 +- zyte_spider_templates/spiders/serp.py | 33 ++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tests/test_serp.py b/tests/test_serp.py index 813eef2..be6a632 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -351,7 +351,7 @@ def run_parse_serp(total_results, page=1): ) items = [] requests = [] - for item_or_request in spider.parse_serp(response): + for item_or_request in spider.parse_serp(response, page_number=page): if isinstance(item_or_request, Request): requests.append(item_or_request) else: diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index b1eb1f1..2a1b8d7 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -12,6 +12,8 @@ from ._google_domains import GoogleDomain from .base import BaseSpider +_UNSET = object() + class SearchQueriesParam(BaseModel): search_queries: Optional[List[str]] = Field( @@ -110,10 +112,13 @@ def get_start_request(self, url): ) return self.get_serp_request(url) - def get_serp_request(self, url): + def get_serp_request(self, url, *, page_number=1): return Request( url=url, callback=self.parse_serp, + cb_kwargs={ + "page_number": page_number, + }, meta={ "crawling_logs": {"page_type": "serp"}, "zyte_api": { @@ -132,12 +137,28 @@ def start_requests(self) -> Iterable[Request]: search_url = add_or_replace_parameter(url, "q", search_query) yield self.get_serp_request(search_url) - def parse_serp(self, response) -> Iterable[Serp]: + def parse_serp(self, response, page_number=_UNSET) -> Iterable[Serp]: serp = Serp.from_dict(response.raw_api_response["serp"]) - next_start = serp.pageNumber * self._results_per_page - if serp.organicResults and serp.metadata.totalOrganicResults > next_start: - next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) - yield self.get_serp_request(next_url) + if page_number is not _UNSET: + next_start = page_number * self._results_per_page + if serp.organicResults and serp.metadata.totalOrganicResults > next_start: + next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) + yield self.get_serp_request(next_url, page_number=page_number + 1) + else: + warn( + ( + "zyte_spider_templates.spiders.serp.GoogleSearchSpider.parse_serp " + "now expects a page_number callback keyword argument " + "(cb_kwargs) indicating the page being parsed (e.g. 1 for " + "the 1st page), and will yield a request for the next " + "page if the next page could have additional results, " + "instead of expecting the source callback to take care of " + "pagination by yielding requests for multiple pages in " + "parallel." 
+ ), + DeprecationWarning, + stacklevel=2, + ) yield serp From 47500bf3bfb34ac12e74f3106c7c117c40a991d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 30 Oct 2024 11:00:37 +0100 Subject: [PATCH 07/17] Make page_number a required parameter, to prevent accidental use of 1 with the wrong URL --- zyte_spider_templates/spiders/serp.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 2a1b8d7..c3fecd6 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -5,7 +5,7 @@ from scrapy import Request from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings from scrapy_spider_metadata import Args -from w3lib.url import add_or_replace_parameter +from w3lib.url import add_or_replace_parameter, url_query_parameter from zyte_common_items import Serp from ..params import MaxRequestsParam @@ -110,9 +110,10 @@ def get_start_request(self, url): DeprecationWarning, stacklevel=2, ) - return self.get_serp_request(url) + page_number = int(url_query_parameter(url, "start", 0)) / 10 + 1 + return self.get_serp_request(url, page_number=page_number) - def get_serp_request(self, url, *, page_number=1): + def get_serp_request(self, url, *, page_number): return Request( url=url, callback=self.parse_serp, @@ -135,7 +136,7 @@ def start_requests(self) -> Iterable[Request]: url = f"https://www.{self.args.domain.value}/search" for search_query in search_queries: search_url = add_or_replace_parameter(url, "q", search_query) - yield self.get_serp_request(search_url) + yield self.get_serp_request(search_url, page_number=1) def parse_serp(self, response, page_number=_UNSET) -> Iterable[Serp]: serp = Serp.from_dict(response.raw_api_response["serp"]) From 22d2a3dd0e0fb6abfcbafa2228b509c5525ccf27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 30 Oct 2024 11:32:46 +0100 Subject: [PATCH 08/17] Improve test coverage --- tests/test_serp.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/tests/test_serp.py b/tests/test_serp.py index be6a632..7aa215b 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -370,6 +370,7 @@ def run_parse_serp(total_results, page=1): assert len(items) == 1 assert len(requests) == 1 assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=10" + assert requests[0].cb_kwargs["page_number"] == 2 items, requests = run_parse_serp( total_results=20, @@ -385,3 +386,91 @@ def run_parse_serp(total_results, page=1): assert len(items) == 1 assert len(requests) == 1 assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20" + assert requests[0].cb_kwargs["page_number"] == 3 + + +@pytest.mark.parametrize( + ("start", "page_number"), + ( + (None, 1), + (0, 1), + (10, 2), + (20, 3), + ), +) +def test_get_start_request(start, page_number): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") + url = "https://www.google.com/search?q=foo+bar" + if start is not None: + url = add_or_replace_parameter(url, "start", str(start)) + with pytest.deprecated_call(): + request = spider.get_start_request(url) + assert request.cb_kwargs["page_number"] == page_number + + +def test_get_serp_request(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") + url = "https://www.google.com/search?q=foo+bar" + + request = spider.get_serp_request(url, 
page_number=42) + assert request.cb_kwargs["page_number"] == 42 + + # The page_number parameter is required. + with pytest.raises(TypeError): + spider.get_serp_request(url) + + +def test_parse_serp(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") + url = "https://www.google.com/search?q=foo+bar" + response = ZyteAPITextResponse.from_api_response( + api_response={ + "serp": { + "organicResults": [ + { + "description": "…", + "name": "…", + "url": f"https://example.com/{rank}", + "rank": rank, + } + for rank in range(1, 11) + ], + "metadata": { + "dateDownloaded": "2024-10-25T08:59:45Z", + "displayedQuery": "foo bar", + "searchedQuery": "foo bar", + "totalOrganicResults": 99999, + }, + "pageNumber": 1, + "url": url, + }, + "url": url, + }, + ) + items = [] + requests = [] + for item_or_request in spider.parse_serp(response, page_number=42): + if isinstance(item_or_request, Request): + requests.append(item_or_request) + else: + items.append(item_or_request) + assert len(items) == 1 + assert len(requests) == 1 + assert requests[0].url == add_or_replace_parameter(url, "start", "420") + assert requests[0].cb_kwargs["page_number"] == 43 + + # If the page_number parameter is missing, a deprecation warning is logged, + # and no follow-up request is yielded. + requests = [] + items = [] + with pytest.deprecated_call(): + for item_or_request in spider.parse_serp(response): + if isinstance(item_or_request, Request): + requests.append(item_or_request) + else: + items.append(item_or_request) + assert len(items) == 1 + assert len(requests) == 0 From 05a5a474cd23eb06ce1d5bcc04599f476e0c4ba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 30 Oct 2024 11:40:17 +0100 Subject: [PATCH 09/17] Keep mypy happy --- zyte_spider_templates/spiders/serp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index c3fecd6..8a07f57 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -101,7 +101,7 @@ def update_settings(cls, settings: BaseSettings) -> None: priority="spider", ) - def get_start_request(self, url): + def get_start_request(self, url: str): warn( ( "zyte_spider_templates.spiders.serp.GoogleSearchSpider.get_start_request " @@ -110,10 +110,10 @@ def get_start_request(self, url): DeprecationWarning, stacklevel=2, ) - page_number = int(url_query_parameter(url, "start", 0)) / 10 + 1 + page_number = int(int(url_query_parameter(url, "start", "0")) / 10 + 1) return self.get_serp_request(url, page_number=page_number) - def get_serp_request(self, url, *, page_number): + def get_serp_request(self, url: str, *, page_number: int): return Request( url=url, callback=self.parse_serp, From f0f40cc208aebe97868b84153ae2e08e87b4d238 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 30 Oct 2024 12:54:40 +0100 Subject: [PATCH 10/17] Remove deprecations, break backward compatibility instead --- tests/test_serp.py | 35 ++-------------------- zyte_spider_templates/spiders/serp.py | 43 ++++----------------------- 2 files changed, 9 insertions(+), 69 deletions(-) diff --git a/tests/test_serp.py b/tests/test_serp.py index 7aa215b..92b19d2 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -389,26 +389,6 @@ def run_parse_serp(total_results, page=1): assert requests[0].cb_kwargs["page_number"] == 3 -@pytest.mark.parametrize( - ("start", "page_number"), - ( - (None, 1), - (0, 1), 
- (10, 2), - (20, 3), - ), -) -def test_get_start_request(start, page_number): - crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") - url = "https://www.google.com/search?q=foo+bar" - if start is not None: - url = add_or_replace_parameter(url, "start", str(start)) - with pytest.deprecated_call(): - request = spider.get_start_request(url) - assert request.cb_kwargs["page_number"] == page_number - - def test_get_serp_request(): crawler = get_crawler() spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") @@ -462,15 +442,6 @@ def test_parse_serp(): assert requests[0].url == add_or_replace_parameter(url, "start", "420") assert requests[0].cb_kwargs["page_number"] == 43 - # If the page_number parameter is missing, a deprecation warning is logged, - # and no follow-up request is yielded. - requests = [] - items = [] - with pytest.deprecated_call(): - for item_or_request in spider.parse_serp(response): - if isinstance(item_or_request, Request): - requests.append(item_or_request) - else: - items.append(item_or_request) - assert len(items) == 1 - assert len(requests) == 0 + # The page_number parameter is required. + with pytest.raises(TypeError): + spider.parse_serp(response) diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 8a07f57..d7d7dc9 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -1,19 +1,16 @@ from typing import Any, Dict, Iterable, List, Optional, Union -from warnings import warn from pydantic import BaseModel, Field, field_validator from scrapy import Request from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings from scrapy_spider_metadata import Args -from w3lib.url import add_or_replace_parameter, url_query_parameter +from w3lib.url import add_or_replace_parameter from zyte_common_items import Serp from ..params import MaxRequestsParam from ._google_domains import GoogleDomain from .base import BaseSpider -_UNSET = object() - class SearchQueriesParam(BaseModel): search_queries: Optional[List[str]] = Field( @@ -101,18 +98,6 @@ def update_settings(cls, settings: BaseSettings) -> None: priority="spider", ) - def get_start_request(self, url: str): - warn( - ( - "zyte_spider_templates.spiders.serp.GoogleSearchSpider.get_start_request " - "is deprecated, use get_serp_request instead." - ), - DeprecationWarning, - stacklevel=2, - ) - page_number = int(int(url_query_parameter(url, "start", "0")) / 10 + 1) - return self.get_serp_request(url, page_number=page_number) - def get_serp_request(self, url: str, *, page_number: int): return Request( url=url, @@ -138,28 +123,12 @@ def start_requests(self) -> Iterable[Request]: search_url = add_or_replace_parameter(url, "q", search_query) yield self.get_serp_request(search_url, page_number=1) - def parse_serp(self, response, page_number=_UNSET) -> Iterable[Serp]: + def parse_serp(self, response, page_number) -> Iterable[Serp]: serp = Serp.from_dict(response.raw_api_response["serp"]) - if page_number is not _UNSET: - next_start = page_number * self._results_per_page - if serp.organicResults and serp.metadata.totalOrganicResults > next_start: - next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) - yield self.get_serp_request(next_url, page_number=page_number + 1) - else: - warn( - ( - "zyte_spider_templates.spiders.serp.GoogleSearchSpider.parse_serp " - "now expects a page_number callback keyword argument " - "(cb_kwargs) indicating the page being parsed (e.g. 
1 for " - "the 1st page), and will yield a request for the next " - "page if the next page could have additional results, " - "instead of expecting the source callback to take care of " - "pagination by yielding requests for multiple pages in " - "parallel." - ), - DeprecationWarning, - stacklevel=2, - ) + next_start = page_number * self._results_per_page + if serp.organicResults and serp.metadata.totalOrganicResults > next_start: + next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) + yield self.get_serp_request(next_url, page_number=page_number + 1) yield serp From ce415fb6c34458decb3a3227d91cfa831503476a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 31 Oct 2024 09:32:45 +0100 Subject: [PATCH 11/17] Fix typing --- zyte_spider_templates/spiders/serp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index d7d7dc9..dea6922 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -123,7 +123,7 @@ def start_requests(self) -> Iterable[Request]: search_url = add_or_replace_parameter(url, "q", search_query) yield self.get_serp_request(search_url, page_number=1) - def parse_serp(self, response, page_number) -> Iterable[Serp]: + def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]: serp = Serp.from_dict(response.raw_api_response["serp"]) next_start = page_number * self._results_per_page From c3ad4d1fa5eff2d4554da3e7253cd14786a29ba2 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 19 Nov 2024 19:11:59 +0500 Subject: [PATCH 12/17] Fix typing issues with typed Scrapy. --- tests/test_ecommerce.py | 12 +++---- tests/test_serp.py | 4 +-- .../pages/product_navigation_heuristics.py | 2 +- zyte_spider_templates/spiders/base.py | 17 ++++++++-- zyte_spider_templates/spiders/ecommerce.py | 34 ++++++++++++------- 5 files changed, 44 insertions(+), 25 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 69d9466..ee0f271 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -7,7 +7,7 @@ from pydantic import ValidationError from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import get_spider_metadata -from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request +from zyte_common_items import ProbabilityRequest, Product, ProductNavigation from zyte_spider_templates._geolocations import ( GEOLOCATION_OPTIONS, @@ -357,7 +357,7 @@ def test_arguments(): spider = EcommerceSpider.from_crawler(crawler, **kwargs, **base_kwargs) getter = getattr(crawler.settings, getter_name) assert getter(setting) == new_setting_value - assert spider.allowed_domains == ["example.com"] + assert spider.allowed_domains == ["example.com"] # type: ignore[attr-defined] def test_metadata(): @@ -607,7 +607,7 @@ def test_get_subcategory_request(): url = "https://example.com" # Normal request but with mostly empty values - request = Request(url) + request = ProbabilityRequest(url=url) spider = EcommerceSpider(url="https://example.com") parse_navigation = lambda _: None spider.parse_navigation = parse_navigation # type: ignore @@ -678,7 +678,7 @@ def test_get_nextpage_request(): url = "https://example.com" # Minimal Args - request = Request(url) + request = ProbabilityRequest(url=url) spider = EcommerceSpider(url="https://example.com") parse_navigation = lambda _: None spider.parse_navigation = parse_navigation # type: ignore @@ 
-697,7 +697,7 @@ def test_get_parse_navigation_request(): url = "https://example.com" # Minimal args - request = Request(url) + request = ProbabilityRequest(url=url) spider = EcommerceSpider(url="https://example.com") parse_navigation = lambda _: None spider.parse_navigation = parse_navigation # type: ignore @@ -722,7 +722,7 @@ def test_set_allowed_domains(url, allowed_domain): kwargs = {"url": url} spider = EcommerceSpider.from_crawler(crawler, **kwargs) - assert spider.allowed_domains == [allowed_domain] + assert spider.allowed_domains == [allowed_domain] # type: ignore[attr-defined] def test_input_none(): diff --git a/tests/test_serp.py b/tests/test_serp.py index 92b19d2..3aabfe9 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -399,7 +399,7 @@ def test_get_serp_request(): # The page_number parameter is required. with pytest.raises(TypeError): - spider.get_serp_request(url) + spider.get_serp_request(url) # type: ignore[call-arg] def test_parse_serp(): @@ -444,4 +444,4 @@ def test_parse_serp(): # The page_number parameter is required. with pytest.raises(TypeError): - spider.parse_serp(response) + spider.parse_serp(response) # type: ignore[call-arg] diff --git a/zyte_spider_templates/pages/product_navigation_heuristics.py b/zyte_spider_templates/pages/product_navigation_heuristics.py index bd012ff..fd2a8ae 100644 --- a/zyte_spider_templates/pages/product_navigation_heuristics.py +++ b/zyte_spider_templates/pages/product_navigation_heuristics.py @@ -45,7 +45,7 @@ def _probably_category_links(self) -> List[ProbabilityRequest]: default_probability = 0.1 link_extractor = LinkExtractor( - allow_domains=self.page_params.get("full_domain") + allow_domains=self.page_params.get("full_domain", []) ) ignore_urls = set(self._urls_for_category()) diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py index e6e78f4..11e4acf 100644 --- a/zyte_spider_templates/spiders/base.py +++ b/zyte_spider_templates/spiders/base.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from importlib.metadata import version -from typing import Annotated, Any, Dict +from typing import TYPE_CHECKING, Annotated, Any, Dict from warnings import warn import scrapy @@ -18,6 +20,11 @@ UrlsParam, ) +if TYPE_CHECKING: + # typing.Self requires Python 3.11 + from typing_extensions import Self + + # Higher priority than command-line-defined settings (40). 
ARG_SETTING_PRIORITY: int = 50 @@ -52,7 +59,7 @@ def deprecated(self): class BaseSpider(scrapy.Spider): - custom_settings: Dict[str, Any] = { + custom_settings: Dict[str, Any] = { # type: ignore[assignment] "ZYTE_API_TRANSPARENT_MODE": True, "_ZYTE_API_USER_AGENT": f"zyte-spider-templates/{version('zyte-spider-templates')}", } @@ -68,9 +75,13 @@ class BaseSpider(scrapy.Spider): _custom_attrs_dep = None @classmethod - def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: + def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: spider = super().from_crawler(crawler, *args, **kwargs) + # all subclasses of this need to also have Args as a subclass + # this may be possible to express in type hints instead + assert hasattr(spider, "args") + if geolocation := getattr(spider.args, "geolocation", None): # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index 5e87266..ca41a63 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -1,9 +1,10 @@ +from __future__ import annotations + from enum import Enum -from typing import Any, Callable, Dict, Iterable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Union, cast import scrapy from pydantic import BaseModel, ConfigDict, Field -from scrapy import Request from scrapy.crawler import Crawler from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import Args @@ -35,6 +36,10 @@ UrlsParam, ) +if TYPE_CHECKING: + # typing.Self requires Python 3.11 + from typing_extensions import Self + @document_enum class EcommerceCrawlStrategy(str, Enum): @@ -180,7 +185,7 @@ class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider): } @classmethod - def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: + def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: spider = super(EcommerceSpider, cls).from_crawler(crawler, *args, **kwargs) parse_input_params(spider) spider._init_extract_from() @@ -204,7 +209,7 @@ def get_start_request(self, url): if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item else self.parse_navigation ) - meta = { + meta: Dict[str, Any] = { "crawling_logs": { "page_type": "product" if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item @@ -234,19 +239,19 @@ def get_start_request(self, url): f"Heuristics won't be used to crawl other pages which might have products." 
) - return Request( + return scrapy.Request( url=url, callback=callback, meta=meta, ) - def start_requests(self) -> Iterable[Request]: + def start_requests(self) -> Iterable[scrapy.Request]: for url in self.start_urls: yield self.get_start_request(url) def parse_navigation( self, response: DummyResponse, navigation: ProductNavigation - ) -> Iterable[Request]: + ) -> Iterable[scrapy.Request]: page_params = self._modify_page_params_for_heuristics( response.meta.get("page_params") ) @@ -262,7 +267,9 @@ def parse_navigation( f"are no product links found in {navigation.url}" ) else: - yield self.get_nextpage_request(navigation.nextPage) + yield self.get_nextpage_request( + cast(ProbabilityRequest, navigation.nextPage) + ) if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only: for request in navigation.subCategories or []: @@ -285,6 +292,7 @@ def parse_product( else: yield product else: + assert self.crawler.stats self.crawler.stats.inc_value("drop_item/product/low_probability") self.logger.info( f"Ignoring item from {response.url} since its probability is " @@ -293,7 +301,7 @@ def parse_product( @staticmethod def get_parse_navigation_request_priority( - request: Union[ProbabilityRequest, Request] + request: Union[ProbabilityRequest, scrapy.Request] ) -> int: if ( not hasattr(request, "metadata") @@ -305,7 +313,7 @@ def get_parse_navigation_request_priority( def get_parse_navigation_request( self, - request: Union[ProbabilityRequest, Request], + request: ProbabilityRequest, callback: Optional[Callable] = None, page_params: Optional[Dict[str, Any]] = None, priority: Optional[int] = None, @@ -328,7 +336,7 @@ def get_parse_navigation_request( def get_subcategory_request( self, - request: Union[ProbabilityRequest, Request], + request: ProbabilityRequest, callback: Optional[Callable] = None, page_params: Optional[Dict[str, Any]] = None, priority: Optional[int] = None, @@ -350,7 +358,7 @@ def get_subcategory_request( def get_nextpage_request( self, - request: Union[ProbabilityRequest, Request], + request: ProbabilityRequest, callback: Optional[Callable] = None, page_params: Optional[Dict[str, Any]] = None, ): @@ -369,7 +377,7 @@ def get_parse_product_request( priority = self.get_parse_product_request_priority(request) probability = request.get_probability() - meta = { + meta: Dict[str, Any] = { "crawling_logs": { "name": request.name, "probability": probability, From eaf6453524cf08cf4ee9ee9563fe864c1d0a8670 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 19 Nov 2024 19:14:14 +0500 Subject: [PATCH 13/17] Simplify the get_parse_navigation_request_priority type hint. 
--- zyte_spider_templates/spiders/ecommerce.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index ca41a63..db3a5b0 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -300,9 +300,7 @@ def parse_product( ) @staticmethod - def get_parse_navigation_request_priority( - request: Union[ProbabilityRequest, scrapy.Request] - ) -> int: + def get_parse_navigation_request_priority(request: ProbabilityRequest) -> int: if ( not hasattr(request, "metadata") or not request.metadata From 728a3a9483830f894fc810bbfd6d66873c186b93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 20 Nov 2024 11:13:38 +0100 Subject: [PATCH 14/17] Fix max_pages --- tests/test_serp.py | 16 ++++++++++++++-- zyte_spider_templates/spiders/serp.py | 9 +++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/tests/test_serp.py b/tests/test_serp.py index 3aabfe9..699fee5 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -319,7 +319,9 @@ def test_search_queries(): def test_pagination(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo bar", max_pages=3 + ) def run_parse_serp(total_results, page=1): url = "https://www.google.com/search?q=foo+bar" @@ -388,6 +390,14 @@ def run_parse_serp(total_results, page=1): assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20" assert requests[0].cb_kwargs["page_number"] == 3 + # Do not go over max_pages + items, requests = run_parse_serp( + total_results=31, + page=3, + ) + assert len(items) == 1 + assert len(requests) == 0 + def test_get_serp_request(): crawler = get_crawler() @@ -404,7 +414,9 @@ def test_get_serp_request(): def test_parse_serp(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo bar", max_pages=43 + ) url = "https://www.google.com/search?q=foo+bar" response = ZyteAPITextResponse.from_api_response( api_response={ diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index dea6922..ed0d1e7 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -126,9 +126,10 @@ def start_requests(self) -> Iterable[Request]: def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]: serp = Serp.from_dict(response.raw_api_response["serp"]) - next_start = page_number * self._results_per_page - if serp.organicResults and serp.metadata.totalOrganicResults > next_start: - next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) - yield self.get_serp_request(next_url, page_number=page_number + 1) + if page_number < self.args.max_pages: + next_start = page_number * self._results_per_page + if serp.organicResults and serp.metadata.totalOrganicResults > next_start: + next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) + yield self.get_serp_request(next_url, page_number=page_number + 1) yield serp From 3a7670a0c51b70792d4938d12761549542fc731a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 22 Nov 2024 11:30:57 +0100 Subject: [PATCH 15/17] Release notes for 0.10.0 (#86) --- CHANGES.rst | 41 +++++++++++++++++++++++++++++++++ docs/conf.py | 5 ++++ 
docs/reference/index.rst | 8 +++++++ zyte_spider_templates/params.py | 6 ++--- 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 019e11c..471a1cb 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,47 @@ Changes ======= +0.10.0 (2024-11-DD) +------------------- + +* Dropped Python 3.8 support, added Python 3.13 support. + +* Increased the minimum required versions of some dependencies: + + * ``pydantic``: ``2`` → ``2.1`` + + * ``scrapy-poet``: ``0.21.0`` → ``0.24.0`` + + * ``scrapy-spider-metadata``: ``0.1.2`` → ``0.2.0`` + + * ``scrapy-zyte-api[provider]``: ``0.16.0`` → ``0.23.0`` + + * ``zyte-common-items``: ``0.22.0`` → ``0.23.0`` + +* Added :ref:`custom attributes ` support to the + :ref:`e-commerce spider template ` through its new + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_input` + and + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_method` + parameters. + +* The + :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams.max_pages` + parameter of the :ref:`Google Search spider template ` can no + longer be 0 or lower. + +* The :ref:`Google Search spider template ` now follows + pagination for the results of each query page by page, instead of sending a + request for every page in parallel. It stops once it reaches a page without + organic results. + +* Improved the description of + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy` + values. + +* Fixed type hint issues related to Scrapy. + + 0.9.0 (2024-09-17) ------------------ diff --git a/docs/conf.py b/docs/conf.py index 406490e..a526125 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -46,6 +46,10 @@ "https://web-poet.readthedocs.io/en/stable", None, ), + "zyte": ( + "https://docs.zyte.com", + None, + ), "zyte-common-items": ( "https://zyte-common-items.readthedocs.io/en/latest", None, @@ -57,6 +61,7 @@ autodoc_pydantic_model_show_json = False autodoc_pydantic_model_show_validator_members = False autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_field_list_validators = False # sphinx-reredirects redirects = { diff --git a/docs/reference/index.rst b/docs/reference/index.rst index dd368dd..d623779 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -23,6 +23,14 @@ Pages Parameter mixins ================ +.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsInputParam + :exclude-members: model_computed_fields + +.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsMethodParam + :exclude-members: model_computed_fields + +.. autoenum:: zyte_spider_templates.params.CustomAttrsMethod + .. 
autopydantic_model:: zyte_spider_templates.params.ExtractFromParam :exclude-members: model_computed_fields diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index e74f3f8..0ef628a 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -155,7 +155,7 @@ def validate_input_group(model): class UrlsFileParam(BaseModel): - urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] + urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type] @model_validator(mode="after") def input_group(self): @@ -193,7 +193,7 @@ def parse_input_params(spider): class UrlParam(BaseModel): - url: str = Field(**URL_FIELD_KWARGS) # type: ignore[misc, arg-type] + url: str = Field(**URL_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type] URLS_FIELD_KWARGS = { @@ -247,7 +247,7 @@ def input_group(self): class UrlsParam(BaseModel): - urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] + urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type] @model_validator(mode="after") def input_group(self): From 83a90dd27973c0dd3251d2bd117b4f5b33b29708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 22 Nov 2024 11:32:02 +0100 Subject: [PATCH 16/17] Set the release day --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 471a1cb..50ff9d1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.10.0 (2024-11-DD) +0.10.0 (2024-11-22) ------------------- * Dropped Python 3.8 support, added Python 3.13 support. From 71a5f7112a315b5170df9ea63b40624b749da201 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 22 Nov 2024 11:32:09 +0100 Subject: [PATCH 17/17] =?UTF-8?q?Bump=20version:=200.9.0=20=E2=86=92=200.1?= =?UTF-8?q?0.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index fbcf2ac..8b1c908 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.9.0 +current_version = 0.10.0 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index a526125..569c89e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -4,7 +4,7 @@ project = "zyte-spider-templates" copyright = "2023, Zyte Group Ltd" author = "Zyte Group Ltd" -release = "0.9.0" +release = "0.10.0" sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext extensions = [ diff --git a/setup.py b/setup.py index 76788ff..444bea7 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="zyte-spider-templates", - version="0.9.0", + version="0.10.0", description="Spider templates for automatic crawlers.", long_description=open("README.rst").read(), long_description_content_type="text/x-rst",