Merge remote-tracking branch 'zytedata/main' into search
Gallaecio committed Nov 22, 2024
2 parents a5f3be4 + 71a5f71 commit a298fc1
Showing 12 changed files with 265 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.9.0
current_version = 0.10.0
commit = True
tag = True
tag_name = {new_version}
41 changes: 41 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,47 @@
Changes
=======

0.10.0 (2024-11-22)
-------------------

* Dropped Python 3.8 support, added Python 3.13 support.

* Increased the minimum required versions of some dependencies:

* ``pydantic``: ``2`` → ``2.1``

* ``scrapy-poet``: ``0.21.0`` → ``0.24.0``

* ``scrapy-spider-metadata``: ``0.1.2`` → ``0.2.0``

* ``scrapy-zyte-api[provider]``: ``0.16.0`` → ``0.23.0``

* ``zyte-common-items``: ``0.22.0`` → ``0.23.0``

* Added :ref:`custom attributes <custom-attributes>` support to the
:ref:`e-commerce spider template <e-commerce>` through its new
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_input`
and
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_method`
parameters.

* The
:class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams.max_pages`
parameter of the :ref:`Google Search spider template <google-search>` can no
longer be 0 or lower.

* The :ref:`Google Search spider template <google-search>` now follows
pagination for the results of each query page by page, instead of sending a
request for every page in parallel. It stops once it reaches a page without
organic results.

* Improved the description of
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy`
values.

* Fixed type hint issues related to Scrapy.
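
As an editorial illustration of the new custom-attribute parameters above (not part of this commit), here is a minimal sketch of how they might be passed as spider arguments. The JSON-schema payload, the string form of ``custom_attrs_input``, and the ``"generate"`` method name are assumptions based on Zyte API custom attribute extraction, not code from this repository:

    from scrapy.crawler import CrawlerProcess

    from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

    process = CrawlerProcess()
    process.crawl(
        EcommerceSpider,
        url="https://example.com",
        # Assumed shape: a JSON Schema-like description of the attributes to extract.
        custom_attrs_input='{"summary": {"type": "string", "description": "Short product summary"}}',
        # Assumed value: how the attributes are produced ("generate" or "extract").
        custom_attrs_method="generate",
    )
    process.start()

Actually running a sketch like this would also require the usual scrapy-zyte-api configuration (API key and add-on settings), which is omitted here.
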
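Similarly, a hedged sketch of the tightened ``max_pages`` validation; the exact field set accepted by ``GoogleSearchSpiderParams`` is assumed from the references above and the tests added in this commit:

    import pytest
    from pydantic import ValidationError

    from zyte_spider_templates.spiders.serp import GoogleSearchSpiderParams

    # max_pages must now be a positive integer; 0 or a negative value is rejected.
    with pytest.raises(ValidationError):
        GoogleSearchSpiderParams(search_queries="foo bar", max_pages=0)
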
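And for the new page-by-page SERP pagination, an illustrative helper (editorial, not the template's actual code) that captures the stop conditions exercised by the ``test_pagination`` test added below; the real spider carries the page number between callbacks via ``cb_kwargs["page_number"]``:

    from typing import Optional

    from w3lib.url import add_or_replace_parameter


    def next_page_url(
        base_url: str,
        page_number: int,
        total_organic_results: int,
        max_pages: int,
        results_per_page: int = 10,
    ) -> Optional[str]:
        """Return the URL of the next results page, or None when pagination stops."""
        # Stop at the configured page cap, or when the reported total indicates
        # there are no further results beyond the current page.
        if page_number >= max_pages:
            return None
        if total_organic_results <= page_number * results_per_page:
            return None
        # Google-style pagination via the "start" offset parameter.
        return add_or_replace_parameter(
            base_url, "start", str(page_number * results_per_page)
        )
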


0.9.0 (2024-09-17)
------------------

7 changes: 6 additions & 1 deletion docs/conf.py
@@ -4,7 +4,7 @@
project = "zyte-spider-templates"
copyright = "2023, Zyte Group Ltd"
author = "Zyte Group Ltd"
release = "0.9.0"
release = "0.10.0"

sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext
extensions = [
@@ -54,6 +54,10 @@
"https://web-poet.readthedocs.io/en/stable",
None,
),
"zyte": (
"https://docs.zyte.com",
None,
),
"zyte-common-items": (
"https://zyte-common-items.readthedocs.io/en/latest",
None,
@@ -65,6 +69,7 @@
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_field_list_validators = False

# sphinx-reredirects
redirects = {
8 changes: 8 additions & 0 deletions docs/reference/index.rst
@@ -23,6 +23,14 @@ Pages
Parameter mixins
================

.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsInputParam
:exclude-members: model_computed_fields

.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsMethodParam
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.params.CustomAttrsMethod

.. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam
:exclude-members: model_computed_fields

2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

setup(
name="zyte-spider-templates",
version="0.9.0",
version="0.10.0",
description="Spider templates for automatic crawlers.",
long_description=open("README.rst").read(),
long_description_content_type="text/x-rst",
11 changes: 5 additions & 6 deletions tests/test_ecommerce.py
@@ -12,7 +12,6 @@
ProbabilityRequest,
Product,
ProductNavigation,
Request,
SearchRequestTemplate,
SearchRequestTemplateMetadata,
)
@@ -405,7 +404,7 @@ def test_arguments():
spider = EcommerceSpider.from_crawler(crawler, **kwargs, **base_kwargs)
getter = getattr(crawler.settings, getter_name)
assert getter(setting) == new_setting_value
assert spider.allowed_domains == ["example.com"]
assert spider.allowed_domains == ["example.com"] # type: ignore[attr-defined]


def test_metadata():
@@ -666,7 +665,7 @@ def test_get_subcategory_request():
url = "https://example.com"

# Normal request but with mostly empty values
request = Request(url)
request = ProbabilityRequest(url=url)
spider = EcommerceSpider(url="https://example.com")
parse_navigation = lambda _: None
spider.parse_navigation = parse_navigation # type: ignore
@@ -737,7 +736,7 @@ def test_get_nextpage_request():
url = "https://example.com"

# Minimal Args
request = Request(url)
request = ProbabilityRequest(url=url)
spider = EcommerceSpider(url="https://example.com")
parse_navigation = lambda _: None
spider.parse_navigation = parse_navigation # type: ignore
@@ -756,7 +755,7 @@ def test_get_parse_navigation_request():
url = "https://example.com"

# Minimal args
request = Request(url)
request = ProbabilityRequest(url=url)
spider = EcommerceSpider(url="https://example.com")
parse_navigation = lambda _: None
spider.parse_navigation = parse_navigation # type: ignore
@@ -781,7 +780,7 @@ def test_set_allowed_domains(url, allowed_domain):

kwargs = {"url": url}
spider = EcommerceSpider.from_crawler(crawler, **kwargs)
assert spider.allowed_domains == [allowed_domain]
assert spider.allowed_domains == [allowed_domain] # type: ignore[attr-defined]


def test_input_none():
145 changes: 145 additions & 0 deletions tests/test_serp.py
@@ -1,6 +1,9 @@
import pytest
from pydantic import ValidationError
from scrapy import Request
from scrapy_spider_metadata import get_spider_metadata
from scrapy_zyte_api.responses import ZyteAPITextResponse
from w3lib.url import add_or_replace_parameter

from zyte_spider_templates.spiders.serp import GoogleSearchSpider

@@ -312,3 +315,145 @@ def test_search_queries():
assert len(requests) == 2
assert requests[0].url == "https://www.google.com/search?q=foo+bar"
assert requests[1].url == "https://www.google.com/search?q=baz"


def test_pagination():
crawler = get_crawler()
spider = GoogleSearchSpider.from_crawler(
crawler, search_queries="foo bar", max_pages=3
)

def run_parse_serp(total_results, page=1):
url = "https://www.google.com/search?q=foo+bar"
if page > 1:
url = add_or_replace_parameter(url, "start", (page - 1) * 10)
response = ZyteAPITextResponse.from_api_response(
api_response={
"serp": {
"organicResults": [
{
"description": "…",
"name": "…",
"url": f"https://example.com/{rank}",
"rank": rank,
}
for rank in range(1, 11)
],
"metadata": {
"dateDownloaded": "2024-10-25T08:59:45Z",
"displayedQuery": "foo bar",
"searchedQuery": "foo bar",
"totalOrganicResults": total_results,
},
"pageNumber": page,
"url": url,
},
"url": url,
},
)
items = []
requests = []
for item_or_request in spider.parse_serp(response, page_number=page):
if isinstance(item_or_request, Request):
requests.append(item_or_request)
else:
items.append(item_or_request)
return items, requests

items, requests = run_parse_serp(
total_results=10,
)
assert len(items) == 1
assert len(requests) == 0

items, requests = run_parse_serp(
total_results=11,
)
assert len(items) == 1
assert len(requests) == 1
assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=10"
assert requests[0].cb_kwargs["page_number"] == 2

items, requests = run_parse_serp(
total_results=20,
page=2,
)
assert len(items) == 1
assert len(requests) == 0

items, requests = run_parse_serp(
total_results=21,
page=2,
)
assert len(items) == 1
assert len(requests) == 1
assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20"
assert requests[0].cb_kwargs["page_number"] == 3

# Do not go over max_pages
items, requests = run_parse_serp(
total_results=31,
page=3,
)
assert len(items) == 1
assert len(requests) == 0


def test_get_serp_request():
crawler = get_crawler()
spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
url = "https://www.google.com/search?q=foo+bar"

request = spider.get_serp_request(url, page_number=42)
assert request.cb_kwargs["page_number"] == 42

# The page_number parameter is required.
with pytest.raises(TypeError):
spider.get_serp_request(url) # type: ignore[call-arg]


def test_parse_serp():
crawler = get_crawler()
spider = GoogleSearchSpider.from_crawler(
crawler, search_queries="foo bar", max_pages=43
)
url = "https://www.google.com/search?q=foo+bar"
response = ZyteAPITextResponse.from_api_response(
api_response={
"serp": {
"organicResults": [
{
"description": "…",
"name": "…",
"url": f"https://example.com/{rank}",
"rank": rank,
}
for rank in range(1, 11)
],
"metadata": {
"dateDownloaded": "2024-10-25T08:59:45Z",
"displayedQuery": "foo bar",
"searchedQuery": "foo bar",
"totalOrganicResults": 99999,
},
"pageNumber": 1,
"url": url,
},
"url": url,
},
)
items = []
requests = []
for item_or_request in spider.parse_serp(response, page_number=42):
if isinstance(item_or_request, Request):
requests.append(item_or_request)
else:
items.append(item_or_request)
assert len(items) == 1
assert len(requests) == 1
assert requests[0].url == add_or_replace_parameter(url, "start", "420")
assert requests[0].cb_kwargs["page_number"] == 43

# The page_number parameter is required.
with pytest.raises(TypeError):
spider.parse_serp(response) # type: ignore[call-arg]
@@ -45,7 +45,7 @@ def _probably_category_links(self) -> List[ProbabilityRequest]:
default_probability = 0.1

link_extractor = LinkExtractor(
allow_domains=self.page_params.get("full_domain")
allow_domains=self.page_params.get("full_domain", [])
)
ignore_urls = set(self._urls_for_category())

6 changes: 3 additions & 3 deletions zyte_spider_templates/params.py
@@ -189,7 +189,7 @@ def validate_input_group(model):


class UrlsFileParam(BaseModel):
urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type]
urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type]

@model_validator(mode="after")
def input_group(self):
@@ -227,7 +227,7 @@ def parse_input_params(spider):


class UrlParam(BaseModel):
url: str = Field(**URL_FIELD_KWARGS) # type: ignore[misc, arg-type]
url: str = Field(**URL_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type]


URLS_FIELD_KWARGS = {
@@ -281,7 +281,7 @@ def input_group(self):


class UrlsParam(BaseModel):
urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[misc, arg-type]
urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type]

@model_validator(mode="after")
def input_group(self):
17 changes: 14 additions & 3 deletions zyte_spider_templates/spiders/base.py
@@ -1,5 +1,7 @@
from __future__ import annotations

from importlib.metadata import version
from typing import Annotated, Any, Dict
from typing import TYPE_CHECKING, Annotated, Any, Dict
from warnings import warn

import scrapy
@@ -19,6 +21,11 @@
UrlsParam,
)

if TYPE_CHECKING:
# typing.Self requires Python 3.11
from typing_extensions import Self


# Higher priority than command-line-defined settings (40).
ARG_SETTING_PRIORITY: int = 50

@@ -55,7 +62,7 @@ def deprecated(self):


class BaseSpider(scrapy.Spider):
custom_settings: Dict[str, Any] = {
custom_settings: Dict[str, Any] = { # type: ignore[assignment]
"ZYTE_API_TRANSPARENT_MODE": True,
"_ZYTE_API_USER_AGENT": f"zyte-spider-templates/{version('zyte-spider-templates')}",
}
@@ -71,9 +78,13 @@ class BaseSpider(scrapy.Spider):
_custom_attrs_dep = None

@classmethod
def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self:
spider = super().from_crawler(crawler, *args, **kwargs)

# all subclasses of this need to also have Args as a subclass
# this may be possible to express in type hints instead
assert hasattr(spider, "args")

if geolocation := getattr(spider.args, "geolocation", None):
# We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected
# dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object