diff --git a/pyproject.toml b/pyproject.toml index cc7cb46..3f4ba8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,3 +8,4 @@ ignore_missing_imports = true [tool.black] target-version = ["py38", "py39", "py310", "py311", "py312"] +force-exclude = "template.py" diff --git a/setup.cfg b/setup.cfg index 3723f1f..d0b863f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,6 +26,9 @@ ignore = # First line should not be the function's "signature" D402 +exclude = + template.py + per-file-ignores = # F401: Ignore "imported but unused" errors in __init__ files, as those # imports are there to expose submodule functions so they can be imported diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 21d9ee4..3dca339 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -4,7 +4,6 @@ import pytest import requests import scrapy -from pydantic import ValidationError from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import get_spider_metadata from web_poet.page_inputs.browser import BrowserResponse @@ -21,43 +20,13 @@ GEOLOCATION_OPTIONS_WITH_CODE, Geolocation, ) -from zyte_spider_templates.spiders.ecommerce import ( - EcommerceCrawlStrategy, - EcommerceSpider, -) +from zyte_spider_templates.spiders.ecommerce import EcommerceSpider from . import get_crawler from .test_utils import URL_TO_DOMAIN from .utils import assertEqualSpiderMetadata -def test_parameters(): - with pytest.raises(ValidationError): - EcommerceSpider() - - EcommerceSpider(url="https://example.com") - EcommerceSpider( - url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.automatic - ) - EcommerceSpider(url="https://example.com", crawl_strategy="automatic") - - with pytest.raises(ValidationError): - EcommerceSpider(url="https://example.com", crawl_strategy="unknown") - - EcommerceSpider( - url="https://example.com", crawl_strategy="direct_item", search_queries="" - ) - EcommerceSpider( - url="https://example.com", crawl_strategy="automatic", search_queries="foo" - ) - with pytest.raises(ValidationError): - EcommerceSpider( - url="https://example.com", - crawl_strategy="direct_item", - search_queries="foo", - ) - - def test_start_requests(): url = "https://example.com" crawler = get_crawler() @@ -305,108 +274,6 @@ def test_parse_search_request_template_probability(probability, yields_items): assert items if yields_items else not items -def test_arguments(): - # Ensure passing no arguments works. - crawler = get_crawler() - - # Needed since it's a required argument. 
- base_kwargs = {"url": "https://example.com"} - - EcommerceSpider.from_crawler(crawler, **base_kwargs) - - for param, arg, setting, old_setting_value, getter_name, new_setting_value in ( - ("max_requests", "123", "ZYTE_API_MAX_REQUESTS", None, "getint", 123), - ( - "geolocation", - "DE", - "ZYTE_API_AUTOMAP_PARAMS", - None, - "getdict", - {"geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_AUTOMAP_PARAMS", - '{"browserHtml": true}', - "getdict", - {"browserHtml": True, "geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_AUTOMAP_PARAMS", - '{"geolocation": "IE"}', - "getdict", - {"geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_PROVIDER_PARAMS", - None, - "getdict", - {"geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_PROVIDER_PARAMS", - '{"browserHtml": true}', - "getdict", - {"browserHtml": True, "geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_PROVIDER_PARAMS", - '{"geolocation": "IE"}', - "getdict", - {"geolocation": "DE"}, - ), - ( - "extract_from", - "browserHtml", - "ZYTE_API_PROVIDER_PARAMS", - None, - "getdict", - { - "productOptions": {"extractFrom": "browserHtml"}, - "productNavigationOptions": {"extractFrom": "browserHtml"}, - }, - ), - ( - "extract_from", - "httpResponseBody", - "ZYTE_API_PROVIDER_PARAMS", - {"geolocation": "US"}, - "getdict", - { - "productOptions": {"extractFrom": "httpResponseBody"}, - "productNavigationOptions": {"extractFrom": "httpResponseBody"}, - "geolocation": "US", - }, - ), - ( - "extract_from", - None, - "ZYTE_API_PROVIDER_PARAMS", - {"geolocation": "US"}, - "getdict", - {"geolocation": "US"}, - ), - ): - kwargs = {param: arg} - settings = {} - if old_setting_value is not None: - settings[setting] = old_setting_value - crawler = get_crawler(settings=settings) - spider = EcommerceSpider.from_crawler(crawler, **kwargs, **base_kwargs) - getter = getattr(crawler.settings, getter_name) - assert getter(setting) == new_setting_value - assert spider.allowed_domains == ["example.com"] # type: ignore[attr-defined] - - def test_metadata(): actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True) expected_metadata = { @@ -550,11 +417,7 @@ def test_metadata(): {"type": "null"}, ], "default": None, - "description": ( - "ISO 3166-1 alpha-2 2-character string specified in " - "https://docs.zyte.com/zyte-api/usage/reference.html" - "#operation/extract/request/geolocation." - ), + "description": "Country of the IP addresses to use.", "enumMeta": { code: { "title": GEOLOCATION_OPTIONS_WITH_CODE[code], diff --git a/tests/test_params.py b/tests/test_params.py index df08a19..bc6bd15 100644 --- a/tests/test_params.py +++ b/tests/test_params.py @@ -1,8 +1,13 @@ import re import pytest +from pydantic import ValidationError +from zyte_spider_templates import EcommerceSpider, GoogleSearchSpider from zyte_spider_templates.params import URL_FIELD_KWARGS +from zyte_spider_templates.spiders.ecommerce import EcommerceCrawlStrategy + +from . 
import get_crawler @pytest.mark.parametrize( @@ -49,3 +54,218 @@ def test_url_pattern(url, valid): assert isinstance(URL_FIELD_KWARGS["pattern"], str) assert bool(re.match(URL_FIELD_KWARGS["pattern"], url)) == valid + + +REQUIRED_ARGS = { + EcommerceSpider: {"url": "https://example.com"}, + GoogleSearchSpider: {"search_queries": "foo"}, +} + + +@pytest.mark.parametrize( + ("spider_cls",), ((spider_cls,) for spider_cls in REQUIRED_ARGS) +) +def test_required_args(spider_cls): + crawler = get_crawler() + + with pytest.raises(ValidationError): + spider_cls.from_crawler(crawler) + + spider_cls.from_crawler(crawler, **REQUIRED_ARGS[spider_cls]) + + +@pytest.mark.parametrize( + ("spider_cls", "args", "valid"), + ( + ( + EcommerceSpider, + { + "url": "https://example.com", + "crawl_strategy": EcommerceCrawlStrategy.automatic, + }, + True, + ), + ( + EcommerceSpider, + {"url": "https://example.com", "crawl_strategy": "automatic"}, + True, + ), + ( + EcommerceSpider, + {"url": "https://example.com", "crawl_strategy": "unknown"}, + False, + ), + ( + EcommerceSpider, + { + "url": "https://example.com", + "crawl_strategy": "direct_item", + "search_queries": "", + }, + True, + ), + ( + EcommerceSpider, + { + "url": "https://example.com", + "crawl_strategy": "automatic", + "search_queries": "foo", + }, + True, + ), + ( + EcommerceSpider, + { + "url": "https://example.com", + "crawl_strategy": "direct_item", + "search_queries": "foo", + }, + False, + ), + (GoogleSearchSpider, {"domain": "google.com"}, False), + ( + GoogleSearchSpider, + {"domain": "google.cat", "search_queries": "foo bar"}, + True, + ), + ( + GoogleSearchSpider, + {"domain": "google.cat", "search_queries": "foo bar", "max_pages": 10}, + True, + ), + ( + GoogleSearchSpider, + {"domain": "google.foo", "search_queries": "foo bar"}, + False, + ), + (GoogleSearchSpider, {"search_queries": "foo bar", "max_pages": "all"}, False), + (GoogleSearchSpider, {"search_queries": "foo", "results_per_page": 0}, False), + ), +) +def test_arg_combinations(spider_cls, args, valid): + crawler = get_crawler() + if valid: + spider_cls.from_crawler(crawler, **args) + else: + with pytest.raises(ValidationError): + spider_cls.from_crawler(crawler, **args) + + +@pytest.mark.parametrize( + ("spider_cls", "param", "arg", "setting", "old", "getter", "new"), + ( + # extract_from + *( + (EcommerceSpider, *scenario) + for scenario in ( + ( + "extract_from", + "browserHtml", + "ZYTE_API_PROVIDER_PARAMS", + None, + "getdict", + { + "productOptions": {"extractFrom": "browserHtml"}, + "productNavigationOptions": {"extractFrom": "browserHtml"}, + }, + ), + ( + "extract_from", + "httpResponseBody", + "ZYTE_API_PROVIDER_PARAMS", + {"geolocation": "US"}, + "getdict", + { + "productOptions": {"extractFrom": "httpResponseBody"}, + "productNavigationOptions": {"extractFrom": "httpResponseBody"}, + "geolocation": "US", + }, + ), + ( + "extract_from", + None, + "ZYTE_API_PROVIDER_PARAMS", + {"geolocation": "US"}, + "getdict", + {"geolocation": "US"}, + ), + ) + ), + # geolocation + *( + (spider_cls, *scenario) + for spider_cls in (EcommerceSpider, GoogleSearchSpider) + for scenario in ( + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + None, + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + '{"browserHtml": true}', + "getdict", + {"browserHtml": True, "geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + '{"geolocation": "IE"}', + "getdict", + {"geolocation": "DE"}, + ), + ( + 
"geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + None, + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + '{"browserHtml": true}', + "getdict", + {"browserHtml": True, "geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + '{"geolocation": "IE"}', + "getdict", + {"geolocation": "DE"}, + ), + ) + ), + # max_requests + *( + ( + spider_cls, + "max_requests", + "123", + "ZYTE_API_MAX_REQUESTS", + None, + "getint", + 123, + ) + for spider_cls in (EcommerceSpider, GoogleSearchSpider) + ), + ), +) +def test_setting_setter_params(spider_cls, param, arg, setting, old, getter, new): + settings = {} + if old is not None: + settings[setting] = old + crawler = get_crawler(settings=settings) + spider_cls.from_crawler(crawler, **REQUIRED_ARGS[spider_cls], **{param: arg}) + read = getattr(crawler.settings, getter) + assert read(setting) == new diff --git a/tests/test_serp.py b/tests/test_serp.py index 571fca9..cd1a549 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -1,13 +1,22 @@ from urllib.parse import quote_plus import pytest -from pydantic import ValidationError from scrapy import Request from scrapy_spider_metadata import get_spider_metadata from scrapy_zyte_api.responses import ZyteAPITextResponse from w3lib.url import add_or_replace_parameter from zyte_common_items import Product +from zyte_spider_templates._geolocations import ( + GEOLOCATION_OPTIONS, + GEOLOCATION_OPTIONS_WITH_CODE, + Geolocation, +) +from zyte_spider_templates.spiders._google_gl import ( + GOOGLE_GL_OPTIONS, + GOOGLE_GL_OPTIONS_WITH_CODE, + GoogleGl, +) from zyte_spider_templates.spiders.serp import ( ITEM_TYPE_CLASSES, GoogleSearchSpider, @@ -18,7 +27,7 @@ from .utils import assertEqualSpiderMetadata -def run_parse_serp(spider, total_results=99999, page=1, query="foo"): +def run_parse_serp(spider, total_results=99999, page=1, query="foo", results=10): url = f"https://www.google.com/search?q={quote_plus(query)}" if page > 1: url = add_or_replace_parameter(url, "start", (page - 1) * 10) @@ -32,7 +41,7 @@ def run_parse_serp(spider, total_results=99999, page=1, query="foo"): "url": f"https://example.com/{rank}", "rank": rank, } - for rank in range(1, 11) + for rank in range(1, results + 1) ], "metadata": { "dateDownloaded": "2024-10-25T08:59:45Z", @@ -56,24 +65,6 @@ def run_parse_serp(spider, total_results=99999, page=1, query="foo"): return items, requests -def test_parameters(): - with pytest.raises(ValidationError): - GoogleSearchSpider() - - with pytest.raises(ValidationError): - GoogleSearchSpider(domain="google.com") - - GoogleSearchSpider(search_queries="foo bar") - GoogleSearchSpider(domain="google.cat", search_queries="foo bar") - GoogleSearchSpider(domain="google.cat", search_queries="foo bar", max_pages=10) - - with pytest.raises(ValidationError): - GoogleSearchSpider(domain="google.foo", search_queries="foo bar") - - with pytest.raises(ValidationError): - GoogleSearchSpider(search_queries="foo bar", max_pages="all") - - def test_start_requests(): crawler = get_crawler() spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") @@ -295,6 +286,19 @@ def test_metadata(): "title": "Search Queries", "widget": "textarea", }, + "max_requests": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 100, + "description": ( + "The maximum number of Zyte API requests allowed for the crawl.\n" + "\n" + "Requests with error responses that cannot be retried or exceed " + "their retry limit also count 
here, but they incur in no costs " + "and do not increase the request count in Scrapy Cloud." + ), + "title": "Max Requests", + "widget": "request-limit", + }, "max_pages": { "default": 1, "description": ( @@ -337,18 +341,57 @@ def test_metadata(): ], "title": "Follow and Extract", }, - "max_requests": { - "anyOf": [{"type": "integer"}, {"type": "null"}], - "default": 100, + "gl": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, "description": ( - "The maximum number of Zyte API requests allowed for the crawl.\n" - "\n" - "Requests with error responses that cannot be retried or exceed " - "their retry limit also count here, but they incur in no costs " - "and do not increase the request count in Scrapy Cloud." + "Boosts results relevant to this country. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.gl" + ), + "enumMeta": { + code: { + "title": GOOGLE_GL_OPTIONS_WITH_CODE[code], + } + for code in sorted(GoogleGl) + }, + "title": "User Country", + "enum": list( + sorted(GOOGLE_GL_OPTIONS, key=GOOGLE_GL_OPTIONS.__getitem__) + ), + }, + "cr": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "description": ( + "Restricts search results to documents originating in " + "particular countries. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.cr" + ), + "title": "Content Countries", + }, + "geolocation": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "description": "Country of the IP addresses to use.", + "enumMeta": { + code: { + "title": GEOLOCATION_OPTIONS_WITH_CODE[code], + } + for code in sorted(Geolocation) + }, + "title": "IP Country", + "enum": list( + sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__) ), - "title": "Max Requests", - "widget": "request-limit", }, }, "required": ["search_queries"], @@ -358,6 +401,11 @@ def test_metadata(): } assertEqualSpiderMetadata(actual_metadata, expected_metadata) + geolocation = actual_metadata["param_schema"]["properties"]["geolocation"] + assert geolocation["enum"][0] == "AF" + assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"} + assert set(geolocation["enum"]) == set(geolocation["enumMeta"]) + def test_input_none(): crawler = get_crawler() @@ -438,6 +486,26 @@ def test_pagination(): assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20" assert requests[0].cb_kwargs["page_number"] == 3 + items, requests = run_parse_serp( + spider, + total_results=None, + ) + assert len(items) == 1 + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo&start=10" + assert requests[0].cb_kwargs["page_number"] == 2 + + # Ensure a lack of results stops pagination even if total_results reports + # additional results. 
+ # https://github.com/zytedata/zyte-spider-templates/pull/80/files/359c342008e2e4d5a913d450ddd2dda6c887747c#r1840897802 + items, requests = run_parse_serp( + spider, + total_results=None, + results=0, + ) + assert len(items) == 1 + assert len(requests) == 0 + # Do not go over max_pages items, requests = run_parse_serp( spider, @@ -508,6 +576,42 @@ def test_parse_serp(): spider.parse_serp(response) # type: ignore[call-arg] +def test_cr(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo", cr="(-countryFR).(-countryIT)", max_pages=2 + ) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert ( + requests[0].url + == "https://www.google.com/search?q=foo&cr=%28-countryFR%29.%28-countryIT%29" + ) + + items, requests = run_parse_serp(spider) + assert len(items) == 1 + assert len(requests) == 1 + assert ( + requests[0].url + == "https://www.google.com/search?q=foo&start=10&cr=%28-countryFR%29.%28-countryIT%29" + ) + + +def test_gl(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo", gl="af", max_pages=2 + ) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo&gl=af" + + items, requests = run_parse_serp(spider) + assert len(items) == 1 + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo&start=10&gl=af" + + def test_results_per_page(): crawler = get_crawler() spider = GoogleSearchSpider.from_crawler( @@ -523,14 +627,6 @@ def test_results_per_page(): assert requests[0].url == "https://www.google.com/search?q=foo&start=1&num=1" -def test_results_per_page_min(): - crawler = get_crawler() - with pytest.raises(ValidationError): - GoogleSearchSpider.from_crawler( - crawler, search_queries="foo", results_per_page=0 - ) - - def test_item_type(): crawler = get_crawler() spider = GoogleSearchSpider.from_crawler( diff --git a/utils/google-gl-updater/requirements.in b/utils/google-gl-updater/requirements.in new file mode 100644 index 0000000..25d38c0 --- /dev/null +++ b/utils/google-gl-updater/requirements.in @@ -0,0 +1,3 @@ +jinja2 +parsel +requests diff --git a/utils/google-gl-updater/requirements.txt b/utils/google-gl-updater/requirements.txt new file mode 100644 index 0000000..93b80f5 --- /dev/null +++ b/utils/google-gl-updater/requirements.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile +# +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +cssselect==1.2.0 + # via parsel +idna==3.10 + # via requests +jinja2==3.1.4 + # via -r requirements.in +jmespath==1.0.1 + # via parsel +lxml==5.3.0 + # via parsel +markupsafe==3.0.2 + # via jinja2 +packaging==24.2 + # via parsel +parsel==1.9.1 + # via -r requirements.in +requests==2.32.3 + # via -r requirements.in +urllib3==2.2.3 + # via requests +w3lib==2.2.1 + # via parsel diff --git a/utils/google-gl-updater/template.py b/utils/google-gl-updater/template.py new file mode 100644 index 0000000..9112d9a --- /dev/null +++ b/utils/google-gl-updater/template.py @@ -0,0 +1,18 @@ +{% raw %}# ../_geolocations.py counterpart for +# https://developers.google.com/custom-search/docs/json_api_reference#countryCodes +# +# Built automatically with ../../utils/google-gl-updater + +from enum import Enum + +GOOGLE_GL_OPTIONS = {{% endraw %}{% for country in countries %} + "{{ country.code }}": "{{ country.name }}",{% 
endfor %}{% raw %} +} +GOOGLE_GL_OPTIONS_WITH_CODE = { + code: f"{name} ({code})" for code, name in GOOGLE_GL_OPTIONS.items() +} + + +class GoogleGl(str, Enum):{% endraw %}{% for country in countries %} + {{ country.keyword }}: str = "{{ country.code }}"{% endfor %} + diff --git a/utils/google-gl-updater/update.py b/utils/google-gl-updater/update.py new file mode 100644 index 0000000..28f7d63 --- /dev/null +++ b/utils/google-gl-updater/update.py @@ -0,0 +1,35 @@ +from keyword import iskeyword +from pathlib import Path + +import jinja2 +import requests +from parsel import Selector + +countries = [] + +response = requests.get( + "https://developers.google.com/custom-search/docs/json_api_reference" +) +selector = Selector(text=response.text) +table = selector.xpath('//*[@id="country-codes"]/following-sibling::table[1]') +for tr in table.css("tr"): + name = tr.xpath("td/text()").get() + if not name: # header + continue + code = tr.xpath("td/span/text()").get() + keyword = f"{code}_" if iskeyword(code) else code + countries.append({"code": code, "keyword": keyword, "name": name}) + +template_path = Path(__file__).parent / "template.py" +template_environment = jinja2.Environment() +with template_path.open() as f: + template = template_environment.from_string(f.read()) +output = template.render(countries=countries) +output_path = ( + Path(__file__).parent.parent.parent + / "zyte_spider_templates" + / "spiders" + / "_google_gl.py" +) +with output_path.open("w") as f: + f.write(output) diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index 0500ac8..d844fc2 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -78,8 +78,7 @@ class ExtractFromParam(BaseModel): class GeolocationParam(BaseModel): geolocation: Optional[Geolocation] = Field( title="Geolocation", - description="ISO 3166-1 alpha-2 2-character string specified in " - "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.", + description="Country of the IP addresses to use.", default=None, json_schema_extra={ "enumMeta": { diff --git a/zyte_spider_templates/spiders/_google_gl.py b/zyte_spider_templates/spiders/_google_gl.py new file mode 100644 index 0000000..6e01d38 --- /dev/null +++ b/zyte_spider_templates/spiders/_google_gl.py @@ -0,0 +1,493 @@ +# ../_geolocations.py counterpart for +# https://developers.google.com/custom-search/docs/json_api_reference#countryCodes +# +# Built automatically with ../../utils/google-gl-updater + +from enum import Enum + +GOOGLE_GL_OPTIONS = { + "af": "Afghanistan", + "al": "Albania", + "dz": "Algeria", + "as": "American Samoa", + "ad": "Andorra", + "ao": "Angola", + "ai": "Anguilla", + "aq": "Antarctica", + "ag": "Antigua and Barbuda", + "ar": "Argentina", + "am": "Armenia", + "aw": "Aruba", + "au": "Australia", + "at": "Austria", + "az": "Azerbaijan", + "bs": "Bahamas", + "bh": "Bahrain", + "bd": "Bangladesh", + "bb": "Barbados", + "by": "Belarus", + "be": "Belgium", + "bz": "Belize", + "bj": "Benin", + "bm": "Bermuda", + "bt": "Bhutan", + "bo": "Bolivia", + "ba": "Bosnia and Herzegovina", + "bw": "Botswana", + "bv": "Bouvet Island", + "br": "Brazil", + "io": "British Indian Ocean Territory", + "bn": "Brunei Darussalam", + "bg": "Bulgaria", + "bf": "Burkina Faso", + "bi": "Burundi", + "kh": "Cambodia", + "cm": "Cameroon", + "ca": "Canada", + "cv": "Cape Verde", + "ky": "Cayman Islands", + "cf": "Central African Republic", + "td": "Chad", + "cl": "Chile", + "cn": "China", + "cx": "Christmas Island", + "cc": 
"Cocos (Keeling) Islands", + "co": "Colombia", + "km": "Comoros", + "cg": "Congo", + "cd": "Congo, the Democratic Republic of the", + "ck": "Cook Islands", + "cr": "Costa Rica", + "ci": "Cote D'ivoire", + "hr": "Croatia", + "cu": "Cuba", + "cy": "Cyprus", + "cz": "Czech Republic", + "dk": "Denmark", + "dj": "Djibouti", + "dm": "Dominica", + "do": "Dominican Republic", + "ec": "Ecuador", + "eg": "Egypt", + "sv": "El Salvador", + "gq": "Equatorial Guinea", + "er": "Eritrea", + "ee": "Estonia", + "et": "Ethiopia", + "fk": "Falkland Islands (Malvinas)", + "fo": "Faroe Islands", + "fj": "Fiji", + "fi": "Finland", + "fr": "France", + "gf": "French Guiana", + "pf": "French Polynesia", + "tf": "French Southern Territories", + "ga": "Gabon", + "gm": "Gambia", + "ge": "Georgia", + "de": "Germany", + "gh": "Ghana", + "gi": "Gibraltar", + "gr": "Greece", + "gl": "Greenland", + "gd": "Grenada", + "gp": "Guadeloupe", + "gu": "Guam", + "gt": "Guatemala", + "gn": "Guinea", + "gw": "Guinea-Bissau", + "gy": "Guyana", + "ht": "Haiti", + "hm": "Heard Island and Mcdonald Islands", + "va": "Holy See (Vatican City State)", + "hn": "Honduras", + "hk": "Hong Kong", + "hu": "Hungary", + "is": "Iceland", + "in": "India", + "id": "Indonesia", + "ir": "Iran, Islamic Republic of", + "iq": "Iraq", + "ie": "Ireland", + "il": "Israel", + "it": "Italy", + "jm": "Jamaica", + "jp": "Japan", + "jo": "Jordan", + "kz": "Kazakhstan", + "ke": "Kenya", + "ki": "Kiribati", + "kp": "Korea, Democratic People's Republic of", + "kr": "Korea, Republic of", + "kw": "Kuwait", + "kg": "Kyrgyzstan", + "la": "Lao People's Democratic Republic", + "lv": "Latvia", + "lb": "Lebanon", + "ls": "Lesotho", + "lr": "Liberia", + "ly": "Libyan Arab Jamahiriya", + "li": "Liechtenstein", + "lt": "Lithuania", + "lu": "Luxembourg", + "mo": "Macao", + "mk": "Macedonia, the Former Yugosalv Republic of", + "mg": "Madagascar", + "mw": "Malawi", + "my": "Malaysia", + "mv": "Maldives", + "ml": "Mali", + "mt": "Malta", + "mh": "Marshall Islands", + "mq": "Martinique", + "mr": "Mauritania", + "mu": "Mauritius", + "yt": "Mayotte", + "mx": "Mexico", + "fm": "Micronesia, Federated States of", + "md": "Moldova, Republic of", + "mc": "Monaco", + "mn": "Mongolia", + "ms": "Montserrat", + "ma": "Morocco", + "mz": "Mozambique", + "mm": "Myanmar", + "na": "Namibia", + "nr": "Nauru", + "np": "Nepal", + "nl": "Netherlands", + "an": "Netherlands Antilles", + "nc": "New Caledonia", + "nz": "New Zealand", + "ni": "Nicaragua", + "ne": "Niger", + "ng": "Nigeria", + "nu": "Niue", + "nf": "Norfolk Island", + "mp": "Northern Mariana Islands", + "no": "Norway", + "om": "Oman", + "pk": "Pakistan", + "pw": "Palau", + "ps": "Palestinian Territory, Occupied", + "pa": "Panama", + "pg": "Papua New Guinea", + "py": "Paraguay", + "pe": "Peru", + "ph": "Philippines", + "pn": "Pitcairn", + "pl": "Poland", + "pt": "Portugal", + "pr": "Puerto Rico", + "qa": "Qatar", + "re": "Reunion", + "ro": "Romania", + "ru": "Russian Federation", + "rw": "Rwanda", + "sh": "Saint Helena", + "kn": "Saint Kitts and Nevis", + "lc": "Saint Lucia", + "pm": "Saint Pierre and Miquelon", + "vc": "Saint Vincent and the Grenadines", + "ws": "Samoa", + "sm": "San Marino", + "st": "Sao Tome and Principe", + "sa": "Saudi Arabia", + "sn": "Senegal", + "cs": "Serbia and Montenegro", + "sc": "Seychelles", + "sl": "Sierra Leone", + "sg": "Singapore", + "sk": "Slovakia", + "si": "Slovenia", + "sb": "Solomon Islands", + "so": "Somalia", + "za": "South Africa", + "gs": "South Georgia and the South Sandwich Islands", + "es": 
"Spain", + "lk": "Sri Lanka", + "sd": "Sudan", + "sr": "Suriname", + "sj": "Svalbard and Jan Mayen", + "sz": "Swaziland", + "se": "Sweden", + "ch": "Switzerland", + "sy": "Syrian Arab Republic", + "tw": "Taiwan, Province of China", + "tj": "Tajikistan", + "tz": "Tanzania, United Republic of", + "th": "Thailand", + "tl": "Timor-Leste", + "tg": "Togo", + "tk": "Tokelau", + "to": "Tonga", + "tt": "Trinidad and Tobago", + "tn": "Tunisia", + "tr": "Turkey", + "tm": "Turkmenistan", + "tc": "Turks and Caicos Islands", + "tv": "Tuvalu", + "ug": "Uganda", + "ua": "Ukraine", + "ae": "United Arab Emirates", + "uk": "United Kingdom", + "us": "United States", + "um": "United States Minor Outlying Islands", + "uy": "Uruguay", + "uz": "Uzbekistan", + "vu": "Vanuatu", + "ve": "Venezuela", + "vn": "Viet Nam", + "vg": "Virgin Islands, British", + "vi": "Virgin Islands, U.S.", + "wf": "Wallis and Futuna", + "eh": "Western Sahara", + "ye": "Yemen", + "zm": "Zambia", + "zw": "Zimbabwe", +} +GOOGLE_GL_OPTIONS_WITH_CODE = { + code: f"{name} ({code})" for code, name in GOOGLE_GL_OPTIONS.items() +} + + +class GoogleGl(str, Enum): + af: str = "af" + al: str = "al" + dz: str = "dz" + as_: str = "as" + ad: str = "ad" + ao: str = "ao" + ai: str = "ai" + aq: str = "aq" + ag: str = "ag" + ar: str = "ar" + am: str = "am" + aw: str = "aw" + au: str = "au" + at: str = "at" + az: str = "az" + bs: str = "bs" + bh: str = "bh" + bd: str = "bd" + bb: str = "bb" + by: str = "by" + be: str = "be" + bz: str = "bz" + bj: str = "bj" + bm: str = "bm" + bt: str = "bt" + bo: str = "bo" + ba: str = "ba" + bw: str = "bw" + bv: str = "bv" + br: str = "br" + io: str = "io" + bn: str = "bn" + bg: str = "bg" + bf: str = "bf" + bi: str = "bi" + kh: str = "kh" + cm: str = "cm" + ca: str = "ca" + cv: str = "cv" + ky: str = "ky" + cf: str = "cf" + td: str = "td" + cl: str = "cl" + cn: str = "cn" + cx: str = "cx" + cc: str = "cc" + co: str = "co" + km: str = "km" + cg: str = "cg" + cd: str = "cd" + ck: str = "ck" + cr: str = "cr" + ci: str = "ci" + hr: str = "hr" + cu: str = "cu" + cy: str = "cy" + cz: str = "cz" + dk: str = "dk" + dj: str = "dj" + dm: str = "dm" + do: str = "do" + ec: str = "ec" + eg: str = "eg" + sv: str = "sv" + gq: str = "gq" + er: str = "er" + ee: str = "ee" + et: str = "et" + fk: str = "fk" + fo: str = "fo" + fj: str = "fj" + fi: str = "fi" + fr: str = "fr" + gf: str = "gf" + pf: str = "pf" + tf: str = "tf" + ga: str = "ga" + gm: str = "gm" + ge: str = "ge" + de: str = "de" + gh: str = "gh" + gi: str = "gi" + gr: str = "gr" + gl: str = "gl" + gd: str = "gd" + gp: str = "gp" + gu: str = "gu" + gt: str = "gt" + gn: str = "gn" + gw: str = "gw" + gy: str = "gy" + ht: str = "ht" + hm: str = "hm" + va: str = "va" + hn: str = "hn" + hk: str = "hk" + hu: str = "hu" + is_: str = "is" + in_: str = "in" + id: str = "id" + ir: str = "ir" + iq: str = "iq" + ie: str = "ie" + il: str = "il" + it: str = "it" + jm: str = "jm" + jp: str = "jp" + jo: str = "jo" + kz: str = "kz" + ke: str = "ke" + ki: str = "ki" + kp: str = "kp" + kr: str = "kr" + kw: str = "kw" + kg: str = "kg" + la: str = "la" + lv: str = "lv" + lb: str = "lb" + ls: str = "ls" + lr: str = "lr" + ly: str = "ly" + li: str = "li" + lt: str = "lt" + lu: str = "lu" + mo: str = "mo" + mk: str = "mk" + mg: str = "mg" + mw: str = "mw" + my: str = "my" + mv: str = "mv" + ml: str = "ml" + mt: str = "mt" + mh: str = "mh" + mq: str = "mq" + mr: str = "mr" + mu: str = "mu" + yt: str = "yt" + mx: str = "mx" + fm: str = "fm" + md: str = "md" + mc: str = "mc" + mn: str = "mn" + ms: str = 
"ms" + ma: str = "ma" + mz: str = "mz" + mm: str = "mm" + na: str = "na" + nr: str = "nr" + np: str = "np" + nl: str = "nl" + an: str = "an" + nc: str = "nc" + nz: str = "nz" + ni: str = "ni" + ne: str = "ne" + ng: str = "ng" + nu: str = "nu" + nf: str = "nf" + mp: str = "mp" + no: str = "no" + om: str = "om" + pk: str = "pk" + pw: str = "pw" + ps: str = "ps" + pa: str = "pa" + pg: str = "pg" + py: str = "py" + pe: str = "pe" + ph: str = "ph" + pn: str = "pn" + pl: str = "pl" + pt: str = "pt" + pr: str = "pr" + qa: str = "qa" + re: str = "re" + ro: str = "ro" + ru: str = "ru" + rw: str = "rw" + sh: str = "sh" + kn: str = "kn" + lc: str = "lc" + pm: str = "pm" + vc: str = "vc" + ws: str = "ws" + sm: str = "sm" + st: str = "st" + sa: str = "sa" + sn: str = "sn" + cs: str = "cs" + sc: str = "sc" + sl: str = "sl" + sg: str = "sg" + sk: str = "sk" + si: str = "si" + sb: str = "sb" + so: str = "so" + za: str = "za" + gs: str = "gs" + es: str = "es" + lk: str = "lk" + sd: str = "sd" + sr: str = "sr" + sj: str = "sj" + sz: str = "sz" + se: str = "se" + ch: str = "ch" + sy: str = "sy" + tw: str = "tw" + tj: str = "tj" + tz: str = "tz" + th: str = "th" + tl: str = "tl" + tg: str = "tg" + tk: str = "tk" + to: str = "to" + tt: str = "tt" + tn: str = "tn" + tr: str = "tr" + tm: str = "tm" + tc: str = "tc" + tv: str = "tv" + ug: str = "ug" + ua: str = "ua" + ae: str = "ae" + uk: str = "uk" + us: str = "us" + um: str = "um" + uy: str = "uy" + uz: str = "uz" + vu: str = "vu" + ve: str = "ve" + vn: str = "vn" + vg: str = "vg" + vi: str = "vi" + wf: str = "wf" + eh: str = "eh" + ye: str = "ye" + zm: str = "zm" + zw: str = "zw" diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index f0a5b26..f83deb8 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -17,12 +17,45 @@ Serp, ) +from .._geolocations import GEOLOCATION_OPTIONS_WITH_CODE, Geolocation from ..documentation import document_enum from ..params import MaxRequestsParam from ._google_domains import GoogleDomain +from ._google_gl import GOOGLE_GL_OPTIONS_WITH_CODE, GoogleGl from .base import BaseSpider +class GoogleCrParam(BaseModel): + cr: Optional[str] = Field( + title="Content Countries", + description=( + "Restricts search results to documents originating in " + "particular countries. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.cr" + ), + default=None, + ) + + +class GoogleGlParam(BaseModel): + gl: Optional[GoogleGl] = Field( + title="User Country", + description=( + "Boosts results relevant to this country. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.gl" + ), + default=None, + json_schema_extra={ + "enumMeta": { + code: { + "title": GOOGLE_GL_OPTIONS_WITH_CODE[code], + } + for code in GoogleGl + } + }, + ) + + class SearchQueriesParam(BaseModel): search_queries: Optional[List[str]] = Field( title="Search Queries", @@ -50,6 +83,26 @@ def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: return result +class SerpGeolocationParam(BaseModel): + # We use “geolocation” as parameter name (instead of e.g. “ip_geolocation”) + # to reuse the implementation in BaseSpider. + geolocation: Optional[Geolocation] = Field( + # The title, worded like this for contrast with gl, is the reason why + # ..params.GeolocationParam is not used. 
+ title="IP Country", + description="Country of the IP addresses to use.", + default=None, + json_schema_extra={ + "enumMeta": { + code: { + "title": GEOLOCATION_OPTIONS_WITH_CODE[code], + } + for code in Geolocation + } + }, + ) + + class SerpMaxPagesParam(BaseModel): max_pages: int = Field( title="Max Pages", @@ -133,10 +186,13 @@ class GoogleDomainParam(BaseModel): class GoogleSearchSpiderParams( - MaxRequestsParam, + SerpGeolocationParam, + GoogleCrParam, + GoogleGlParam, SerpItemTypeParam, SerpResultsPerPageParam, SerpMaxPagesParam, + MaxRequestsParam, SearchQueriesParam, GoogleDomainParam, BaseModel, @@ -177,6 +233,10 @@ def update_settings(cls, settings: BaseSettings) -> None: ) def get_serp_request(self, url: str, *, page_number: int): + if self.args.cr: + url = add_or_replace_parameter(url, "cr", self.args.cr) + if self.args.gl: + url = add_or_replace_parameter(url, "gl", self.args.gl.value) if self.args.results_per_page: url = add_or_replace_parameter(url, "num", str(self.args.results_per_page)) return Request( @@ -210,7 +270,10 @@ def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]: next_start = page_number * ( self.args.results_per_page or self._default_results_per_page ) - if serp.organicResults and serp.metadata.totalOrganicResults > next_start: + if serp.organicResults and ( + serp.metadata.totalOrganicResults is None + or serp.metadata.totalOrganicResults > next_start + ): next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) yield self.get_serp_request(next_url, page_number=page_number + 1)