diff --git a/docs/conf.py b/docs/conf.py index 569c89e..0d0a14e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,6 +22,14 @@ html_theme = "sphinx_rtd_theme" intersphinx_mapping = { + "form2request": ( + "https://form2request.readthedocs.io/en/latest", + None, + ), + "formasaurus": ( + "https://formasaurus.readthedocs.io/en/latest", + None, + ), "python": ( "https://docs.python.org/3", None, diff --git a/docs/customization/pages.rst b/docs/customization/pages.rst index f373788..46da4c9 100644 --- a/docs/customization/pages.rst +++ b/docs/customization/pages.rst @@ -6,7 +6,8 @@ Customizing page objects All parsing is implemented using :ref:`web-poet page objects ` that use `Zyte API automatic extraction`_ to extract :ref:`standard items -`, both for navigation and for item details. +`: for navigation, for item details, and even for :ref:`search +request generation `. .. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html @@ -141,3 +142,27 @@ To extract a new field for one or more websites: def parse_product(self, response: DummyResponse, product: CustomProduct): yield from super().parse_product(response, product) + +.. _fix-search: + +Fixing search support +===================== + +If the default implementation for building a request out of :ref:`search queries +` does not work on a given website, you can implement your +own search request page object to fix that. See +:ref:`custom-request-template-page`. + +For example: + +.. code-block:: python + + from web_poet import field, handle_urls + from zyte_common_items import BaseSearchRequestTemplatePage + + + @handle_urls("example.com") + class ExampleComSearchRequestTemplatePage(BaseSearchRequestTemplatePage): + @field + def url(self): + return "https://example.com/search?q={{ query|quote_plus }}" diff --git a/docs/features/search.rst b/docs/features/search.rst new file mode 100644 index 0000000..8dec02a --- /dev/null +++ b/docs/features/search.rst @@ -0,0 +1,43 @@ +.. _search-queries: + +============== +Search queries +============== + +The :ref:`e-commerce spider template ` supports a spider argument, +:data:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.search_queries`, +that lets you define one search query per line, and +turns the input URLs into search requests for those queries. + +For example, given the following input URLs: + +.. code-block:: none + + https://a.example + https://b.example + +And the following list of search queries: + +.. code-block:: none + + foo bar + baz + +By default, the spider sends 2 initial requests to those 2 input URLs, +to try to find out how to build a search request for each of them, and if that succeeds, +it then sends 4 search requests, 1 per combination of input URL and search +query. For example: + +.. code-block:: none + + https://a.example/search?q=foo+bar + https://a.example/search?q=baz + https://b.example/s/foo%20bar + https://b.example/s/baz + +The default implementation uses a combination of HTML metadata, AI-based HTML +form inspection, and heuristics to find the most likely way to build a search +request for a given website. + +If this default implementation does not work as expected on a given website, +you can :ref:`write a page object to fix that `. diff --git a/docs/index.rst b/docs/index.rst index 1083299..dd568ea 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,6 +20,12 @@ zyte-spider-templates documentation E-commerce Google search +.. toctree:: + :caption: Features + :hidden: + + Search queries + ..
toctree:: :caption: Customization :hidden: diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..1152570 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore:deprecated string literal syntax::jmespath.lexer diff --git a/setup.py b/setup.py index 444bea7..cf48be0 100644 --- a/setup.py +++ b/setup.py @@ -12,13 +12,18 @@ packages=find_packages(), include_package_data=True, install_requires=[ + "extruct>=0.18.0", + "form2request>=0.2.0", + "formasaurus>=0.10.0", + "jmespath>=0.9.5", "pydantic>=2.1", - "requests>=0.10.1", + "requests>=1.0.0", "scrapy>=2.11.0", "scrapy-poet>=0.24.0", "scrapy-spider-metadata>=0.2.0", "scrapy-zyte-api[provider]>=0.23.0", - "zyte-common-items>=0.23.0", + "web-poet>=0.17.1", + "zyte-common-items>=0.25.0", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index ee0f271..21d9ee4 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -7,7 +7,14 @@ from pydantic import ValidationError from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import get_spider_metadata -from zyte_common_items import ProbabilityRequest, Product, ProductNavigation +from web_poet.page_inputs.browser import BrowserResponse +from zyte_common_items import ( + ProbabilityRequest, + Product, + ProductNavigation, + SearchRequestTemplate, + SearchRequestTemplateMetadata, +) from zyte_spider_templates._geolocations import ( GEOLOCATION_OPTIONS, @@ -37,6 +44,19 @@ def test_parameters(): with pytest.raises(ValidationError): EcommerceSpider(url="https://example.com", crawl_strategy="unknown") + EcommerceSpider( + url="https://example.com", crawl_strategy="direct_item", search_queries="" + ) + EcommerceSpider( + url="https://example.com", crawl_strategy="automatic", search_queries="foo" + ) + with pytest.raises(ValidationError): + EcommerceSpider( + url="https://example.com", + crawl_strategy="direct_item", + search_queries="foo", + ) + def test_start_requests(): url = "https://example.com" @@ -258,6 +278,33 @@ def test_parse_product(probability, has_item, item_drop, caplog): assert str(product) in caplog.text +@pytest.mark.parametrize( + ("probability", "yields_items"), + ( + (None, True), # Default + (-1.0, False), + (0.0, False), # page.no_item_found() + (1.0, True), + ), +) +def test_parse_search_request_template_probability(probability, yields_items): + crawler = get_crawler() + spider = EcommerceSpider.from_crawler( + crawler, url="https://example.com", search_queries="foo" + ) + search_request_template = SearchRequestTemplate(url="https://example.com") + if probability is not None: + search_request_template.metadata = SearchRequestTemplateMetadata( + probability=probability + ) + items = list( + spider.parse_search_request_template( + DummyResponse("https://example.com"), search_request_template, DynamicDeps() + ) + ) + assert items if yields_items else not items + + def test_arguments(): # Ensure passing no arguments works. crawler = get_crawler() @@ -420,6 +467,17 @@ def test_metadata(): "title": "URLs file", "type": "string", }, + "search_queries": { + "default": [], + "description": ( + "A list of search queries, one per line, to submit " + "using the search form found on each input URL." 
+ ), + "items": {"type": "string"}, + "title": "Search Queries", + "type": "array", + "widget": "textarea", + }, "crawl_strategy": { "default": "automatic", "description": "Determines how the start URL and follow-up URLs are crawled.", @@ -820,6 +878,58 @@ def test_urls_file(): assert start_requests[2].url == "https://c.example" +def test_search_queries(): + crawler = get_crawler() + url = "https://example.com" + + spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo bar") + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_search_request_template + assert spider.args.search_queries == ["foo bar"] + + spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo\nbar") + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_search_request_template + assert spider.args.search_queries == ["foo", "bar"] + + spider = EcommerceSpider.from_crawler( + crawler, url=url, search_queries=["foo", "bar"] + ) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_search_request_template + assert spider.args.search_queries == ["foo", "bar"] + + +def test_search_queries_extract_from(): + crawler = get_crawler() + url = "https://example.com" + + spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo") + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert "inject" not in start_requests[0].meta + + spider = EcommerceSpider.from_crawler( + crawler, url=url, search_queries="foo", extract_from="httpResponseBody" + ) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert "inject" not in start_requests[0].meta + + spider = EcommerceSpider.from_crawler( + crawler, url=url, search_queries="foo", extract_from="browserHtml" + ) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].meta["inject"] == [BrowserResponse] + + @pytest.mark.parametrize( "url,has_full_domain", ( diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 0000000..c4554a8 --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,666 @@ +import pytest +from pytest_twisted import ensureDeferred +from web_poet import AnyResponse, BrowserResponse, HttpResponse, PageParams + +from zyte_spider_templates.pages.search_request_template import ( + DefaultSearchRequestTemplatePage, +) + + +@pytest.mark.parametrize( + ("html", "page_params", "expected"), + ( + # Extruct #-----------------------------------------------------------# + # JSON-LD example from Google + # https://developers.google.com/search/docs/appearance/structured-data/sitelinks-searchbox#example + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Microdata example from Google + # https://developers.google.com/search/docs/appearance/structured-data/sitelinks-searchbox#example + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Non-compliant JSON-LD that uses a JSON array for potentialAction + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Non-default placeholder, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}&dont_replace={search_term_string}", + }, + ), + # Non-default placeholder, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}&dont_replace={search_term_string}", + }, + ), + # JSON-LD, WebSite isPartOf WebPage + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Relative URL, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/search?q={{ query|quote_plus }}", + }, + ), + # Relative URL, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/search?q={{ query|quote_plus }}", + }, + ), + # Wrong escaping in JSON-LD + ( + rb""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/search?a=b&q={{ query|quote_plus }}", + }, + ), + # Query in path, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/s/{{ query|urlencode }}", + }, + ), + # Relative URL, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/s/{{ query|urlencode }}", + }, + ), + # No potentialAction, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "error": "Cannot build a search request template", + }, + ), + # No potentialAction, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No SearchAction type, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No SearchAction type, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No target, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No target, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No query variable name, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # No query variable name, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Formasaurus and form heuristics #-----------------------------------# + *( + (html, {"search_request_builders": [builder]}, expected) + for builder in ("formasaurus", "form_heuristics") + for html, expected in ( + # Basic form + ( + b""" + + """, + { + "url": "https://example.com?q={{ query|quote_plus }}", + }, + ), + # No form + ( + b"
", + {"error": "Cannot build a search request template"}, + ), + # No named input field + ( + b""" + + """, + {"error": "Cannot build a search request template"}, + ), + # Multi-part form + ( + b""" + + """, + {"error": "Cannot build a search request template"}, + ), + # Non-HTML response (JSON) + ( + b"""{"a": "b"}""", + {"error": "Cannot build a search request template"}, + ), + ) + ), + # Link heuristics #---------------------------------------------------# + # Link with recognized parameters + *( + ( + f"""""", + {"search_request_builders": ["link_heuristics"]}, + {"error": "Cannot build a search request template"}, + ), + # No HTML (JSON) + ( + b"""{"a": "b"}""", + {"search_request_builders": ["link_heuristics"]}, + {"error": "Cannot build a search request template"}, + ), + # Parameter false positive (?q != q) + ( + b"""""", + {"search_request_builders": ["link_heuristics"]}, + {"error": "Cannot build a search request template"}, + ), + # Builder parameters #------------------------------------------------# + *( + ( + b""" +
+ + +
+ """, + page_params, + expected, + ) + for page_params, expected in ( + # By default, the popular builder strategy is used, meaning + # that even though the Extruct builder has the highest + # priority, if both the Formasaurus builder and the form + # heuristics builder output the same URL, that one is used + # instead. + ({}, {"url": "https://example.com/form?q={{ query|quote_plus }}"}), + ( + {"search_request_builder_strategy": "popular"}, + {"url": "https://example.com/form?q={{ query|quote_plus }}"}, + ), + ( + {"search_request_builder_strategy": "first"}, + {"url": "https://example.com/metadata?q={{ query|quote_plus }}"}, + ), + # Strategies only take into account the specified builders, and + # in the supplied order. + ( + { + "search_request_builder_strategy": "first", + "search_request_builders": ["formasaurus", "extruct"], + }, + {"url": "https://example.com/form?q={{ query|quote_plus }}"}, + ), + ( + { + "search_request_builder_strategy": "popular", + "search_request_builders": [ + "extruct", + "formasaurus", + "link_heuristics", + ], + }, + {"url": "https://example.com/metadata?q={{ query|quote_plus }}"}, + ), + # Unsupported strategies trigger a ValueError + ( + {"search_request_builder_strategy": "unsupported"}, + ValueError( + "Unsupported search_request_builder_strategy value: 'unsupported'" + ), + ), + ) + ), + ), +) +@ensureDeferred +async def test_search_request_template(html, page_params, expected, caplog): + caplog.clear() + caplog.at_level("ERROR") + + http_response = HttpResponse(url="https://example.com", status=200, body=html) + response = AnyResponse(response=http_response) + search_request_page = DefaultSearchRequestTemplatePage( + response=response, + page_params=PageParams(**page_params), + ) + try: + search_request = await search_request_page.to_item() + except Exception as exception: + assert isinstance(expected, Exception) + assert exception.__class__ == expected.__class__ + assert str(expected) in str(exception) + else: + if "error" in expected: + probability = search_request.get_probability() + assert probability is not None + assert probability <= 0.0 + assert expected["error"] in caplog.text + else: + assert isinstance(expected, dict) + assert expected["url"] == search_request.url + assert expected.get("body", b"") == (search_request.body or b"") + + +@ensureDeferred +async def test_search_request_template_browser(caplog): + """Do not suggest using a browser request if that is already the case.""" + caplog.clear() + caplog.at_level("ERROR") + + browser_response = BrowserResponse( + url="https://example.com", status=200, html="
" + ) + response = AnyResponse(response=browser_response) + search_request_page = DefaultSearchRequestTemplatePage( + response=response, page_params=PageParams() + ) + item = await search_request_page.to_item() + probability = item.get_probability() + assert probability is not None + assert probability <= 0.0 + assert "A quick workaround would be to use" in caplog.text diff --git a/tox.ini b/tox.ini index 3fa9108..a79c93b 100644 --- a/tox.ini +++ b/tox.ini @@ -20,13 +20,18 @@ commands = basepython = python3.9 deps = {[testenv]deps} + extruct==0.18.0 + form2request==0.2.0 + formasaurus==0.10.0 + jmespath==0.9.5 pydantic==2.1 - requests==0.10.1 + requests==1.0.0 scrapy==2.11.0 scrapy-poet==0.24.0 scrapy-spider-metadata==0.2.0 scrapy-zyte-api[provider]==0.23.0 - zyte-common-items==0.23.0 + web-poet==0.17.1 + zyte-common-items==0.25.0 [testenv:mypy] deps = diff --git a/zyte_spider_templates/pages/search_request_template.py b/zyte_spider_templates/pages/search_request_template.py new file mode 100644 index 0000000..f7a3653 --- /dev/null +++ b/zyte_spider_templates/pages/search_request_template.py @@ -0,0 +1,310 @@ +import html +import re +from collections import defaultdict +from logging import getLogger +from random import choice +from string import ascii_letters, digits +from urllib.parse import parse_qs, urlparse + +import attrs +import extruct +import formasaurus +import jmespath +from form2request import form2request +from lxml import etree +from scrapy.http.response.html import HtmlResponse +from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor +from w3lib.url import add_or_replace_parameters +from web_poet import AnyResponse, PageParams, handle_urls +from web_poet.pages import validates_input +from zyte_common_items import SearchRequestTemplate, SearchRequestTemplatePage + +logger = getLogger(__name__) + +# Because Jinja2 syntax gets percent-encoded in a URL, we instead use a +# placeholder made of URL-safe characters, and replace it with Jinja2 code +# after URL encoding. +# +# We use a random placeholder instead of a readable one to minimize risk of +# accidental conflict, and we generate it at run time to minimize risk of +# purposeful conflict. 
+_url_safe_chars = ascii_letters + digits +_PLACEHOLDER = "".join(choice(_url_safe_chars) for _ in range(32)) + + +def _any_http_response_to_scrapy_response(response: AnyResponse) -> HtmlResponse: + kwargs = {} + encoding = getattr(response, "_encoding", None) or "utf-8" + kwargs["encoding"] = encoding + kwargs["headers"] = getattr(response, "headers", {}) + return HtmlResponse( + url=str(response.url), body=response.text, status=response.status, **kwargs + ) + + +@handle_urls("", priority=250) +@attrs.define +class DefaultSearchRequestTemplatePage(SearchRequestTemplatePage): + response: AnyResponse # type: ignore[assignment] + page_params: PageParams + + def _item_from_form_heuristics(self): + form_xpath = """ + //form[ + descendant-or-self::*[ + contains(@action, "search") + or contains(@aria-label, "search") + or contains(@aria-labelledby, "search") + or contains(@class, "search") + or contains(@data-set, "search") + or contains(@formaction, "search") + or contains(@id, "search") + or contains(@role, "search") + or contains(@title, "search") + ] + ] + """ + forms = self.response.xpath(form_xpath) + if not forms: + raise ValueError("No search forms found.") + + field_xpath = """ + descendant::textarea + /@name + | descendant::input[ + not(@type) + or @type[ + not( + re:test( + ., + "^(?:checkbox|image|radio|reset|submit)$", + "i" + ) + ) + ] + ] + /@name + """ + search_query_field = None + for form in forms: + search_query_field = form.xpath(field_xpath).get() + if search_query_field: + break + if not search_query_field: + raise ValueError( + "No search query field found in any potential search form." + ) + data = {search_query_field: _PLACEHOLDER} + try: + request_data = form2request(form, data) + except NotImplementedError: + raise ValueError("form2request does not support the target search form") + return SearchRequestTemplate( + url=request_data.url.replace(_PLACEHOLDER, "{{ query|quote_plus }}"), + method=request_data.method, + headers=request_data.headers, + body=request_data.body.decode().replace( + _PLACEHOLDER, "{{ query|quote_plus }}" + ), + ) + + def _item_from_extruct(self): + metadata = extruct.extract( + self.response.text, + base_url=str(self.response.url), + syntaxes=["json-ld", "microdata"], + ) + query_field = None + for entry in metadata["microdata"]: + if not (actions := entry.get("properties", {}).get("potentialAction", {})): + continue + if not isinstance(actions, list): + actions = [actions] + for action in actions: + if action.get("type") != "https://schema.org/SearchAction": + continue + url_template = jmespath.search( + "properties.target.urlTemplate || properties.target", action + ) + if not url_template: + continue + query_input = action.get("properties", {}).get("query-input", {}) + query_field = query_input.get("valueName", "search_term_string") + break + if query_field: + break + if not query_field: + for entry in metadata["json-ld"]: + action = jmespath.search( + '"@graph"[].potentialAction || isPartOf.potentialAction || potentialAction', + entry, + ) + if not action: + continue + if isinstance(action, list): + action = jmespath.search( + '([?"@type"==`SearchAction`] | [0]) || @', action + ) + if not action or action.get("@type") != "SearchAction": + continue + url_template = jmespath.search("target.urlTemplate || target", action) + if not url_template: + continue + query_input = action.get( + "query-input", "required name=search_term_string" + ) + query_field_match = re.search(r"\bname=(\S+)", query_input) + if query_field_match: + query_field = 
query_field_match[1] + else: + query_field = "search_term_string" + break + if query_field: + break + if not query_field: + raise ValueError( + "Could not find HTML metadata to compose a search request template." + ) + parts = url_template.split("?", maxsplit=1) + parts[0] = parts[0].replace(f"{{{query_field}}}", "{{ query|urlencode }}") + if len(parts) > 1: + parts[1] = parts[1].replace(f"{{{query_field}}}", "{{ query|quote_plus }}") + url = "?".join(parts) + url = str(self.response.urljoin(url)) + url = html.unescape(url) + return SearchRequestTemplate( + url=url, + method="GET", + headers=[], + body="", + ) + + def _item_from_link_heuristics(self): + query_parameters = "|".join( + ( + r"[a-z]?(?:(?:field|search)[_-]?)?key(?:word)?s?", + r"[a-z]?(?:(?:field|search)[_-]?)?query", + r"[a-z]?(?:(?:field|search)[_-]?)?params?", + r"[a-z]?(?:(?:field|search)[_-]?)?terms?", + r"[a-z]?(?:(?:field|search)[_-]?)?text", + r"[a-z]?search", + r"qs?", + r"s", + ) + ) + param_regexp = f"(?i)^(?:{query_parameters})$" + url_regexp = f"(?i)[?&](?:{query_parameters})=(?!$)[^&]" + netloc = urlparse(str(self.response.url)).netloc + scrapy_response = _any_http_response_to_scrapy_response(self.response) + try: + search_links = LxmlLinkExtractor( + allow=url_regexp, allow_domains=netloc + ).extract_links(scrapy_response) + except AttributeError as exception: + raise ValueError(str(exception)) + if not search_links: + raise ValueError(f"No valid search links found on {self.response.url}") + for search_link in search_links: + query_string = urlparse(search_link.url).query + query = parse_qs(query_string) + search_params = set() + for k in query: + if re.search(param_regexp, k): + search_params.add(k) + if not search_params: + continue + url = add_or_replace_parameters( + search_link.url, {k: _PLACEHOLDER for k in search_params} + ) + url = url.replace(_PLACEHOLDER, "{{ query|quote_plus }}") + return SearchRequestTemplate( + url=url, + method="GET", + headers=[], + body="", + ) + raise ValueError(f"No valid search links found on {self.response.url}") + + def _item_from_formasaurus(self): + try: + form, data, submit_button = formasaurus.build_submission( + self.response.selector, + "search", + {"search query": _PLACEHOLDER}, + ) + except AttributeError as exception: + raise ValueError(str(exception)) + if not data: + form_excerpt = etree.tostring(form).decode()[:64] + if len(form_excerpt) >= 64: + form_excerpt = form_excerpt[:-1] + "…" + raise ValueError( + f"Did not find an input field for the search query in " + f"the most likely search form at {self.response.url} " + f"({form_excerpt})."
+ ) + try: + request_data = form2request(form, data, click=submit_button) + except NotImplementedError: + raise ValueError("form2request does not support the target search form") + return SearchRequestTemplate( + url=request_data.url.replace(_PLACEHOLDER, "{{ query|quote_plus }}"), + method=request_data.method, + headers=request_data.headers, + body=request_data.body.decode().replace( + _PLACEHOLDER, "{{ query|quote_plus }}" + ), + ) + + @validates_input + async def to_item(self) -> SearchRequestTemplate: + builders = { + "extruct": self._item_from_extruct, + "formasaurus": self._item_from_formasaurus, + "link_heuristics": self._item_from_link_heuristics, + "form_heuristics": self._item_from_form_heuristics, + } + builder_ids = self.page_params.get("search_request_builders", list(builders)) + builder_strategy = self.page_params.get( + "search_request_builder_strategy", "popular" + ) + if builder_strategy not in {"first", "popular"}: + raise ValueError( + f"Unsupported search_request_builder_strategy value: {builder_strategy!r}" + ) + results = defaultdict(list) + for builder_id in builder_ids: + builder = builders[builder_id] + try: + result = builder() + except ValueError: + continue + if result: + if builder_strategy == "first": + return result + results[(result.url, result.body)].append((builder_id, result)) + if results: + assert builder_strategy == "popular" + top_count = max(len(v) for v in results.values()) + top_results = { + builder_id: result + for result_list in results.values() + for builder_id, result in result_list + if len(result_list) == top_count + } + for builder_id in builder_ids: + if builder_id not in top_results: + continue + return top_results[builder_id] + + logger.error( + f"Cannot build a search request template for " + f"{self.response.url}. A quick workaround would be to use a " + f"search URL as input URL instead of using the search " + f"queries input field. You can also manually implement " + f"search support for a given website " + f"(https://zyte-common-items.readthedocs.io/en/latest/usage/re" + f"quest-templates.html#writing-a-request-template-page-object)" + f"." + ) + return self.no_item_found() diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index 0ef628a..0500ac8 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -108,6 +108,40 @@ class MaxRequestsParam(BaseModel): ) +class SearchQueriesParam(BaseModel): + search_queries: List[str] = Field( + title="Search Queries", + description=( + "A list of search queries, one per line, to submit using the " + "search form found on each input URL." + ), + default_factory=list, + json_schema_extra={ + "default": [], + "widget": "textarea", + }, + ) + + @field_validator("search_queries", mode="before") + @classmethod + def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: + """Validate a list of search queries. + + If a string is received as input, it is split into multiple strings + on new lines. 
+ """ + if isinstance(value, str): + value = value.split("\n") + if not value: + return value + result = [] + for v in value: + if not (v := v.strip()): + continue + result.append(v) + return result + + INPUT_GROUP_FIELDS = ("url", "urls", "urls_file") INPUT_GROUP: JsonDict = { "id": "inputs", diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py index 11e4acf..b4de089 100644 --- a/zyte_spider_templates/spiders/base.py +++ b/zyte_spider_templates/spiders/base.py @@ -15,6 +15,7 @@ ExtractFromParam, GeolocationParam, MaxRequestsParam, + SearchQueriesParam, UrlParam, UrlsFileParam, UrlsParam, @@ -33,6 +34,7 @@ class BaseSpiderParams( ExtractFromParam, MaxRequestsParam, GeolocationParam, + SearchQueriesParam, UrlsFileParam, UrlsParam, UrlParam, @@ -56,6 +58,7 @@ def deprecated(self): ), DeprecationWarning, ) + return self class BaseSpider(scrapy.Spider): diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index db3a5b0..586c364 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -4,19 +4,21 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Union, cast import scrapy -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator from scrapy.crawler import Crawler from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import Args +from web_poet.page_inputs.browser import BrowserResponse from zyte_common_items import ( CustomAttributes, ProbabilityRequest, Product, ProductNavigation, + SearchRequestTemplate, ) from zyte_spider_templates.heuristics import is_homepage -from zyte_spider_templates.params import parse_input_params +from zyte_spider_templates.params import ExtractFrom, parse_input_params from zyte_spider_templates.spiders.base import ( ARG_SETTING_PRIORITY, INPUT_GROUP, @@ -31,6 +33,7 @@ ExtractFromParam, GeolocationParam, MaxRequestsParam, + SearchQueriesParam, UrlParam, UrlsFileParam, UrlsParam, @@ -153,6 +156,7 @@ class EcommerceSpiderParams( MaxRequestsParam, GeolocationParam, EcommerceCrawlStrategyParam, + SearchQueriesParam, UrlsFileParam, UrlsParam, UrlParam, @@ -166,6 +170,20 @@ class EcommerceSpiderParams( }, ) + @model_validator(mode="after") + def validate_direct_item_and_search_queries(self): + if self.search_queries and self.crawl_strategy in { + EcommerceCrawlStrategy.direct_item, + EcommerceCrawlStrategy.full, + EcommerceCrawlStrategy.navigation, + }: + raise ValueError( + f"Cannot combine the {self.crawl_strategy.value!r} value of " + f"the crawl_strategy spider parameter with the search_queries " + f"spider parameter." + ) + return self + class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider): """Yield products from an e-commerce website. 
@@ -246,8 +264,38 @@ def get_start_request(self, url): ) def start_requests(self) -> Iterable[scrapy.Request]: - for url in self.start_urls: - yield self.get_start_request(url) + if self.args.search_queries: + for url in self.start_urls: + meta: Dict[str, Any] = { + "crawling_logs": {"page_type": "searchRequestTemplate"}, + } + if self.args.extract_from == ExtractFrom.browserHtml: + meta["inject"] = [BrowserResponse] + yield scrapy.Request( + url=url, + callback=self.parse_search_request_template, + meta=meta, + ) + else: + for url in self.start_urls: + yield self.get_start_request(url) + + def parse_search_request_template( + self, + response: DummyResponse, + search_request_template: SearchRequestTemplate, + dynamic: DynamicDeps, + ) -> Iterable[scrapy.Request]: + probability = search_request_template.get_probability() + if probability is not None and probability <= 0: + return + for query in self.args.search_queries: + yield search_request_template.request(query=query).to_scrapy( + callback=self.parse_navigation, + meta={ + "crawling_logs": {"page_type": "productNavigation"}, + }, + ) def parse_navigation( self, response: DummyResponse, navigation: ProductNavigation @@ -271,7 +319,10 @@ def parse_navigation( cast(ProbabilityRequest, navigation.nextPage) ) - if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only: + if ( + self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only + and not self.args.search_queries + ): for request in navigation.subCategories or []: yield self.get_subcategory_request(request, page_params=page_params)
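
A minimal sketch of how the search_queries argument added in zyte_spider_templates/params.py normalizes its input: a plain string is split on newlines, entries are stripped, and blank lines are dropped. The query values below are made up for illustration.

    from zyte_spider_templates.params import SearchQueriesParam

    # A string value, e.g. from the textarea widget, is split on newlines.
    params = SearchQueriesParam(search_queries="foo bar\n\n  baz  ")
    print(params.search_queries)  # ['foo bar', 'baz']

    # A list value is kept, aside from stripping and dropping empty entries.
    params = SearchQueriesParam(search_queries=["foo", "", " bar "])
    print(params.search_queries)  # ['foo', 'bar']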
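
A sketch of how a search request template, once built, is expanded into one request per query, mirroring EcommerceSpider.parse_search_request_template. The template URL and queries are hypothetical, and the printed URLs assume the quote_plus Jinja2 filter used throughout the templates above.

    from zyte_common_items import SearchRequestTemplate

    template = SearchRequestTemplate(
        url="https://example.com/search?q={{ query|quote_plus }}"
    )
    for query in ["foo bar", "baz"]:
        # One request per query; the spider sends these with
        # callback=self.parse_navigation.
        request = template.request(query=query)
        print(request.url)
    # https://example.com/search?q=foo+bar
    # https://example.com/search?q=baz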
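
A sketch of exercising DefaultSearchRequestTemplatePage directly, in the style of tests/test_search.py, restricted to the form-heuristics builder through page params. The HTML and URL are made up; the expected output assumes the builder resolves the form action against the response URL as in the tests.

    import asyncio

    from web_poet import AnyResponse, HttpResponse, PageParams

    from zyte_spider_templates.pages.search_request_template import (
        DefaultSearchRequestTemplatePage,
    )

    # Hypothetical search form on a hypothetical page.
    html = b'<form action="/search"><input type="text" name="q"></form>'
    response = AnyResponse(
        response=HttpResponse(url="https://example.com", status=200, body=html)
    )
    page = DefaultSearchRequestTemplatePage(
        response=response,
        page_params=PageParams(search_request_builders=["form_heuristics"]),
    )
    template = asyncio.run(page.to_item())
    print(template.url)  # https://example.com/search?q={{ query|quote_plus }}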