Merge pull request #103 from zytedata/articles_to_main
Job postings + ProductList extraction
kmike authored Dec 16, 2024
2 parents 262e603 + 8e040fa commit 5f5b59f
Showing 27 changed files with 1,848 additions and 145 deletions.
1 change: 0 additions & 1 deletion .github/workflows/test.yml
@@ -7,7 +7,6 @@ on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
test:
3 changes: 3 additions & 0 deletions docs/conf.py
@@ -76,3 +76,6 @@
redirects = {
"customization/page-objects": "pages.html",
}

# workaround for https://github.com/pydantic/pydantic/discussions/7763
import zyte_spider_templates.spiders.job_posting # noqa: F401, E402
1 change: 1 addition & 0 deletions docs/index.rst
@@ -20,6 +20,7 @@ zyte-spider-templates documentation
E-commerce <templates/e-commerce>
Article <templates/article>
Google search <templates/google-search>
Job posting <templates/job-posting>

.. toctree::
:caption: Features
12 changes: 12 additions & 0 deletions docs/reference/api.rst
@@ -13,6 +13,8 @@ Spiders

.. autoclass:: zyte_spider_templates.GoogleSearchSpider

.. autoclass:: zyte_spider_templates.JobPostingSpider


Pages
=====
@@ -58,6 +60,11 @@ Parameter mixins

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceExtractParam
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceExtract

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
:exclude-members: model_computed_fields

@@ -71,6 +78,11 @@ Parameter mixins

.. autoenum:: zyte_spider_templates.spiders.article.ArticleCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.job_posting.JobPostingCrawlStrategyParam
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.job_posting.JobPostingCrawlStrategy


.. _middlewares:

19 changes: 19 additions & 0 deletions docs/templates/job-posting.rst
@@ -0,0 +1,19 @@
.. _job-posting:

=============================================
Job posting spider template (``job_posting``)
=============================================

Basic use
=========

.. code-block:: shell

    scrapy crawl job_posting -a url="https://books.toscrape.com"

Parameters
==========

.. autopydantic_model:: zyte_spider_templates.spiders.job_posting.JobPostingSpiderParams
:inherited-members: BaseModel
:exclude-members: model_computed_fields
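The basic use shown in the new docs invokes the spider through the ``scrapy crawl`` CLI. As a rough illustration only (not part of this commit), the same template can be started from Python; the URL and project settings below are placeholders:

```python
# Illustrative sketch, not from this commit: running the new job_posting
# template from Python instead of "scrapy crawl". Assumes a Scrapy project
# already configured for scrapy-zyte-api/scrapy-poet; the URL is a placeholder.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zyte_spider_templates import JobPostingSpider

process = CrawlerProcess(get_project_settings())
process.crawl(JobPostingSpider, url="https://example.com/jobs")
process.start()
```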
1 change: 1 addition & 0 deletions setup.cfg
@@ -41,3 +41,4 @@ per-file-ignores =
# E731: Ignore "do not assign a lambda expression, use a def" since
# we're using quick shortcuts for the tests
tests/test_ecommerce.py:E731
tests/test_job_posting.py:E731
2 changes: 1 addition & 1 deletion setup.py
@@ -25,7 +25,7 @@
"scrapy>=2.11.0",
"scrapy-poet>=0.24.0",
"scrapy-spider-metadata>=0.2.0",
"scrapy-zyte-api[provider]>=0.23.0",
"scrapy-zyte-api[provider]>=0.24.0",
"web-poet>=0.17.1",
"xtractmime>=0.2.1",
"zyte-common-items>=0.26.2",
9 changes: 6 additions & 3 deletions tests/__init__.py
@@ -1,19 +1,22 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Type

import pytest
from scrapy import Spider
from scrapy.utils.test import TestSpider

# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting
pytest.register_assert_rewrite("tests.utils")


# scrapy.utils.test.get_crawler alternative that does not freeze settings.
def get_crawler(*, settings: Optional[Dict[str, Any]] = None):
def get_crawler(
*, settings: Optional[Dict[str, Any]] = None, spider_cls: Type[Spider] = TestSpider
):
from scrapy.crawler import CrawlerRunner

settings = settings or {}
# Set by default settings that prevent deprecation warnings.
settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7"
runner = CrawlerRunner(settings)
crawler = runner.create_crawler(TestSpider)
crawler = runner.create_crawler(spider_cls)
return crawler
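The new ``spider_cls`` argument lets a test build the crawler around any spider class instead of the default ``TestSpider``. A minimal sketch of how a test might call it, with a placeholder spider and settings:

```python
# Illustrative sketch, not from this commit: using the new spider_cls
# parameter of tests.get_crawler. The spider and settings are placeholders.
from scrapy import Spider

from tests import get_crawler


class FakeJobPostingSpider(Spider):
    name = "fake_job_posting"


crawler = get_crawler(
    settings={"INCREMENTAL_CRAWL_ENABLED": True},
    spider_cls=FakeJobPostingSpider,
)
```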
9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,9 @@
import pytest


@pytest.fixture(scope="session")
def mockserver():
from .mockserver import MockServer

with MockServer() as server:
yield server
12 changes: 7 additions & 5 deletions tests/incremental/test_collection_fp_manager.py
@@ -1,8 +1,10 @@
from asyncio import ensure_future
from unittest.mock import MagicMock, patch

import pytest
from scrapy.statscollectors import StatsCollector
from scrapy.utils.request import RequestFingerprinter
from twisted.internet.defer import Deferred, inlineCallbacks

from tests import get_crawler
from zyte_spider_templates._incremental.manager import CollectionsFingerprintsManager
@@ -43,8 +45,8 @@ def crawler_for_incremental():
],
)
@patch("scrapinghub.ScrapinghubClient")
@pytest.mark.asyncio
async def test_get_existing_fingerprints(
@inlineCallbacks
def test_get_existing_fingerprints(
mock_scrapinghub_client,
batch_size,
fingerprints,
@@ -68,10 +70,10 @@ async def test_get_existing_fingerprints(
mock_manager.get_keys_from_collection = MagicMock(return_value=keys_in_collection) # type: ignore
mock_manager.batch = fingerprints_batch

assert (
await mock_manager.get_existing_fingerprints_async(fingerprints)
== expected_result
r = yield Deferred.fromFuture(
ensure_future(mock_manager.get_existing_fingerprints_async(fingerprints))
)
assert r == expected_result


@pytest.mark.parametrize(
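The hunk above replaces ``pytest.mark.asyncio`` with Twisted's ``inlineCallbacks``, bridging the coroutine through ``Deferred.fromFuture``. A stripped-down sketch of that pattern, with a stand-in coroutine rather than the real manager call, and assuming the test run uses an asyncio-based Twisted reactor so a running event loop is available:

```python
# Illustrative sketch of the asyncio-to-Twisted bridge used above; fake_lookup
# stands in for get_existing_fingerprints_async.
from asyncio import ensure_future

from twisted.internet.defer import Deferred, inlineCallbacks


async def fake_lookup() -> set:
    return {"fingerprint-1"}


@inlineCallbacks
def test_fake_lookup():
    result = yield Deferred.fromFuture(ensure_future(fake_lookup()))
    assert result == {"fingerprint-1"}
```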
5 changes: 3 additions & 2 deletions tests/incremental/test_incremental_manager.py
@@ -1,6 +1,7 @@
from unittest.mock import patch

import pytest
from pytest_twisted import ensureDeferred
from scrapy.statscollectors import StatsCollector
from scrapy.utils.request import RequestFingerprinter
from zyte_common_items import Article
@@ -264,7 +265,7 @@ def crawler_for_incremental():
), # Three Requests and one Item with redirected URL in the result, one existing fingerprint in the cache
],
)
@pytest.mark.asyncio
@ensureDeferred
async def test_process_incremental(
mock_scrapinghub_client,
input_request,
@@ -289,7 +290,7 @@ async def test_process_incremental(


@patch("scrapinghub.ScrapinghubClient")
@pytest.mark.asyncio
@ensureDeferred
async def test_process_incremental_several_items(
mock_scrapinghub_client,
):
3 changes: 2 additions & 1 deletion tests/incremental/test_middleware.py
@@ -1,6 +1,7 @@
from unittest.mock import patch

import pytest
from pytest_twisted import ensureDeferred
from scrapy.exceptions import CloseSpider, NotConfigured
from scrapy.http import Request, Response
from scrapy.settings import Settings
@@ -67,7 +68,7 @@ def test_prepare_manager_with_collection_fp_failure(caplog):


@patch("scrapinghub.ScrapinghubClient")
@pytest.mark.asyncio
@ensureDeferred
async def test_middleware_process_spider_output(mock_scrapinghub_client):
crawler = crawler_for_incremental()
crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True})
178 changes: 178 additions & 0 deletions tests/mockserver.py
@@ -0,0 +1,178 @@
import argparse
import json
import socket
import sys
import time
from importlib import import_module
from subprocess import PIPE, Popen
from typing import Any, Dict

from scrapy_zyte_api.responses import _API_RESPONSE
from twisted.internet import reactor
from twisted.web.resource import Resource
from twisted.web.server import Site


def get_ephemeral_port():
s = socket.socket()
s.bind(("", 0))
return s.getsockname()[1]


class DefaultResource(Resource):
"""Mock server to fake Zyte API responses.
To use, include the mockserver fixture in the signature of your test, and
point the ZYTE_API_URL setting to the mock server. See
``tests/test_ecommerce.py::test_crawl_strategies`` for an example.
This mock server is designed to fake a website with the following pages:
```
https://example.com/
https://example.com/page/2
https://example.com/category/1
https://example.com/category/1/page/2
https://example.com/non-navigation
```
When browserHtml is requested (for any URL, listed above or not), it is
a minimal HTML with an anchor tag pointing to
https://example.com/non-navigation.
When productNavigation is requested, nextPage and subCategories are filled
accordingly. productNavigation.items always has 2 product URLs, which are
the result of appending ``/product/<n>`` to the request URL.
https://example.com/non-navigation is not reachable through
productNavigation.
When product or productList is requested, an item with the current URL is
always returned.
All output also includes unsupported links (mailto:…).
"""

def getChild(self, path, request):
return self

def render_POST(self, request):
request_data = json.loads(request.content.read())
request.responseHeaders.setRawHeaders(
b"Content-Type",
[b"application/json"],
)
request.responseHeaders.setRawHeaders(
b"request-id",
[b"abcd1234"],
)

response_data: _API_RESPONSE = {}

response_data["url"] = request_data["url"]

non_navigation_url = "https://example.com/non-navigation"
html = f"""<html><body><a href="{non_navigation_url}"></a><a href="mailto:[email protected]"></a></body></html>"""
if request_data.get("browserHtml", False) is True:
response_data["browserHtml"] = html

if request_data.get("product", False) is True:
response_data["product"] = {
"url": request_data["url"],
}

if request_data.get("productList", False) is True:
response_data["productList"] = {
"url": request_data["url"],
}

if request_data.get("productNavigation", False) is True:
kwargs: Dict[str, Any] = {}
if (
"/page/" not in request_data["url"]
and "/non-navigation" not in request_data["url"]
):
kwargs["nextPage"] = {
"url": f"{request_data['url'].rstrip('/')}/page/2"
}
if "/category/" not in request_data["url"]:
kwargs["subCategories"] = [
{"url": "mailto:[email protected]"},
{"url": f"{request_data['url'].rstrip('/')}/category/1"},
]
else:
kwargs["nextPage"] = {"url": "mailto:[email protected]"}
response_data["productNavigation"] = {
"url": request_data["url"],
"items": [
{"url": "mailto:[email protected]"},
{"url": f"{request_data['url'].rstrip('/')}/product/1"},
{"url": f"{request_data['url'].rstrip('/')}/product/2"},
],
**kwargs,
}

return json.dumps(response_data).encode()


class MockServer:
def __init__(self, resource=None, port=None):
resource = resource or DefaultResource
self.resource = "{}.{}".format(resource.__module__, resource.__name__)
self.proc = None
self.host = socket.gethostbyname(socket.gethostname())
self.port = port or get_ephemeral_port()
self.root_url = "http://%s:%d" % (self.host, self.port)

def __enter__(self):
self.proc = Popen(
[
sys.executable,
"-u",
"-m",
"tests.mockserver",
self.resource,
"--port",
str(self.port),
],
stdout=PIPE,
)
assert self.proc.stdout is not None
self.proc.stdout.readline()
return self

def __exit__(self, exc_type, exc_value, traceback):
assert self.proc is not None
self.proc.kill()
self.proc.wait()
time.sleep(0.2)

def urljoin(self, path):
return self.root_url + path


def main():
parser = argparse.ArgumentParser()
parser.add_argument("resource")
parser.add_argument("--port", type=int)
args = parser.parse_args()
module_name, name = args.resource.rsplit(".", 1)
sys.path.append(".")
resource = getattr(import_module(module_name), name)()
# Typing issue: https://github.com/twisted/twisted/issues/9909
http_port = reactor.listenTCP(args.port, Site(resource)) # type: ignore[attr-defined]

def print_listening():
host = http_port.getHost()
print(
"Mock server {} running at http://{}:{}".format(
resource, host.host, host.port
)
)

# Typing issue: https://github.com/twisted/twisted/issues/9909
reactor.callWhenRunning(print_listening) # type: ignore[attr-defined]
reactor.run() # type: ignore[attr-defined]


if __name__ == "__main__":
main()
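As the ``DefaultResource`` docstring notes, tests take the ``mockserver`` fixture and point the ``ZYTE_API_URL`` setting at it. A hedged sketch of that wiring follows; the spider choice, API key, and addon registration are assumptions for illustration, not copied from the repository's tests (see ``tests/test_ecommerce.py::test_crawl_strategies`` for the real usage):

```python
# Illustrative sketch, not from this commit: a test that routes Zyte API
# requests to the mock server started by the mockserver fixture.
from pytest_twisted import ensureDeferred

from tests import get_crawler
from zyte_spider_templates import EcommerceSpider


@ensureDeferred
async def test_with_mockserver(mockserver):
    settings = {
        "ZYTE_API_URL": mockserver.urljoin("/"),  # send API calls to the mock
        "ZYTE_API_KEY": "a",  # placeholder key
        "ADDONS": {"scrapy_zyte_api.Addon": 500},  # assumed addon setup
    }
    crawler = get_crawler(settings=settings, spider_cls=EcommerceSpider)
    await crawler.crawl(url="https://example.com/")
    # Assertions on crawler.stats or collected items would go here.
```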