Merge pull request #103 from zytedata/articles_to_main
Job postings + ProductList extraction
kmike authored Dec 16, 2024
2 parents 262e603 + 8e040fa commit 5f5b59f
Showing 27 changed files with 1,848 additions and 145 deletions.
1 change: 0 additions & 1 deletion .github/workflows/test.yml
@@ -7,7 +7,6 @@ on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
test:
3 changes: 3 additions & 0 deletions docs/conf.py
@@ -76,3 +76,6 @@
redirects = {
"customization/page-objects": "pages.html",
}

# workaround for https://github.com/pydantic/pydantic/discussions/7763
import zyte_spider_templates.spiders.job_posting # noqa: F401, E402
1 change: 1 addition & 0 deletions docs/index.rst
@@ -20,6 +20,7 @@ zyte-spider-templates documentation
E-commerce <templates/e-commerce>
Article <templates/article>
Google search <templates/google-search>
Job posting <templates/job-posting>

.. toctree::
:caption: Features
12 changes: 12 additions & 0 deletions docs/reference/api.rst
@@ -13,6 +13,8 @@ Spiders

.. autoclass:: zyte_spider_templates.GoogleSearchSpider

.. autoclass:: zyte_spider_templates.JobPostingSpider


Pages
=====
@@ -58,6 +60,11 @@ Parameter mixins

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceExtractParam
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceExtract

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
:exclude-members: model_computed_fields

@@ -71,6 +78,11 @@ Parameter mixins

.. autoenum:: zyte_spider_templates.spiders.article.ArticleCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.job_posting.JobPostingCrawlStrategyParam
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.job_posting.JobPostingCrawlStrategy


.. _middlewares:

19 changes: 19 additions & 0 deletions docs/templates/job-posting.rst
@@ -0,0 +1,19 @@
.. _job-posting:

=============================================
Job posting spider template (``job_posting``)
=============================================

Basic use
=========

.. code-block:: shell

    scrapy crawl job_posting -a url="https://books.toscrape.com"

Parameters
==========

.. autopydantic_model:: zyte_spider_templates.spiders.job_posting.JobPostingSpiderParams
:inherited-members: BaseModel
:exclude-members: model_computed_fields
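The basic use shown in the new docs invokes the spider through the ``scrapy crawl`` CLI. As a rough illustration only (not part of this commit), the same template can be started from Python; the URL and project settings below are placeholders:

```python
# Illustrative sketch, not from this commit: running the new job_posting
# template from Python instead of "scrapy crawl". Assumes a Scrapy project
# already configured for scrapy-zyte-api/scrapy-poet; the URL is a placeholder.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zyte_spider_templates import JobPostingSpider

process = CrawlerProcess(get_project_settings())
process.crawl(JobPostingSpider, url="https://example.com/jobs")
process.start()
```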
1 change: 1 addition & 0 deletions setup.cfg
@@ -41,3 +41,4 @@ per-file-ignores =
# E731: Ignore "do not assign a lambda expression, use a def" since
# we're using quick shortcuts for the tests
tests/test_ecommerce.py:E731
tests/test_job_posting.py:E731
2 changes: 1 addition & 1 deletion setup.py
@@ -25,7 +25,7 @@
"scrapy>=2.11.0",
"scrapy-poet>=0.24.0",
"scrapy-spider-metadata>=0.2.0",
"scrapy-zyte-api[provider]>=0.23.0",
"scrapy-zyte-api[provider]>=0.24.0",
"web-poet>=0.17.1",
"xtractmime>=0.2.1",
"zyte-common-items>=0.26.2",
9 changes: 6 additions & 3 deletions tests/__init__.py
@@ -1,19 +1,22 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Type

import pytest
from scrapy import Spider
from scrapy.utils.test import TestSpider

# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#assertion-rewriting
pytest.register_assert_rewrite("tests.utils")


# scrapy.utils.test.get_crawler alternative that does not freeze settings.
def get_crawler(*, settings: Optional[Dict[str, Any]] = None):
def get_crawler(
*, settings: Optional[Dict[str, Any]] = None, spider_cls: Type[Spider] = TestSpider
):
from scrapy.crawler import CrawlerRunner

settings = settings or {}
# Set by default settings that prevent deprecation warnings.
settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7"
runner = CrawlerRunner(settings)
crawler = runner.create_crawler(TestSpider)
crawler = runner.create_crawler(spider_cls)
return crawler
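The new ``spider_cls`` argument lets a test build the crawler around any spider class instead of the default ``TestSpider``. A minimal sketch of how a test might call it, with a placeholder spider and settings:

```python
# Illustrative sketch, not from this commit: using the new spider_cls
# parameter of tests.get_crawler. The spider and settings are placeholders.
from scrapy import Spider

from tests import get_crawler


class FakeJobPostingSpider(Spider):
    name = "fake_job_posting"


crawler = get_crawler(
    settings={"INCREMENTAL_CRAWL_ENABLED": True},
    spider_cls=FakeJobPostingSpider,
)
```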
9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,9 @@
import pytest


@pytest.fixture(scope="session")
def mockserver():
from .mockserver import MockServer

with MockServer() as server:
yield server
12 changes: 7 additions & 5 deletions tests/incremental/test_collection_fp_manager.py
@@ -1,8 +1,10 @@
from asyncio import ensure_future
from unittest.mock import MagicMock, patch

import pytest
from scrapy.statscollectors import StatsCollector
from scrapy.utils.request import RequestFingerprinter
from twisted.internet.defer import Deferred, inlineCallbacks

from tests import get_crawler
from zyte_spider_templates._incremental.manager import CollectionsFingerprintsManager
@@ -43,8 +45,8 @@ def crawler_for_incremental():
],
)
@patch("scrapinghub.ScrapinghubClient")
@pytest.mark.asyncio
async def test_get_existing_fingerprints(
@inlineCallbacks
def test_get_existing_fingerprints(
mock_scrapinghub_client,
batch_size,
fingerprints,
@@ -68,10 +70,10 @@ async def test_get_existing_fingerprints(
mock_manager.get_keys_from_collection = MagicMock(return_value=keys_in_collection) # type: ignore
mock_manager.batch = fingerprints_batch

assert (
await mock_manager.get_existing_fingerprints_async(fingerprints)
== expected_result
r = yield Deferred.fromFuture(
ensure_future(mock_manager.get_existing_fingerprints_async(fingerprints))
)
assert r == expected_result


@pytest.mark.parametrize(
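The hunk above replaces ``pytest.mark.asyncio`` with Twisted's ``inlineCallbacks``, bridging the coroutine through ``Deferred.fromFuture``. A stripped-down sketch of that pattern, with a stand-in coroutine rather than the real manager call, and assuming the test run uses an asyncio-based Twisted reactor so a running event loop is available:

```python
# Illustrative sketch of the asyncio-to-Twisted bridge used above; fake_lookup
# stands in for get_existing_fingerprints_async.
from asyncio import ensure_future

from twisted.internet.defer import Deferred, inlineCallbacks


async def fake_lookup() -> set:
    return {"fingerprint-1"}


@inlineCallbacks
def test_fake_lookup():
    result = yield Deferred.fromFuture(ensure_future(fake_lookup()))
    assert result == {"fingerprint-1"}
```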
5 changes: 3 additions & 2 deletions tests/incremental/test_incremental_manager.py
@@ -1,6 +1,7 @@
from unittest.mock import patch

import pytest
from pytest_twisted import ensureDeferred
from scrapy.statscollectors import StatsCollector
from scrapy.utils.request import RequestFingerprinter
from zyte_common_items import Article
@@ -264,7 +265,7 @@ def crawler_for_incremental():
), # Three Requests and one Item with redirected URL in the result, one existing fingerprint in the cache
],
)
@pytest.mark.asyncio
@ensureDeferred
async def test_process_incremental(
mock_scrapinghub_client,
input_request,
@@ -289,7 +290,7 @@ async def test_process_incremental(


@patch("scrapinghub.ScrapinghubClient")
@pytest.mark.asyncio
@ensureDeferred
async def test_process_incremental_several_items(
mock_scrapinghub_client,
):
3 changes: 2 additions & 1 deletion tests/incremental/test_middleware.py
@@ -1,6 +1,7 @@
from unittest.mock import patch

import pytest
from pytest_twisted import ensureDeferred
from scrapy.exceptions import CloseSpider, NotConfigured
from scrapy.http import Request, Response
from scrapy.settings import Settings
@@ -67,7 +68,7 @@ def test_prepare_manager_with_collection_fp_failure(caplog):


@patch("scrapinghub.ScrapinghubClient")
@pytest.mark.asyncio
@ensureDeferred
async def test_middleware_process_spider_output(mock_scrapinghub_client):
crawler = crawler_for_incremental()
crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True})
178 changes: 178 additions & 0 deletions tests/mockserver.py
@@ -0,0 +1,178 @@
import argparse
import json
import socket
import sys
import time
from importlib import import_module
from subprocess import PIPE, Popen
from typing import Any, Dict

from scrapy_zyte_api.responses import _API_RESPONSE
from twisted.internet import reactor
from twisted.web.resource import Resource
from twisted.web.server import Site


def get_ephemeral_port():
s = socket.socket()
s.bind(("", 0))
return s.getsockname()[1]


class DefaultResource(Resource):
"""Mock server to fake Zyte API responses.
To use, include the mockserver fixture in the signature of your test, and
point the ZYTE_API_URL setting to the mock server. See
``tests/test_ecommerce.py::test_crawl_strategies`` for an example.
This mock server is designed to fake a website with the following pages:
```
https://example.com/
https://example.com/page/2
https://example.com/category/1
https://example.com/category/1/page/2
https://example.com/non-navigation
```
When browserHtml is requested (for any URL, listed above or not), it is
a minimal HTML with an anchor tag pointing to
https://example.com/non-navigation.
When productNavigation is requested, nextPage and subCategories are filled
accordingly. productNavigation.items always has 2 product URLs, which are
the result of appending ``/product/<n>`` to the request URL.
https://example.com/non-navigation is not reachable through
productNavigation.
When product or productList is requested, an item with the current URL is
always returned.
All output also includes unsupported links (mailto:…).
"""

def getChild(self, path, request):
return self

def render_POST(self, request):
request_data = json.loads(request.content.read())
request.responseHeaders.setRawHeaders(
b"Content-Type",
[b"application/json"],
)
request.responseHeaders.setRawHeaders(
b"request-id",
[b"abcd1234"],
)

response_data: _API_RESPONSE = {}

response_data["url"] = request_data["url"]

non_navigation_url = "https://example.com/non-navigation"
html = f"""<html><body><a href="{non_navigation_url}"></a><a href="mailto:[email protected]"></a></body></html>"""
if request_data.get("browserHtml", False) is True:
response_data["browserHtml"] = html

if request_data.get("product", False) is True:
response_data["product"] = {
"url": request_data["url"],
}

if request_data.get("productList", False) is True:
response_data["productList"] = {
"url": request_data["url"],
}

if request_data.get("productNavigation", False) is True:
kwargs: Dict[str, Any] = {}
if (
"/page/" not in request_data["url"]
and "/non-navigation" not in request_data["url"]
):
kwargs["nextPage"] = {
"url": f"{request_data['url'].rstrip('/')}/page/2"
}
if "/category/" not in request_data["url"]:
kwargs["subCategories"] = [
{"url": "mailto:[email protected]"},
{"url": f"{request_data['url'].rstrip('/')}/category/1"},
]
else:
kwargs["nextPage"] = {"url": "mailto:[email protected]"}
response_data["productNavigation"] = {
"url": request_data["url"],
"items": [
{"url": "mailto:[email protected]"},
{"url": f"{request_data['url'].rstrip('/')}/product/1"},
{"url": f"{request_data['url'].rstrip('/')}/product/2"},
],
**kwargs,
}

return json.dumps(response_data).encode()


class MockServer:
def __init__(self, resource=None, port=None):
resource = resource or DefaultResource
self.resource = "{}.{}".format(resource.__module__, resource.__name__)
self.proc = None
self.host = socket.gethostbyname(socket.gethostname())
self.port = port or get_ephemeral_port()
self.root_url = "http://%s:%d" % (self.host, self.port)

def __enter__(self):
self.proc = Popen(
[
sys.executable,
"-u",
"-m",
"tests.mockserver",
self.resource,
"--port",
str(self.port),
],
stdout=PIPE,
)
assert self.proc.stdout is not None
self.proc.stdout.readline()
return self

def __exit__(self, exc_type, exc_value, traceback):
assert self.proc is not None
self.proc.kill()
self.proc.wait()
time.sleep(0.2)

def urljoin(self, path):
return self.root_url + path


def main():
parser = argparse.ArgumentParser()
parser.add_argument("resource")
parser.add_argument("--port", type=int)
args = parser.parse_args()
module_name, name = args.resource.rsplit(".", 1)
sys.path.append(".")
resource = getattr(import_module(module_name), name)()
# Typing issue: https://github.com/twisted/twisted/issues/9909
http_port = reactor.listenTCP(args.port, Site(resource)) # type: ignore[attr-defined]

def print_listening():
host = http_port.getHost()
print(
"Mock server {} running at http://{}:{}".format(
resource, host.host, host.port
)
)

# Typing issue: https://github.com/twisted/twisted/issues/9909
reactor.callWhenRunning(print_listening) # type: ignore[attr-defined]
reactor.run() # type: ignore[attr-defined]


if __name__ == "__main__":
main()
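As the ``DefaultResource`` docstring notes, tests take the ``mockserver`` fixture and point the ``ZYTE_API_URL`` setting at it. A hedged sketch of that wiring follows; the spider choice, API key, and addon registration are assumptions for illustration, not copied from the repository's tests (see ``tests/test_ecommerce.py::test_crawl_strategies`` for the real usage):

```python
# Illustrative sketch, not from this commit: a test that routes Zyte API
# requests to the mock server started by the mockserver fixture.
from pytest_twisted import ensureDeferred

from tests import get_crawler
from zyte_spider_templates import EcommerceSpider


@ensureDeferred
async def test_with_mockserver(mockserver):
    settings = {
        "ZYTE_API_URL": mockserver.urljoin("/"),  # send API calls to the mock
        "ZYTE_API_KEY": "a",  # placeholder key
        "ADDONS": {"scrapy_zyte_api.Addon": 500},  # assumed addon setup
    }
    crawler = get_crawler(settings=settings, spider_cls=EcommerceSpider)
    await crawler.crawl(url="https://example.com/")
    # Assertions on crawler.stats or collected items would go here.
```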