diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ab3146fb..556f0b50 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,21 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + include: + - python-version: "3.7" + toxenv: "min" + - python-version: "3.7" + toxenv: "asyncio-min" + + - python-version: "3.8" + toxenv: "py" + - python-version: "3.9" + toxenv: "py" + + - python-version: "3.10" + toxenv: "py" + - python-version: "3.10" + toxenv: "asyncio" steps: - uses: actions/checkout@v2 @@ -29,6 +43,8 @@ jobs: python -m pip install --upgrade pip python -m pip install tox - name: tox + env: + TOXENV: ${{ matrix.toxenv }} run: | tox -e py - name: coverage diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3bd7b8d8..275b29b9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,12 @@ TBR --- * Use the new ``web_poet.HttpResponse`` which replaces ``web_poet.ResponseData``. +* Support for the new features in ``web_poet>=0.2.0`` for supporting additional + requests inside Page Objects: + + * Created new providers for ``web_poet.PageParams`` and + ``web_poet.HttpClient``. + * The minimum Scrapy version is now ``2.6.0``. * We have these **backward incompatible** changes since the ``web_poet.OverrideRule`` follow a different structure: @@ -15,6 +21,8 @@ TBR * This resuls in a newer format in the ``SCRAPY_POET_OVERRIDES`` setting. * Removal of this deprecated module: ``scrapy.utils.reqser`` +* add ``async`` support for ``callback_for``. + 0.3.0 (2022-01-28) ------------------ diff --git a/README.rst b/README.rst index 919d31ab..d305724b 100644 --- a/README.rst +++ b/README.rst @@ -36,3 +36,27 @@ License is BSD 3-clause. * Issue tracker: https://github.com/scrapinghub/scrapy-poet/issues .. _`web-poet`: https://github.com/scrapinghub/web-poet + + +Quick Start +*********** + +Installation +============ + +.. code-block:: + + pip install scrapy-poet + +Requires **Python 3.7+** and **Scrapy >= 2.6.0**. + +Usage in a Scrapy Project +========================= + +Add the following inside Scrapy's ``settings.py`` file: + +.. code-block:: python + + DOWNLOADER_MIDDLEWARES = { + "scrapy_poet.InjectionMiddleware": 543, + } diff --git a/docs/conf.py b/docs/conf.py index 2e205d04..84466f9d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. diff --git a/docs/index.rst b/docs/index.rst index 30c17ff4..1271bbe5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,7 +35,8 @@ To get started, see :ref:`intro-install` and :ref:`intro-tutorial`. :maxdepth: 1 intro/install - intro/tutorial + intro/basic-tutorial + intro/advanced-tutorial .. toctree:: :caption: Advanced diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst new file mode 100644 index 00000000..a1fd2fa5 --- /dev/null +++ b/docs/intro/advanced-tutorial.rst @@ -0,0 +1,168 @@ +.. _`intro-advanced-tutorial`: + +================= +Advanced Tutorial +================= + +This section intends to go over the supported features in **web-poet** by +**scrapy-poet**: + + * ``web_poet.HttpClient`` + * ``web_poet.PageParams`` + +These are mainly achieved by **scrapy-poet** implementing **providers** for them: + + * :class:`scrapy_poet.page_input_providers.HttpClientProvider` + * :class:`scrapy_poet.page_input_providers.PageParamsProvider` + +.. _`intro-additional-requests`: + +Additional Requests +=================== + +Using Page Objects using additional requests doesn't need anything special from +the spider. It would work as-is because of the readily available +:class:`scrapy_poet.page_input_providers.HttpClientProvider` that is enabled +out of the box. + +This supplies the Page Object with the necessary ``web_poet.HttpClient`` instance. + +The HTTP client implementation that **scrapy-poet** provides to +``web_poet.HttpClient`` handles requests as follows: + +- Requests go through downloader middlewares, but they do not go through + spider middlewares or through the scheduler. + +- Duplicate requests are not filtered out. + +- In line with the web-poet specification for additional requests, + ``Request.meta['dont_redirect']`` is set to ``True`` for requests with the + ``HEAD`` HTTP method. + +Suppose we have the following Page Object: + +.. code-block:: python + + import attr + import web_poet + + + @attr.define + class ProductPage(web_poet.ItemWebPage): + http_client: web_poet.HttpClient + + async def to_item(self): + item = { + "url": self.url, + "name": self.css("#main h3.name ::text").get(), + "product_id": self.css("#product ::attr(product-id)").get(), + } + + # Simulates clicking on a button that says "View All Images" + response: web_poet.HttpResponse = await self.http_client.get( + f"https://api.example.com/v2/images?id={item['product_id']}" + ) + item["images"] = response.css(".product-images img::attr(src)").getall() + return item + + +It can be directly used inside the spider as: + +.. code-block:: python + + import scrapy + + + class ProductSpider(scrapy.Spider): + + custom_settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, + } + } + + start_urls = [ + "https://example.com/category/product/item?id=123", + "https://example.com/category/product/item?id=989", + ] + + async def parse(self, response, page: ProductPage): + return await page.to_item() + +Note that we needed to update the ``parse()`` method to be an ``async`` method, +since the ``to_item()`` method of the Page Object we're using is an ``async`` +method as well. + + +Page params +=========== + +Using ``web_poet.PageParams`` allows the Scrapy spider to pass any arbitrary +information into the Page Object. + +Suppose we update the earlier Page Object to control the additional request. +This basically acts as a switch to update the behavior of the Page Object: + +.. code-block:: python + + import attr + import web_poet + + + @attr.define + class ProductPage(web_poet.ItemWebPage): + http_client: web_poet.HttpClient + page_params: web_poet.PageParams + + async def to_item(self): + item = { + "url": self.url, + "name": self.css("#main h3.name ::text").get(), + "product_id": self.css("#product ::attr(product-id)").get(), + } + + # Simulates clicking on a button that says "View All Images" + if self.page_params.get("enable_extracting_all_images") + response: web_poet.HttpResponse = await self.http_client.get( + f"https://api.example.com/v2/images?id={item['product_id']}" + ) + item["images"] = response.css(".product-images img::attr(src)").getall() + + return item + +Passing the ``enable_extracting_all_images`` page parameter from the spider +into the Page Object can be achieved by using **Scrapy's** ``Request.meta`` +attribute. Specifically, any ``dict`` value inside the ``page_params`` +parameter inside **Scrapy's** ``Request.meta`` will be passed into +``web_poet.PageParams``. + +Let's see it in action: + +.. code-block:: python + + import scrapy + + + class ProductSpider(scrapy.Spider): + + custom_settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, + } + } + + start_urls = [ + "https://example.com/category/product/item?id=123", + "https://example.com/category/product/item?id=989", + ] + + def start_requests(self): + for url in start_urls: + yield scrapy.Request( + url=url, + callback=self.parse, + meta={"page_params": {"enable_extracting_all_images": True}} + ) + + async def parse(self, response, page: ProductPage): + return await page.to_item() diff --git a/docs/intro/tutorial.rst b/docs/intro/basic-tutorial.rst similarity index 92% rename from docs/intro/tutorial.rst rename to docs/intro/basic-tutorial.rst index 2be5d731..6a37c548 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -1,8 +1,8 @@ -.. _`intro-tutorial`: +.. _`intro-basic-tutorial`: -======== -Tutorial -======== +============== +Basic Tutorial +============== In this tutorial, we’ll assume that ``scrapy-poet`` is already installed on your system. If that’s not the case, see :ref:`intro-install`. @@ -198,6 +198,42 @@ returning the result of the ``to_item`` method call. We could use ``response.follow_all(links, callback_for(BookPage))``, without creating an attribute, but currently it won't work with Scrapy disk queues. +.. tip:: + + :func:`~.callback_for` also supports `async generators`. So if we have the + following: + + .. code-block:: python + + class BooksSpider(scrapy.Spider): + name = 'books' + start_urls = ['http://books.toscrape.com/'] + + def parse(self, response): + links = response.css('.image_container a') + yield from response.follow_all(links, self.parse_book) + + async def parse_book(self, response: DummyResponse, page: BookPage): + yield await page.to_item() + + It could be turned into: + + .. code-block:: python + + class BooksSpider(scrapy.Spider): + name = 'books' + start_urls = ['http://books.toscrape.com/'] + + def parse(self, response): + links = response.css('.image_container a') + yield from response.follow_all(links, self.parse_book) + + parse_book = callback_for(BookPage) + + This is useful when the Page Objects uses additional requests, which rely + heavily on ``async/await`` syntax. More info on this in this tutorial + section: :ref:`intro-additional-requests`. + Final result ============ diff --git a/docs/intro/install.rst b/docs/intro/install.rst index 5cec1f17..f3d6187e 100644 --- a/docs/intro/install.rst +++ b/docs/intro/install.rst @@ -16,7 +16,7 @@ If you’re already familiar with installation of Python packages, you can insta pip install scrapy-poet -Scrapy 2.1.0 or above is required and it has to be installed separately. +Scrapy 2.6.0 or above is required and it has to be installed separately. Things that are good to know ============================ diff --git a/docs/requirements.txt b/docs/requirements.txt index e6337937..99443b22 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ -Scrapy >= 2.1.0 +Scrapy >= 2.6.0 Sphinx >= 3.0.3 sphinx-rtd-theme >= 0.4 diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index 53454809..d09259b5 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -1,4 +1,5 @@ from typing import Callable, Optional, Type +from inspect import iscoroutinefunction from scrapy.http import Request, Response @@ -55,6 +56,38 @@ def parse_book(self, response: DummyResponse, page: BookPage): It allows to write this: + .. code-block:: python + + class BooksSpider(scrapy.Spider): + name = 'books' + start_urls = ['http://books.toscrape.com/'] + + def parse(self, response): + links = response.css('.image_container a') + yield from response.follow_all(links, self.parse_book) + + parse_book = callback_for(BookPage) + + It also supports producing an async generator callable if the Page Objects's + ``to_item()`` method is a coroutine which uses the ``async/await`` syntax. + + So if we have the following: + + .. code-block:: python + + class BooksSpider(scrapy.Spider): + name = 'books' + start_urls = ['http://books.toscrape.com/'] + + def parse(self, response): + links = response.css('.image_container a') + yield from response.follow_all(links, self.parse_book) + + async def parse_book(self, response: DummyResponse, page: BookPage): + yield await page.to_item() + + It could be turned into: + .. code-block:: python class BooksSpider(scrapy.Spider): @@ -90,5 +123,12 @@ def parse(self, response): def parse(*args, page: page_cls, **kwargs): # type: ignore yield page.to_item() # type: ignore + async def async_parse(*args, page: page_cls, **kwargs): # type: ignore + yield await page.to_item() # type: ignore + + if iscoroutinefunction(page_cls.to_item): + setattr(async_parse, _CALLBACK_FOR_MARKER, True) + return async_parse + setattr(parse, _CALLBACK_FOR_MARKER, True) return parse diff --git a/scrapy_poet/downloader.py b/scrapy_poet/downloader.py new file mode 100644 index 00000000..b6ae534f --- /dev/null +++ b/scrapy_poet/downloader.py @@ -0,0 +1,52 @@ +import logging + +import scrapy +from scrapy.utils.defer import maybe_deferred_to_future +from web_poet import HttpRequest +from web_poet.exceptions import ( + HttpError, + HttpRequestError, +) + +from scrapy_poet.utils import ( + http_request_to_scrapy_request, + scrapy_response_to_http_response, +) + +logger = logging.getLogger(__name__) + + +def create_scrapy_downloader(download_func): + async def scrapy_downloader(request: HttpRequest): + if not isinstance(request, HttpRequest): + raise TypeError( + f"The request should be 'web_poet.HttpRequest' but received " + f"one of type: '{type(request)}'." + ) + + scrapy_request = http_request_to_scrapy_request(request) + + if scrapy_request.method == "HEAD": + scrapy_request.meta["dont_redirect"] = True + + deferred = download_func(scrapy_request) + deferred_or_future = maybe_deferred_to_future(deferred) + try: + response = await deferred_or_future + except scrapy.exceptions.IgnoreRequest as e: + # A Scrapy downloader middleware has caused the request to be + # ignored. + message = f"Additional request ignored: {scrapy_request}" + raise HttpError(message) from e + except Exception as e: + # This could be caused either by network errors (Twisted + # exceptions, OpenSSL, exceptions, etc.) or by unhandled exceptions + # in Scrapy downloader middlewares. We assume the former (raise + # HttpRequestError instead of HttpError), it being the most likely, + # and the latter only happening due to badly written code. + message = f"Additional request failed: {scrapy_request}" + raise HttpRequestError(message) from e + + return scrapy_response_to_http_response(response) + + return scrapy_downloader diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index 9b9978c8..fde14af2 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -13,8 +13,12 @@ from scrapy.utils.misc import create_instance, load_object from .api import DummyResponse +from .page_input_providers import ( + HttpClientProvider, + HttpResponseProvider, + PageParamsProvider, +) from .overrides import OverridesRegistry -from .page_input_providers import HttpResponseProvider from .injection import Injector @@ -22,7 +26,9 @@ DEFAULT_PROVIDERS = { - HttpResponseProvider: 500 + HttpResponseProvider: 500, + HttpClientProvider: 600, + PageParamsProvider: 700, } InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 0a258d00..f47bec2a 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -10,7 +10,7 @@ """ import abc import json -from typing import Set, Union, Callable, ClassVar, List, Any, Sequence +from typing import Set, Union, Callable, ClassVar, Any, Sequence import attr from scrapy import Request @@ -18,8 +18,10 @@ from scrapy.crawler import Crawler from scrapy.utils.request import request_fingerprint +from scrapy_poet.utils import scrapy_response_to_http_response from scrapy_poet.injection_errors import MalformedProvidedClassesError -from web_poet import HttpResponse, HttpResponseHeaders +from scrapy_poet.downloader import create_scrapy_downloader +from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams class PageObjectInputProvider: @@ -197,3 +199,27 @@ def deserialize(self, data: Any) -> Sequence[Any]: ) for response_data in data ] + + +class HttpClientProvider(PageObjectInputProvider): + """This class provides ``web_poet.requests.HttpClient`` instances.""" + provided_classes = {HttpClient} + + def __call__(self, to_provide: Set[Callable], crawler: Crawler): + """Creates an ``web_poet.requests.HttpClient`` instance using Scrapy's + downloader. + """ + downloader = create_scrapy_downloader(crawler.engine.download) + return [HttpClient(request_downloader=downloader)] + + +class PageParamsProvider(PageObjectInputProvider): + """This class provides ``web_poet.page_inputs.PageParams`` instances.""" + provided_classes = {PageParams} + + def __call__(self, to_provide: Set[Callable], request: Request): + """Creates a ``web_poet.requests.PageParams`` instance based on the + data found from the ``meta["page_params"]`` field of a + ``scrapy.http.Response`` instance. + """ + return [PageParams(request.meta.get("page_params", {}))] diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 80a7d715..3b79a560 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -1,5 +1,8 @@ import os +import attr +from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders +from scrapy.http import Request, Response from scrapy.utils.project import project_data_dir, inside_project @@ -14,3 +17,30 @@ def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") - if createdir: os.makedirs(path, exist_ok=True) return path + + +def http_request_to_scrapy_request(request: HttpRequest, **kwargs) -> Request: + return Request( + url=str(request.url), + method=request.method, + headers=request.headers, + body=request.body, + **kwargs, + ) + + +def scrapy_response_to_http_response(response: Response) -> HttpResponse: + """Convenience method to convert a ``scrapy.http.Response`` into a + ``web_poet.HttpResponse``. + """ + kwargs = {} + encoding = getattr(response, "_encoding", None) + if encoding: + kwargs["encoding"] = encoding + return HttpResponse( + url=response.url, + body=response.body, + status=response.status, + headers=HttpResponseHeaders.from_bytes_dict(response.headers), + **kwargs, + ) diff --git a/setup.py b/setup.py index 39e5a000..930ebaf2 100755 --- a/setup.py +++ b/setup.py @@ -12,12 +12,12 @@ packages=find_packages(exclude=['tests', 'example']), install_requires=[ 'andi >= 0.4.1', - 'attrs', - 'parsel', - 'url-matcher', - 'web-poet @ git+https://git@github.com/scrapinghub/web-poet@master#egg=web-poet', - 'tldextract', - 'sqlitedict', + 'attrs >= 21.3.0', + 'parsel >= 1.5.0', + 'scrapy >= 2.6.0', + 'sqlitedict >= 1.5.0', + 'url-matcher >= 0.2.0', + 'web-poet >= 0.2.0', ], classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index 12c37d6e..a16990b5 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -1,5 +1,6 @@ import scrapy import pytest +from pytest_twisted import ensureDeferred from web_poet.pages import ItemPage, ItemWebPage from scrapy_poet import ( @@ -13,6 +14,11 @@ class FakeItemPage(ItemPage): def to_item(self): return 'fake item page' +class FakeItemPageAsync(ItemPage): + + async def to_item(self): + return 'fake item page' + class FakeItemWebPage(ItemWebPage): @@ -27,6 +33,12 @@ class MySpider(scrapy.Spider): parse_web = callback_for(FakeItemWebPage) +class MySpiderAsync(scrapy.Spider): + + name = 'my_spider_async' + parse_item = callback_for(FakeItemPageAsync) + + def test_callback_for(): """Simple test case to ensure it works as expected.""" cb = callback_for(FakeItemPage) @@ -38,6 +50,20 @@ def test_callback_for(): assert list(result) == ['fake item page'] +@ensureDeferred +async def test_callback_for_async(): + cb = callback_for(FakeItemPageAsync) + assert callable(cb) + + fake_page = FakeItemPageAsync() + response = DummyResponse('http://example.com/') + result = cb(response=response, page=fake_page) + + assert await result.__anext__() == 'fake item page' + with pytest.raises(StopAsyncIteration): + assert await result.__anext__() + + def test_callback_for_instance_method(): spider = MySpider() response = DummyResponse('http://example.com/') @@ -46,12 +72,16 @@ def test_callback_for_instance_method(): assert list(result) == ['fake item page'] -def test_callback_for_inline(): - callback = callback_for(FakeItemPage) +@ensureDeferred +async def test_callback_for_instance_method_async(): + spider = MySpiderAsync() response = DummyResponse('http://example.com/') - fake_page = FakeItemPage() - result = callback(response, page=fake_page) - assert list(result) == ['fake item page'] + fake_page = FakeItemPageAsync() + result = spider.parse_item(response, page=fake_page) + + assert await result.__anext__() == 'fake item page' + with pytest.raises(StopAsyncIteration): + assert await result.__anext__() def test_default_callback(): @@ -92,6 +122,18 @@ def test_inline_callback(): assert str(exc.value) == msg +def test_inline_callback_async(): + """Sample request with inline callback using async callback_for.""" + spider = MySpiderAsync() + cb = callback_for(FakeItemPageAsync) + request = scrapy.Request('http://example.com/', callback=cb) + with pytest.raises(ValueError) as exc: + request.to_dict(spider=spider) + + msg = f'Function {cb} is not an instance method in: {spider}' + assert str(exc.value) == msg + + def test_invalid_subclass(): """Classes should inherit from ItemPage.""" diff --git a/tests/test_downloader.py b/tests/test_downloader.py new file mode 100644 index 00000000..bd74a791 --- /dev/null +++ b/tests/test_downloader.py @@ -0,0 +1,447 @@ +import attr +from functools import partial +from unittest import mock + +import pytest +import scrapy +import twisted +import web_poet +from pytest_twisted import ensureDeferred, inlineCallbacks +from scrapy import Request, Spider +from scrapy.exceptions import IgnoreRequest +from tests.utils import AsyncMock +from twisted.internet import reactor +from twisted.internet.task import deferLater +from twisted.web.resource import Resource +from twisted.web.server import NOT_DONE_YET +from web_poet import HttpClient +from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError +from web_poet.pages import ItemWebPage + +from scrapy_poet.downloader import create_scrapy_downloader +from scrapy_poet.utils import http_request_to_scrapy_request +from tests.utils import ( + crawl_single_item, make_crawler, HtmlResource, MockServer +) + + +@pytest.fixture +def scrapy_downloader(): + mock_downloader = AsyncMock() + return create_scrapy_downloader(mock_downloader) + + +@ensureDeferred +async def test_incompatible_scrapy_request(scrapy_downloader): + """The Request must be web_poet.HttpRequest and not anything else.""" + + req = scrapy.Request("https://example.com") + + with pytest.raises(TypeError): + await scrapy_downloader(req) + + +@pytest.fixture +def fake_http_response(): + return web_poet.HttpResponse( + "https://example.com", + b"some content", + status=200, + headers={"Content-Type": "text/html; charset=utf-8"}, + ) + + +@ensureDeferred +async def test_scrapy_poet_downloader(fake_http_response): + req = web_poet.HttpRequest("https://example.com") + + with mock.patch( + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + + mock_dtf.return_value = fake_http_response + + mock_downloader = mock.MagicMock(return_value=AsyncMock) + scrapy_downloader = create_scrapy_downloader(mock_downloader) + + response = await scrapy_downloader(req) + + mock_downloader.assert_called_once() + assert isinstance(response, web_poet.HttpResponse) + + assert str(response.url) == "https://example.com" + assert response.text == "some content" + assert response.status == 200 + assert response.headers.get("Content-Type") == "text/html; charset=utf-8" + assert len(response.headers) == 1 + + +@ensureDeferred +async def test_scrapy_poet_downloader_ignored_request(): + """It should handle IgnoreRequest from Scrapy according to the web poet + standard on additional request error handling.""" + req = web_poet.HttpRequest("https://example.com") + + with mock.patch( + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + mock_dtf.side_effect = scrapy.exceptions.IgnoreRequest + mock_downloader = mock.MagicMock(return_value=AsyncMock) + scrapy_downloader = create_scrapy_downloader(mock_downloader) + + with pytest.raises(web_poet.exceptions.HttpError): + await scrapy_downloader(req) + + +@ensureDeferred +async def test_scrapy_poet_downloader_twisted_error(): + req = web_poet.HttpRequest("https://example.com") + + with mock.patch( + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + mock_dtf.side_effect = twisted.internet.error.TimeoutError + mock_downloader = mock.MagicMock(return_value=AsyncMock) + scrapy_downloader = create_scrapy_downloader(mock_downloader) + + with pytest.raises(web_poet.exceptions.HttpRequestError): + await scrapy_downloader(req) + + +@ensureDeferred +async def test_scrapy_poet_downloader_head_redirect(fake_http_response): + req = web_poet.HttpRequest("https://example.com", method="HEAD") + + with mock.patch( + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + mock_dtf.return_value = fake_http_response + mock_downloader = mock.MagicMock(return_value=AsyncMock) + scrapy_downloader = create_scrapy_downloader(mock_downloader) + + await scrapy_downloader(req) + + args, kwargs = mock_downloader.call_args + scrapy_request = args[0] + assert scrapy_request.meta.get("dont_redirect") is True + + +class LeafResource(Resource): + isLeaf = True + + def deferRequest(self, request, delay, f, *a, **kw): + def _cancelrequest(_): + # silence CancelledError + d.addErrback(lambda _: None) + d.cancel() + + d = deferLater(reactor, delay, f, *a, **kw) + request.notifyFinish().addErrback(_cancelrequest) + return d + + +class EchoResource(LeafResource): + def render_GET(self, request): + return request.content.read() + + +@inlineCallbacks +def test_additional_requests_success(): + items = [] + + with MockServer(EchoResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + response = await self.http_client.request( + server.root_url, + body=b'bar', + ) + return {'foo': response.body.decode()} + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'foo': 'bar'}] + + +class StatusResource(LeafResource): + def render_GET(self, request): + decoded_body = request.content.read().decode() + if decoded_body: + request.setResponseCode(int(decoded_body)) + return b"" + + +@inlineCallbacks +def test_additional_requests_bad_response(): + items = [] + + with MockServer(StatusResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + try: + await self.http_client.request( + server.root_url, + body=b'400', + ) + except HttpResponseError: + return {'foo': 'bar'} + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'foo': 'bar'}] + + +class DelayedResource(LeafResource): + def render_GET(self, request): + decoded_body = request.content.read().decode() + seconds = float(decoded_body) if decoded_body else 0 + self.deferRequest( + request, + seconds, + self._delayedRender, + request, + seconds, + ) + return NOT_DONE_YET + + def _delayedRender(self, request, seconds): + request.finish() + + +@inlineCallbacks +def test_additional_requests_connection_issue(): + items = [] + + with mock.patch('scrapy_poet.downloader.http_request_to_scrapy_request') \ + as mock_http_request_to_scrapy_request: + mock_http_request_to_scrapy_request.side_effect = partial( + http_request_to_scrapy_request, + meta={'download_timeout': 0.001}, + ) + + with MockServer(DelayedResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + try: + await self.http_client.request( + server.root_url, + body=b"0.002", + ) + except HttpRequestError: + return {'foo': 'bar'} + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'foo': 'bar'}] + + +@inlineCallbacks +def test_additional_requests_ignored_request(): + items = [] + + with MockServer(EchoResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + try: + await self.http_client.request( + server.root_url, + body=b'ignore', + ) + except HttpError as e: + return {'exc': e.__class__} + + class TestDownloaderMiddleware: + def process_response(self, request, response, spider): + if b'ignore' in response.body: + raise IgnoreRequest + return response + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + TestDownloaderMiddleware: 1, + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'exc': HttpError}] + + +@pytest.mark.xfail( + reason=( + "Currently, we do not make a distinction between exceptions raised " + "from the downloader or from a downloader middleware, except for " + "IgnoreRequest. In the future, we might want to inspect the stack to " + "determine the source of an exception and raise HttpError instead of " + "HttpRequestError when the exception originates in a downloader " + "middleware." + ), + strict=True, +) +@inlineCallbacks +def test_additional_requests_unhandled_downloader_middleware_exception(): + items = [] + + with MockServer(EchoResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + try: + await self.http_client.request( + server.root_url, + body=b'raise', + ) + except HttpError as e: + return {'exc': e.__class__} + + class TestDownloaderMiddleware: + def process_response(self, request, response, spider): + if b'raise' in response.body: + raise RuntimeError + return response + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + TestDownloaderMiddleware: 1, + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'exc': HttpError}] + + +@inlineCallbacks +def test_additional_requests_dont_filter(): + """Verify that while duplicate regular requests are filtered out, + additional requests are not (neither relative to the main requests not + relative to each other). + + In Scrapy, request de-duplication is implemented on the scheduler, and + because additional requests do not go through the scheduler, this works as + expected. + """ + items = [] + + with MockServer(EchoResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + response1 = await self.http_client.request( + server.root_url, + body=b'a', + ) + response2 = await self.http_client.request( + server.root_url, + body=b'a', + ) + return {response1.body.decode(): response2.body.decode()} + + class TestSpider(Spider): + name = 'test_spider' + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + def start_requests(self): + yield Request(server.root_url, body=b'a') + yield Request(server.root_url, body=b'a') + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'a': 'a'}] diff --git a/tests/test_injection.py b/tests/test_injection.py index 99393e52..6871b1b6 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -266,11 +266,11 @@ def callback(response: DummyResponse, class Html(Injectable): url = "http://example.com" - html = """Price: 22€""" + text = """Price: 22€""" @property def selector(self): - return parsel.Selector(self.html) + return parsel.Selector(self.text) class EurDollarRate(Injectable): diff --git a/tests/test_providers.py b/tests/test_providers.py index f05030ea..5907810a 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1,8 +1,10 @@ from typing import Any, List, Set, Callable, Sequence +from unittest import mock import attr import json -from pytest_twisted import inlineCallbacks +import pytest +from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy_poet import HttpResponseProvider from twisted.python.failure import Failure @@ -11,9 +13,14 @@ from scrapy.crawler import Crawler from scrapy.settings import Settings from scrapy.utils.test import get_crawler -from scrapy_poet.page_input_providers import CacheDataProviderMixin, PageObjectInputProvider -from tests.utils import crawl_single_item, HtmlResource -from web_poet import HttpResponse +from scrapy_poet.page_input_providers import ( + CacheDataProviderMixin, + HttpClientProvider, + PageObjectInputProvider, + PageParamsProvider, +) +from tests.utils import crawl_single_item, HtmlResource, AsyncMock +from web_poet import HttpResponse, HttpClient class ProductHtml(HtmlResource): @@ -29,6 +36,7 @@ class ProductHtml(HtmlResource): """ + class NonProductHtml(HtmlResource): html = """ @@ -61,7 +69,9 @@ def __init__(self, crawler: Crawler): assert isinstance(crawler, Crawler) super().__init__(crawler) - def __call__(self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider): + def __call__( + self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider + ): assert isinstance(spider, scrapy.Spider) ret: List[Any] = [] if Price in to_provide: @@ -152,8 +162,9 @@ class NameFirstMultiProviderSpider(PriceFirstMultiProviderSpider): def test_name_first_spider(settings, tmp_path): cache = tmp_path / "cache.sqlite3" settings.set("SCRAPY_POET_CACHE", str(cache)) - item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, ProductHtml, - settings) + item, _, _ = yield crawl_single_item( + NameFirstMultiProviderSpider, ProductHtml, settings + ) assert cache.exists() assert item == { Price: Price("22€"), @@ -164,8 +175,9 @@ def test_name_first_spider(settings, tmp_path): # Let's see that the cache is working. We use a different and wrong resource, # but it should be ignored by the cached version used - item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, NonProductHtml, - settings) + item, _, _ = yield crawl_single_item( + NameFirstMultiProviderSpider, NonProductHtml, settings + ) assert item == { Price: Price("22€"), Name: Name("Chocolate"), @@ -176,8 +188,9 @@ def test_name_first_spider(settings, tmp_path): @inlineCallbacks def test_price_first_spider(settings): - item, _, _ = yield crawl_single_item(PriceFirstMultiProviderSpider, ProductHtml, - settings) + item, _, _ = yield crawl_single_item( + PriceFirstMultiProviderSpider, ProductHtml, settings + ) assert item == { Price: Price("22€"), Name: Name("Chocolate"), @@ -194,3 +207,40 @@ def test_response_data_provider_fingerprint(settings): # The fingerprint should be readable since it's JSON-encoded. fp = rdp.fingerprint(scrapy.http.Response, request) assert json.loads(fp) + + +@ensureDeferred +async def test_http_client_provider(settings): + crawler = get_crawler(Spider, settings) + crawler.engine = AsyncMock() + + with mock.patch( + "scrapy_poet.page_input_providers.create_scrapy_downloader" + ) as mock_factory: + provider = HttpClientProvider(crawler) + results = provider(set(), crawler) + assert isinstance(results[0], HttpClient) + + results[0]._request_downloader == mock_factory.return_value + +def test_page_params_provider(settings): + crawler = get_crawler(Spider, settings) + provider = PageParamsProvider(crawler) + request = scrapy.http.Request("https://example.com") + + results = provider(set(), request) + + assert results[0] == {} + + expected_data = {"key": "value"} + request.meta.update({"page_params": expected_data}) + results = provider(set(), request) + + assert results[0] == expected_data + + # Check that keys that are invalid Python variable names work. + expected_data = {1: "a"} + request.meta.update({"page_params": expected_data}) + results = provider(set(), request) + + assert results[0] == expected_data diff --git a/tests/test_utils.py b/tests/test_utils.py index 05e55542..3083579b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,15 @@ from unittest import mock from pathlib import PosixPath -from scrapy_poet.utils import get_scrapy_data_path +import pytest +from scrapy.http import Request, Response, TextResponse +from web_poet import HttpRequest, HttpResponse + +from scrapy_poet.utils import ( + get_scrapy_data_path, + http_request_to_scrapy_request, + scrapy_response_to_http_response, +) @mock.patch("scrapy_poet.utils.os.makedirs") @@ -19,3 +27,144 @@ def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path): mock_makedirs.assert_called_once() mock_makedirs.assert_called_with(path, exist_ok=True) + + +@pytest.mark.parametrize( + "http_request,kwargs,scrapy_request", + ( + ( + HttpRequest("https://example.com"), + {}, + Request("https://example.com"), + ), + ( + HttpRequest("https://example.com"), + {"dont_filter": True}, + Request("https://example.com", dont_filter=True), + ), + ( + HttpRequest("https://example.com", method="POST"), + {}, + Request("https://example.com", method="POST"), + ), + ( + HttpRequest("https://example.com", headers={"a": "b"}), + {}, + Request("https://example.com", headers={"a": "b"}), + ), + ( + HttpRequest("https://example.com", headers={"a": "b"}), + {}, + Request("https://example.com", headers=(("a", "b"),)), + ), + ( + HttpRequest("https://example.com", headers=(("a", "b"),)), + {}, + Request("https://example.com", headers=(("a", "b"),)), + ), + ( + HttpRequest( + "https://example.com", + headers=(("a", "b"), ("a", "c")), + ), + {}, + Request( + "https://example.com", + headers=(("a", "b"), ("a", "c")), + ), + ), + ( + HttpRequest("https://example.com", body=b"a"), + {}, + Request("https://example.com", body=b"a"), + ), + ), +) +def test_http_request_to_scrapy_request(http_request, kwargs, scrapy_request): + result = http_request_to_scrapy_request(http_request, **kwargs) + assert result.url == scrapy_request.url + assert result.method == scrapy_request.method + assert result.headers == scrapy_request.headers + assert result.body == scrapy_request.body + assert result.meta == scrapy_request.meta + assert result.cb_kwargs == scrapy_request.cb_kwargs + assert result.cookies == scrapy_request.cookies + assert result.encoding == scrapy_request.encoding + assert result.priority == scrapy_request.priority + assert result.dont_filter == scrapy_request.dont_filter + assert result.callback == scrapy_request.callback + assert result.errback == scrapy_request.errback + assert result.flags == scrapy_request.flags + + +@pytest.mark.parametrize( + "scrapy_response,http_response", + ( + ( + Response("https://example.com"), + HttpResponse("https://example.com", body=b"", status=200), + ), + ( + Response("https://example.com", body=b"a"), + HttpResponse("https://example.com", body=b"a", status=200), + ), + ( + Response("https://example.com", status=429), + HttpResponse("https://example.com", body=b"", status=429), + ), + ( + Response("https://example.com", headers={"a": "b"}), + HttpResponse( + "https://example.com", + body=b"", + status=200, + headers={"a": "b"}, + ), + ), + ( + Response("https://example.com", headers={"a": "b"}), + HttpResponse( + "https://example.com", + body=b"", + status=200, + headers=(("a", "b"),), + ), + ), + ( + Response("https://example.com", headers=(("a", "b"),)), + HttpResponse( + "https://example.com", + body=b"", + status=200, + headers=(("a", "b"),), + ), + ), + pytest.param( + Response("https://example.com", headers=(("a", "b"), ("a", "c"))), + HttpResponse( + "https://example.com", + body=b"", + status=200, + headers=(("a", "b"), ("a", "c")), + ), + marks=pytest.mark.xfail( + reason="https://github.com/scrapy/scrapy/issues/5515", + ), + ), + ( + TextResponse("https://example.com", body="a", encoding="ascii"), + HttpResponse("https://example.com", body=b"a", status=200, encoding="ascii"), + ), + ( + TextResponse("https://example.com", body="a", encoding="utf-8"), + HttpResponse("https://example.com", body=b"a", status=200, encoding="utf-8"), + ), + ), +) +def test_scrapy_response_to_http_response(scrapy_response, http_response): + result = scrapy_response_to_http_response(scrapy_response) + assert str(result.url) == str(http_response.url) + assert result.body == http_response.body + assert result.status == http_response.status + assert result.headers == http_response.headers + assert result._encoding == http_response._encoding diff --git a/tests/utils.py b/tests/utils.py index 55b26f5b..05186b5b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ from typing import Dict +from unittest import mock from pytest_twisted import inlineCallbacks from scrapy.exceptions import CloseSpider @@ -84,3 +85,10 @@ def parse(*args, **kwargs): # Mimic type annotations parse.__annotations__ = callback.__annotations__ return parse + + +class AsyncMock(mock.MagicMock): + """workaround since python 3.7 doesn't ship with asyncmock.""" + + async def __call__(self, *args, **kwargs): + return super().__call__(*args, **kwargs) diff --git a/tox.ini b/tox.ini index 31b9954e..62cb3fd7 100644 --- a/tox.ini +++ b/tox.ini @@ -1,13 +1,11 @@ [tox] -envlist = py37,py38,py39,py310,mypy,docs +envlist = min,py37,py38,py39,py310,asyncio,asyncio-min,mypy,docs [testenv] deps = pytest pytest-cov - scrapy >= 2.1.0 pytest-twisted - web-poet @ git+https://git@github.com/scrapinghub/web-poet@master#egg=web-poet commands = py.test \ @@ -15,6 +13,29 @@ commands = --doctest-modules \ {posargs:scrapy_poet tests} +[testenv:asyncio] +commands = + {[testenv]commands} --reactor=asyncio + +[testenv:min] +basepython = python3.7 +deps = + {[testenv]deps} + andi==0.4.1 + attrs==21.3.0 + parsel==1.5.0 + scrapy==2.6.0 + sqlitedict==1.5.0 + url-matcher==0.2.0 + web-poet==0.2.0 + +[testenv:asyncio-min] +basepython = python3.7 +commands = + {[testenv:asyncio]commands} +deps = + {[testenv:min]deps} + [testenv:mypy] deps = mypy==0.790