diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 40e11ea3..53e88092 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -2,6 +2,11 @@
 Changelog
 =========
 
+TBR
+---
+
+* Use the new ``web_poet.HttpResponse`` which replaces ``web_poet.ResponseData``.
+
 0.3.0 (2022-01-28)
 ------------------
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 90d823b6..9e0907d7 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -34,7 +34,7 @@ using the following Page Object:
 
     class ISBNBookPage(ItemWebPage):
 
-        def __init__(self, response: ResponseData, book_page: BookPage):
+        def __init__(self, response: HttpResponse, book_page: BookPage):
             super().__init__(response)
             self.book_page = book_page
diff --git a/docs/providers.rst b/docs/providers.rst
index fd8bf0a7..5ef0d7fe 100644
--- a/docs/providers.rst
+++ b/docs/providers.rst
@@ -20,34 +20,34 @@ Creating providers
 ==================
 
 Providers are responsible for building dependencies needed by Injectable
-objects. A good example would be the ``ResponseDataProvider``,
-which builds and provides a ``ResponseData`` instance for Injectables
-that need it, like the ``ItemWebPage``.
+objects. A good example would be the ``HttpResponseProvider``,
+which builds and provides a ``web_poet.HttpResponse`` instance for Injectables
+that need it, like the ``web_poet.ItemWebPage``.
 
 .. code-block:: python
 
     import attr
     from typing import Set, Callable
+    import web_poet
 
     from scrapy_poet.page_input_providers import PageObjectInputProvider
     from scrapy import Response
 
-    @attr.define
-    class ResponseData:
-        """Represents a response containing its URL and HTML content."""
-        url: str
-        html: str
-
-
-    class ResponseDataProvider(PageObjectInputProvider):
-        """This class provides ``web_poet.page_inputs.ResponseData`` instances."""
-        provided_classes = {ResponseData}
+    class HttpResponseProvider(PageObjectInputProvider):
+        """This class provides ``web_poet.HttpResponse`` instances."""
+        provided_classes = {web_poet.HttpResponse}
 
         def __call__(self, to_provide: Set[Callable], response: Response):
-            """Build a ``ResponseData`` instance using a Scrapy ``Response``"""
-            return [ResponseData(url=response.url, html=response.text)]
-
+            """Build a ``web_poet.HttpResponse`` instance using a Scrapy ``Response``"""
+            return [
+                web_poet.HttpResponse(
+                    url=response.url,
+                    body=response.body,
+                    status=response.status,
+                    headers=web_poet.HttpResponseHeaders.from_bytes_dict(response.headers),
+                )
+            ]
 
 You can implement your own providers in order to extend or override current
 ``scrapy-poet`` behavior. All providers should inherit from this base class:
@@ -61,7 +61,7 @@ Cache Support in Providers
 ===========================
 
 ``scrapy-poet`` also supports caching of the provided dependencies from the
-providers. For example, :class:`~.ResponseDataProvider` supports this right off
+providers. For example, :class:`~.HttpResponseProvider` supports this right off
 the bat. It's able to do this by inheriting the :class:`~.CacheDataProviderMixin`
 and implementing all of its ``abstractmethods``.
@@ -70,18 +70,26 @@ would lead to the following code:
 
 .. code-block:: python
 
+    import web_poet
     from scrapy_poet.page_input_providers import (
         CacheDataProviderMixin,
         PageObjectInputProvider,
     )
 
-    class ResponseDataProvider(PageObjectInputProvider, CacheDataProviderMixin):
-        """This class provides ``web_poet.page_inputs.ResponseData`` instances."""
-        provided_classes = {ResponseData}
+    class HttpResponseProvider(PageObjectInputProvider, CacheDataProviderMixin):
+        """This class provides ``web_poet.HttpResponse`` instances."""
+        provided_classes = {web_poet.HttpResponse}
 
         def __call__(self, to_provide: Set[Callable], response: Response):
-            """Build a ``ResponseData`` instance using a Scrapy ``Response``"""
-            return [ResponseData(url=response.url, html=response.text)]
+            """Build a ``web_poet.HttpResponse`` instance using a Scrapy ``Response``"""
+            return [
+                web_poet.HttpResponse(
+                    url=response.url,
+                    body=response.body,
+                    status=response.status,
+                    headers=web_poet.HttpResponseHeaders.from_bytes_dict(response.headers),
+                )
+            ]
 
         def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
             """Returns a fingerprint to identify the specific request."""
@@ -136,7 +144,7 @@ configuration dictionaries for more information.
 
 .. note::
 
     The providers in :const:`scrapy_poet.DEFAULT_PROVIDERS`,
-    which includes a provider for :class:`~ResponseData`, are always
+    which includes a provider for :class:`~HttpResponse`, are always
     included by default. You can disable any of them by listing it
     in the configuration with the priority `None`.
@@ -264,8 +272,8 @@ Page Object uses it, the request is not ignored, for example:
 
     The code above is just for example purposes. If you need to use
     ``Response`` instances in your Page Objects, use built-in
     ``ItemWebPage`` - it has
-    ``response`` attribute with ``ResponseData``; no additional configuration
-    is needed, as there is ``ResponseDataProvider`` enabled in ``scrapy-poet``
+    ``response`` attribute with ``HttpResponse``; no additional configuration
+    is needed, as there is ``HttpResponseProvider`` enabled in ``scrapy-poet``
     by default.
 
 Requests concurrency
diff --git a/scrapy_poet/__init__.py b/scrapy_poet/__init__.py
index 1cf55d0d..973ef9d4 100644
--- a/scrapy_poet/__init__.py
+++ b/scrapy_poet/__init__.py
@@ -3,5 +3,5 @@ from .page_input_providers import (
     PageObjectInputProvider,
     CacheDataProviderMixin,
-    ResponseDataProvider,
+    HttpResponseProvider,
 )
diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index 96c628bd..afc631f1 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -13,7 +13,7 @@ from scrapy.utils.misc import create_instance, load_object
 
 from .api import DummyResponse
 from .overrides import PerDomainOverridesRegistry
-from .page_input_providers import ResponseDataProvider
+from .page_input_providers import HttpResponseProvider
 from .injection import Injector
@@ -21,7 +21,7 @@ DEFAULT_PROVIDERS = {
-    ResponseDataProvider: 500
+    HttpResponseProvider: 500
 }
 
 InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py
index 522ddf51..dd40f952 100644
--- a/scrapy_poet/page_input_providers.py
+++ b/scrapy_poet/page_input_providers.py
@@ -3,7 +3,7 @@ have created a repository of ``PageObjectInputProviders``.
 
 The current module implements a ``PageObjectInputProviders`` for
-:class:`web_poet.page_inputs.ResponseData`, which is in charge of providing the response
+:class:`web_poet.page_inputs.HttpResponse`, which is in charge of providing the response
 HTML from Scrapy. You could also implement different providers in order to
 acquire data from multiple external sources, for example, Splash or Auto Extract API.
@@ -20,7 +20,7 @@ from scrapy.utils.request import request_fingerprint
 
 from scrapy_poet.injection_errors import MalformedProvidedClassesError
-from web_poet import ResponseData
+from web_poet import HttpResponse, HttpResponseHeaders
 
 
 class PageObjectInputProvider:
@@ -154,15 +154,22 @@ def has_cache_support(self):
         return True
 
 
-class ResponseDataProvider(PageObjectInputProvider, CacheDataProviderMixin):
-    """This class provides ``web_poet.page_inputs.ResponseData`` instances."""
+class HttpResponseProvider(PageObjectInputProvider, CacheDataProviderMixin):
+    """This class provides ``web_poet.page_inputs.HttpResponse`` instances."""
 
-    provided_classes = {ResponseData}
+    provided_classes = {HttpResponse}
     name = "response_data"
 
     def __call__(self, to_provide: Set[Callable], response: Response):
-        """Builds a ``ResponseData`` instance using a Scrapy ``Response``"""
-        return [ResponseData(url=response.url, html=response.text)]
+        """Builds an ``HttpResponse`` instance using a Scrapy ``Response``"""
+        return [
+            HttpResponse(
+                url=response.url,
+                body=response.body,
+                status=response.status,
+                headers=HttpResponseHeaders.from_bytes_dict(response.headers),
+            )
+        ]
 
     def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
         request_keys = {"url", "method", "body"}
@@ -181,4 +188,13 @@ def serialize(self, result: Sequence[Any]) -> Any:
         return [attr.asdict(response_data) for response_data in result]
 
     def deserialize(self, data: Any) -> Sequence[Any]:
-        return [ResponseData(**response_data) for response_data in data]
+        return [
+            HttpResponse(
+                response_data["url"],
+                response_data["body"],
+                status=response_data["status"],
+                headers=HttpResponseHeaders.from_bytes_dict(response_data["headers"]),
+                encoding=response_data["_encoding"],
+            )
+            for response_data in data
+        ]
diff --git a/setup.py b/setup.py
index e71e75a7..6cc9740a 100755
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
         'andi >= 0.4.1',
         'attrs',
         'parsel',
-        'web-poet',
+        'web-poet @ git+https://git@github.com/scrapinghub/web-poet@master#egg=web-poet',
         'tldextract',
         'sqlitedict',
     ],
diff --git a/tests/conftest.py b/tests/conftest.py
index 6082152f..78538fc2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,7 @@
 import pytest
 from scrapy.settings import Settings
 
-from scrapy_poet.page_input_providers import ResponseDataProvider
+from scrapy_poet.page_input_providers import HttpResponseProvider
 
 
 @pytest.fixture()
diff --git a/tests/test_injection.py b/tests/test_injection.py
index 066b466c..81f0aabf 100644
--- a/tests/test_injection.py
+++ b/tests/test_injection.py
@@ -5,11 +5,12 @@ from pytest_twisted import inlineCallbacks
 import weakref
 
+import parsel
 from scrapy import Request
 from scrapy.http import Response
 
 from scrapy_poet.utils import get_domain
-from scrapy_poet import CacheDataProviderMixin, ResponseDataProvider, PageObjectInputProvider, \
+from scrapy_poet import CacheDataProviderMixin, HttpResponseProvider, PageObjectInputProvider, \
     DummyResponse
 from scrapy_poet.injection import check_all_providers_are_callable, is_class_provided_by_any_provider_fn, \
     get_injector_for_testing, get_response_for_testing
@@ -264,6 +265,10 @@ class Html(Injectable):
     url = "http://example.com"
     html = """Price: 22€"""
 
+    @property
+    def selector(self):
+        return parsel.Selector(self.html)
+
 
 class EurDollarRate(Injectable):
     rate = 1.1
@@ -332,17 +337,17 @@ def callback(response: DummyResponse, price_po: PricePO, rate_po: EurDollarRate)
 
 
 def test_load_provider_classes():
-    provider_as_string = f"{ResponseDataProvider.__module__}.{ResponseDataProvider.__name__}"
-    injector = get_injector_for_testing({provider_as_string: 2, ResponseDataProvider: 1})
-    assert all(type(prov) == ResponseDataProvider for prov in injector.providers)
+    provider_as_string = f"{HttpResponseProvider.__module__}.{HttpResponseProvider.__name__}"
+    injector = get_injector_for_testing({provider_as_string: 2, HttpResponseProvider: 1})
+    assert all(type(prov) == HttpResponseProvider for prov in injector.providers)
     assert len(injector.providers) == 2
 
 
 def test_check_all_providers_are_callable():
-    check_all_providers_are_callable([ResponseDataProvider(None)])
+    check_all_providers_are_callable([HttpResponseProvider(None)])
     with pytest.raises(NonCallableProviderError) as exinf:
         check_all_providers_are_callable([PageObjectInputProvider(None),
-                                          ResponseDataProvider(None)])
+                                          HttpResponseProvider(None)])
     assert "PageObjectInputProvider" in str(exinf.value)
     assert "not callable" in str(exinf.value)
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index 5daeb57b..f4020af0 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -22,7 +22,7 @@ from scrapy_poet.page_input_providers import (
     PageObjectInputProvider
 )
-from web_poet.page_inputs import ResponseData
+from web_poet.page_inputs import HttpResponse
 from scrapy_poet import DummyResponse
 from tests.utils import (HtmlResource, crawl_items,
@@ -125,10 +125,10 @@ class OptionalAndUnionPage(ItemWebPage):
     breadcrumbs: BreadcrumbsExtraction
     opt_check_1: Optional[BreadcrumbsExtraction]
     opt_check_2: Optional[str]  # str is not Injectable, so None expected here
-    union_check_1: Union[BreadcrumbsExtraction, ResponseData]  # Breadcrumbs is injected
-    union_check_2: Union[str, ResponseData]  # ResponseData is injected
-    union_check_3: Union[Optional[str], ResponseData]  # None is injected
-    union_check_4: Union[None, str, ResponseData]  # None is injected
+    union_check_1: Union[BreadcrumbsExtraction, HttpResponse]  # Breadcrumbs is injected
+    union_check_2: Union[str, HttpResponse]  # HttpResponse is injected
+    union_check_3: Union[Optional[str], HttpResponse]  # None is injected
+    union_check_4: Union[None, str, HttpResponse]  # None is injected
     union_check_5: Union[BreadcrumbsExtraction, None, str]  # Breadcrumbs is injected
 
     def to_item(self):
@@ -151,7 +151,7 @@ def test_optional_and_unions(settings):
 @attr.s(auto_attribs=True)
 class ProvidedWithDeferred:
     msg: str
-    response: ResponseData  # it should be None because this class is provided
+    response: HttpResponse  # it should be None because this class is provided
 
 
 @attr.s(auto_attribs=True)
@@ -198,7 +198,7 @@ def __call__(self, to_provide):
         # we're returning a class that's not listed in self.provided_classes
         return {
             ExtraClassData: ExtraClassData("this should be returned"),
-            ResponseData: ResponseData("example.com", "this shouldn't"),
+            HttpResponse: HttpResponse("example.com", b"this shouldn't"),
         }
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 02ec71c8..f05030ea 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -3,7 +3,7 @@
 import attr
 import json
 from pytest_twisted import inlineCallbacks
-from scrapy_poet import ResponseDataProvider
+from scrapy_poet import HttpResponseProvider
 from twisted.python.failure import Failure
 
 import scrapy
@@ -13,7 +13,7 @@ from scrapy.utils.test import get_crawler
 
 from scrapy_poet.page_input_providers import CacheDataProviderMixin, PageObjectInputProvider
 from tests.utils import crawl_single_item, HtmlResource
-from web_poet import ResponseData
+from web_poet import HttpResponse
 
 
 class ProductHtml(HtmlResource):
@@ -104,7 +104,7 @@ def deserialize(self, data: Any) -> Sequence[Any]:
         return data
 
 
-class ResponseDataProviderForTest(ResponseDataProvider):
+class HttpResponseProviderForTest(HttpResponseProvider):
     """Uses a fixed fingerprint because the test server is always changing
     the URL from test to test"""
 
     def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
@@ -116,7 +116,7 @@ class PriceFirstMultiProviderSpider(scrapy.Spider):
     url = None
     custom_settings = {
         "SCRAPY_POET_PROVIDERS": {
-            ResponseDataProviderForTest: 0,
+            HttpResponseProviderForTest: 0,
             PriceHtmlDataProvider: 1,
             NameHtmlDataProvider: 2,
         }
@@ -128,12 +128,12 @@ def start_requests(self):
 
     def errback(self, failure: Failure):
         yield {"exception": failure.value}
 
-    def parse(self, response, price: Price, name: Name, html: Html, response_data: ResponseData):
+    def parse(self, response, price: Price, name: Name, html: Html, response_data: HttpResponse):
         yield {
             Price: price,
             Name: name,
             Html: html,
-            "response_data_html": response_data.html,
+            "response_data_text": response_data.text,
         }
@@ -141,7 +141,7 @@ class NameFirstMultiProviderSpider(PriceFirstMultiProviderSpider):
 
     custom_settings = {
         "SCRAPY_POET_PROVIDERS": {
-            ResponseDataProviderForTest: 0,
+            HttpResponseProviderForTest: 0,
             NameHtmlDataProvider: 1,
             PriceHtmlDataProvider: 2,
         }
@@ -159,7 +159,7 @@ def test_name_first_spider(settings, tmp_path):
         Price: Price("22€"),
         Name: Name("Chocolate"),
         Html: Html("Name Html!"),
-        "response_data_html": ProductHtml.html,
+        "response_data_text": ProductHtml.html,
     }
 
     # Let's see that the cache is working. We use a different and wrong resource,
@@ -170,11 +170,10 @@
         Price: Price("22€"),
         Name: Name("Chocolate"),
         Html: Html("Name Html!"),
-        "response_data_html": ProductHtml.html,
+        "response_data_text": ProductHtml.html,
     }
 
-
 @inlineCallbacks
 def test_price_first_spider(settings):
     item, _, _ = yield crawl_single_item(PriceFirstMultiProviderSpider, ProductHtml,
@@ -183,13 +182,13 @@
         Price: Price("22€"),
         Name: Name("Chocolate"),
         Html: Html("Price Html!"),
-        "response_data_html": ProductHtml.html,
+        "response_data_text": ProductHtml.html,
     }
 
 
 def test_response_data_provider_fingerprint(settings):
     crawler = get_crawler(Spider, settings)
-    rdp = ResponseDataProvider(crawler)
+    rdp = HttpResponseProvider(crawler)
     request = scrapy.http.Request("https://example.com")
 
     # The fingerprint should be readable since it's JSON-encoded.
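
A note on the cache behavior exercised above: a provider's ``fingerprint`` is used as the cache key, which is why ``HttpResponseProviderForTest`` pins it to a fixed value: the test server's URL changes on every run, so a URL-based key would never produce a cache hit. A minimal sketch of such an override (the class name and the returned key are illustrative only, not part of this change):

.. code-block:: python

    from typing import Callable, Set

    from scrapy import Request

    from scrapy_poet import HttpResponseProvider


    class FixedFingerprintProvider(HttpResponseProvider):
        """Illustrative only: collapses all requests into one cache entry."""

        def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
            # Returning a constant means the first cached response
            # is replayed for every subsequent request.
            return "fixed"
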
diff --git a/tests/test_response_required_logic.py b/tests/test_response_required_logic.py
index aea45b84..846d8099 100644
--- a/tests/test_response_required_logic.py
+++ b/tests/test_response_required_logic.py
@@ -12,7 +12,7 @@ from scrapy_poet.page_input_providers import (
     PageObjectInputProvider,
-    ResponseDataProvider,
+    HttpResponseProvider,
 )
 from web_poet import ItemPage, WebPage
@@ -62,7 +62,7 @@ def __call__(self, to_provide):
         return [FakeProductResponse(data=data)]
 
 
-class TextProductProvider(ResponseDataProvider):
+class TextProductProvider(HttpResponseProvider):
 
     # This is wrong. You should not annotate provider dependencies with classes
     # like TextResponse or HtmlResponse, you should use Response instead.
@@ -70,7 +70,7 @@ def __call__(self, to_provide, response: TextResponse):
         return super().__call__(to_provide, response)
 
 
-class StringProductProvider(ResponseDataProvider):
+class StringProductProvider(HttpResponseProvider):
 
     def __call__(self, to_provide, response: str):
         return super().__call__(to_provide, response)
@@ -115,7 +115,7 @@ class MySpider(scrapy.Spider):
     name = 'foo'
     custom_settings = {
         "SCRAPY_POET_PROVIDERS": {
-            ResponseDataProvider: 1,
+            HttpResponseProvider: 1,
             DummyProductProvider: 2,
             FakeProductProvider: 3,
         }
@@ -177,7 +177,7 @@ def cb(response):
 
 def test_is_provider_using_response():
     assert is_provider_requiring_scrapy_response(PageObjectInputProvider) is False
-    assert is_provider_requiring_scrapy_response(ResponseDataProvider) is True
+    assert is_provider_requiring_scrapy_response(HttpResponseProvider) is True
     # TextProductProvider wrongly annotates response dependency as
     # TextResponse, instead of using the Response type.
     assert is_provider_requiring_scrapy_response(TextProductProvider) is False
diff --git a/tests/test_scrapy_dependencies.py b/tests/test_scrapy_dependencies.py
index 4f60f99d..3ccd34c9 100644
--- a/tests/test_scrapy_dependencies.py
+++ b/tests/test_scrapy_dependencies.py
@@ -8,7 +8,7 @@ from scrapy_poet.injection import SCRAPY_PROVIDED_CLASSES
 from scrapy_poet.page_input_providers import (
     PageObjectInputProvider,
-    ResponseDataProvider,
+    HttpResponseProvider,
 )
 from tests.utils import crawl_items, crawl_single_item, HtmlResource
@@ -19,7 +19,7 @@ class ProductHtml(HtmlResource):
     html = """
     Chocolate
@@ -61,7 +61,7 @@ class MySpider(Spider):
     url = None
     custom_settings = {
         "SCRAPY_POET_PROVIDERS": {
-            ResponseDataProvider: 1,
+            HttpResponseProvider: 1,
             PageDataProvider: 2,
         }
     }
diff --git a/tox.ini b/tox.ini
index 1eca5c06..31b9954e 100644
--- a/tox.ini
+++ b/tox.ini
@@ -7,7 +7,7 @@ deps =
     pytest-cov
    scrapy >= 2.1.0
    pytest-twisted
-    web-poet
+    web-poet @ git+https://git@github.com/scrapinghub/web-poet@master#egg=web-poet
 
 commands =
     py.test \
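
For code that consumed ``ResponseData``, the visible change is that ``web_poet.HttpResponse`` holds the raw ``body`` bytes plus ``status`` and ``headers``, and exposes the decoded document as ``.text`` in place of the old ``.html`` attribute. A hedged usage sketch, with made-up URL, body, and headers, constructed the same way ``HttpResponseProvider`` builds it above:

.. code-block:: python

    import web_poet

    # Example values only; mirrors what HttpResponseProvider returns.
    response = web_poet.HttpResponse(
        "http://example.com/product",  # url is positional, as in deserialize() above
        b"<html><body>Price: 22\xe2\x82\xac</body></html>",  # raw body bytes
        status=200,
        headers=web_poet.HttpResponseHeaders.from_bytes_dict(
            {b"Content-Type": [b"text/html; charset=utf-8"]}
        ),
    )

    assert response.status == 200
    print(response.text)  # decoded text, replacing the old ResponseData.html
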