From 5eca70e020a9cf0c6fdddceaf8c3f9fa9628eb47 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 15 Mar 2022 15:08:01 +0800 Subject: [PATCH] add docs for supporting web-poet's HttpClient and Meta --- README.rst | 24 +++ docs/index.rst | 3 +- docs/intro/advanced-tutorial.rst | 167 ++++++++++++++++++ .../{tutorial.rst => basic-tutorial.rst} | 8 +- docs/intro/install.rst | 2 +- docs/requirements.txt | 2 +- scrapy_poet/debug.log | 1 + scrapy_poet/page_input_providers.py | 4 +- setup.py | 2 +- tox.ini | 3 +- 10 files changed, 204 insertions(+), 12 deletions(-) create mode 100644 docs/intro/advanced-tutorial.rst rename docs/intro/{tutorial.rst => basic-tutorial.rst} (99%) create mode 100644 scrapy_poet/debug.log diff --git a/README.rst b/README.rst index 739f51ab..13b7e1ab 100644 --- a/README.rst +++ b/README.rst @@ -36,3 +36,27 @@ License is BSD 3-clause. * Issue tracker: https://github.com/scrapinghub/scrapy-poet/issues .. _`web-poet`: https://github.com/scrapinghub/web-poet + + +Quick Start +*********** + +Installation +============ + +.. code-block:: + + pip install scrapy-poet + +Requires **Python 3.7+** and **Scrapy >= 2.6.0**. + +Usage in a Scrapy Project +========================= + +Add the following inside Scrapy's ``settings.py`` file: + +.. code-block:: python + + DOWNLOADER_MIDDLEWARES = { + "scrapy_poet.InjectionMiddleware": 543, + } diff --git a/docs/index.rst b/docs/index.rst index 30c17ff4..1271bbe5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,7 +35,8 @@ To get started, see :ref:`intro-install` and :ref:`intro-tutorial`. :maxdepth: 1 intro/install - intro/tutorial + intro/basic-tutorial + intro/advanced-tutorial .. toctree:: :caption: Advanced diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst new file mode 100644 index 00000000..a87d4bbc --- /dev/null +++ b/docs/intro/advanced-tutorial.rst @@ -0,0 +1,167 @@ +.. 
_`intro-advanced-tutorial`: + +================= +Advanced Tutorial +================= + +This section goes over the **web-poet** features that are supported by +**scrapy-poet**: + + * ``web_poet.HttpClient`` + * ``web_poet.Meta`` + +These are mainly achieved by **scrapy-poet** implementing **providers** for them: + + * :class:`scrapy_poet.page_input_providers.HttpClientProvider` + * :class:`scrapy_poet.page_input_providers.MetaProvider` + + +Additional Requests +=================== + +Using Page Objects that make additional requests doesn't need anything special from +the spider. It would work as-is because of the readily available +:class:`scrapy_poet.page_input_providers.HttpClientProvider` that is enabled +out of the box. + +This supplies the Page Object with the necessary ``web_poet.HttpClient`` instance. +Take note that the HTTP Downloader implementation that **scrapy-poet** provides to +``web_poet.HttpClient`` would be the **Scrapy Downloader**. + +.. tip:: + + This means that the additional requests inside a Page Object will have access + to the **Downloader Middlewares** that the Spider is using. + + +Suppose we have the following Page Object: + +.. code-block:: python + + import attr + import web_poet + + + @attr.define + class ProductPage(web_poet.ItemWebPage): + http_client: web_poet.HttpClient + + async def to_item(self): + item = { + "url": self.url, + "name": self.css("#main h3.name ::text").get(), + "product_id": self.css("#product ::attr(product-id)").get(), + } + + # Simulates clicking on a button that says "View All Images" + response: web_poet.ResponseData = await self.http_client.get( + f"https://api.example.com/v2/images?id={item['product_id']}" + ) + page = web_poet.WebPage(response) + item["images"] = page.css(".product-images img::attr(src)").getall() + return item + + +It can be directly used inside the spider as: + +.. 
code-block:: python + + import scrapy + + + class ProductSpider(scrapy.Spider): + + custom_settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, + } + } + + start_urls = [ + "https://example.com/category/product/item?id=123", + "https://example.com/category/product/item?id=989", + ] + + async def parse(self, response, page: ProductPage): + return await page.to_item() + +Note that we needed to update the ``parse()`` method to be an ``async`` method, +since the ``to_item()`` method of the Page Object we're using is an ``async`` +method as well. + +This is also the primary reason why **scrapy-poet** requires ``scrapy>=2.6.0`` +since it's the minimum version that has full :mod:`asyncio` support. + + +Meta +==== + +Using ``web_poet.Meta`` allows the Scrapy spider to pass arbitrary information +into the Page Object. + +Suppose we update the earlier Page Object to control the additional request. +This basically acts as a switch to update the behavior of the Page Object: + +.. code-block:: python + + import attr + import web_poet + + + @attr.define + class ProductPage(web_poet.ItemWebPage): + http_client: web_poet.HttpClient + meta: web_poet.Meta + + async def to_item(self): + item = { + "url": self.url, + "name": self.css("#main h3.name ::text").get(), + "product_id": self.css("#product ::attr(product-id)").get(), + } + + # Simulates clicking on a button that says "View All Images" + if self.meta.get("enable_extracting_all_images"): + response: web_poet.ResponseData = await self.http_client.get( + f"https://api.example.com/v2/images?id={item['product_id']}" + ) + page = web_poet.WebPage(response) + item["images"] = page.css(".product-images img::attr(src)").getall() + + return item + +Passing the ``enable_extracting_all_images`` meta value from the spider into +the Page Object can be achieved by using **Scrapy's** ``Request.meta`` attribute. 
+Specifically, the ``dict`` stored under the ``po_args`` key of +**Scrapy's** ``Request.meta`` will be passed into ``web_poet.Meta``. + +Let's see it in action: + +.. code-block:: python + + import scrapy + + + class ProductSpider(scrapy.Spider): + + custom_settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, + } + } + + start_urls = [ + "https://example.com/category/product/item?id=123", + "https://example.com/category/product/item?id=989", + ] + + def start_requests(self): + for url in self.start_urls: + yield scrapy.Request( + url=url, + callback=self.parse, + meta={"po_args": {"enable_extracting_all_images": True}} + ) + + async def parse(self, response, page: ProductPage): + return await page.to_item() diff --git a/docs/intro/tutorial.rst b/docs/intro/basic-tutorial.rst similarity index 99% rename from docs/intro/tutorial.rst rename to docs/intro/basic-tutorial.rst index b11f76b5..9ee1fb08 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -1,8 +1,8 @@ -.. _`intro-tutorial`: +.. _`intro-basic-tutorial`: -======== -Tutorial -======== +============== +Basic Tutorial +============== In this tutorial, we’ll assume that ``scrapy-poet`` is already installed on your system. If that’s not the case, see :ref:`intro-install`. diff --git a/docs/intro/install.rst b/docs/intro/install.rst index 8dc87299..9d3018e9 100644 --- a/docs/intro/install.rst +++ b/docs/intro/install.rst @@ -16,7 +16,7 @@ If you’re already familiar with installation of Python packages, you can insta pip install scrapy-poet -Scrapy 2.1.0 or above is required and it has to be installed separately. +Scrapy 2.6.0 or above is required and it has to be installed separately. 
Things that are good to know ============================ diff --git a/docs/requirements.txt b/docs/requirements.txt index e6337937..99443b22 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ -Scrapy >= 2.1.0 +Scrapy >= 2.6.0 Sphinx >= 3.0.3 sphinx-rtd-theme >= 0.4 diff --git a/scrapy_poet/debug.log b/scrapy_poet/debug.log new file mode 100644 index 00000000..e5cd4dd5 --- /dev/null +++ b/scrapy_poet/debug.log @@ -0,0 +1 @@ +/home/k/.pyenv/versions/3.7.9/bin/python3: can't open file 'multiple_spider_in_one_process.py': [Errno 2] No such file or directory diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 18baf3e3..c2ce2854 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -198,7 +198,7 @@ class HttpClientProvider(PageObjectInputProvider): provided_classes = {HttpClient} def __call__(self, to_provide: Set[Callable], crawler: Crawler): - """Creates an ``web_poet.requests.HttpClient``` instance using Scrapy's + """Creates a ``web_poet.requests.HttpClient`` instance using Scrapy's downloader. """ backend = create_scrapy_backend(crawler.engine.download) @@ -211,6 +211,6 @@ class MetaProvider(PageObjectInputProvider): def __call__(self, to_provide: Set[Callable], request: Request): """Creates a ``web_poet.requests.Meta`` instance based on the data found - from the ``meta["po_args"]`` field of a ``scrapy.http.Response``instance. + from the ``meta["po_args"]`` field of a ``scrapy.http.Request`` instance. 
""" return [Meta(**request.meta.get("po_args", {}))] diff --git a/setup.py b/setup.py index e71e75a7..3a1918cb 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ 'andi >= 0.4.1', 'attrs', 'parsel', - 'web-poet', + 'web-poet @ git+https://git@github.com/scrapinghub/web-poet@meta#egg=web-poet', 'tldextract', 'sqlitedict', ], diff --git a/tox.ini b/tox.ini index 00c3eb35..fa4614d8 100644 --- a/tox.ini +++ b/tox.ini @@ -9,10 +9,9 @@ deps = pytest pytest-cov pytest-asyncio - scrapy >= 2.1.0 + scrapy >= 2.6.0 pytest-twisted web-poet @ git+https://git@github.com/scrapinghub/web-poet@meta#egg=web-poet - scrapy @ git+https://github.com/scrapy/scrapy.git@30d5779#egg=scrapy commands = py.test \