From 17d9f95e412ad93bc7cb7e5d307d3e0e94b0e675 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 15:27:42 +0300 Subject: [PATCH 01/22] remove old file --- zyte_spider_templates/pages/__init__.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 zyte_spider_templates/pages/__init__.py diff --git a/zyte_spider_templates/pages/__init__.py b/zyte_spider_templates/pages/__init__.py deleted file mode 100644 index 72b9c1c..0000000 --- a/zyte_spider_templates/pages/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .product_navigation_heuristics import HeuristicsProductNavigationPage From c3fa8437cd0fa3a8fb6e264a914141ab801c460d Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 15:57:56 +0300 Subject: [PATCH 02/22] merge from article repo --- docs/conf.py | 1 + docs/index.rst | 10 +- docs/reference/api.rst | 80 ++ docs/reference/index.rst | 61 - docs/reference/reqmeta.rst | 112 ++ docs/reference/settings.rst | 194 +++ docs/setup.rst | 45 +- docs/templates/article.rst | 35 + docs/templates/e-commerce.rst | 11 +- docs/templates/index.rst | 3 + setup.py | 7 +- .../incremental/test_collection_fp_manager.py | 207 ++++ tests/incremental/test_incremental_manager.py | 366 ++++++ tests/incremental/test_middleware.py | 96 ++ .../test_article_navigation_heuristics.py | 214 ++++ .../test_product_navigation_heuristics.py | 6 +- tests/test_addon.py | 156 +++ tests/test_article.py | 606 +++++++++ tests/test_feeds.py | 113 ++ tests/test_heuristics.py | 233 +++- tests/test_middlewares.py | 1081 ++++++++++++++++- tests/test_search.py | 5 +- tests/test_utils.py | 25 +- tox.ini | 12 +- zyte_spider_templates/__init__.py | 13 + zyte_spider_templates/_addon.py | 170 +++ .../_incremental/__init__.py | 0 zyte_spider_templates/_incremental/manager.py | 276 +++++ .../_incremental/middleware.py | 70 ++ zyte_spider_templates/feeds.py | 56 + zyte_spider_templates/heuristics.py | 165 +++ zyte_spider_templates/middlewares.py | 540 +++++++- zyte_spider_templates/pages/__init__.py | 2 + .../pages/article_heuristics.py | 189 +++ zyte_spider_templates/params.py | 12 + zyte_spider_templates/spiders/article.py | 415 +++++++ zyte_spider_templates/utils.py | 90 +- 37 files changed, 5556 insertions(+), 121 deletions(-) create mode 100644 docs/reference/api.rst create mode 100644 docs/reference/reqmeta.rst create mode 100644 docs/reference/settings.rst create mode 100644 docs/templates/article.rst create mode 100644 tests/incremental/test_collection_fp_manager.py create mode 100644 tests/incremental/test_incremental_manager.py create mode 100644 tests/incremental/test_middleware.py create mode 100644 tests/pages/test_article_navigation_heuristics.py create mode 100644 tests/test_addon.py create mode 100644 tests/test_article.py create mode 100644 tests/test_feeds.py create mode 100644 zyte_spider_templates/_addon.py create mode 100644 zyte_spider_templates/_incremental/__init__.py create mode 100644 zyte_spider_templates/_incremental/manager.py create mode 100644 zyte_spider_templates/_incremental/middleware.py create mode 100644 zyte_spider_templates/feeds.py create mode 100644 zyte_spider_templates/pages/__init__.py create mode 100644 zyte_spider_templates/pages/article_heuristics.py create mode 100644 zyte_spider_templates/spiders/article.py diff --git a/docs/conf.py b/docs/conf.py index 0d0a14e..f56d633 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,6 +70,7 @@ autodoc_pydantic_model_show_validator_members = False autodoc_pydantic_model_show_validator_summary = False 
autodoc_pydantic_field_list_validators = False +autodoc_pydantic_field_show_constraints = False # sphinx-reredirects redirects = { diff --git a/docs/index.rst b/docs/index.rst index dd568ea..56d8c3b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,7 @@ zyte-spider-templates documentation templates/index E-commerce + Article Google search .. toctree:: @@ -34,9 +35,16 @@ zyte-spider-templates documentation customization/spiders customization/pages +.. toctree:: + :caption: Reference + :hidden: + + reference/settings + reference/reqmeta + reference/api + .. toctree:: :caption: All the rest :hidden: - reference/index changes diff --git a/docs/reference/api.rst b/docs/reference/api.rst new file mode 100644 index 0000000..c20416a --- /dev/null +++ b/docs/reference/api.rst @@ -0,0 +1,80 @@ +=== +API +=== + +Spiders +======= + +.. autoclass:: zyte_spider_templates.BaseSpider + +.. autoclass:: zyte_spider_templates.EcommerceSpider + +.. autoclass:: zyte_spider_templates.GoogleSearchSpider + + +Pages +===== + +.. autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage + + +.. _parameter-mixins: + +Parameter mixins +================ + +.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsInputParam + :exclude-members: model_computed_fields + +.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsMethodParam + :exclude-members: model_computed_fields + +.. autoenum:: zyte_spider_templates.params.CustomAttrsMethod + +.. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam + :exclude-members: model_computed_fields + +.. autoenum:: zyte_spider_templates.params.ExtractFrom + +.. autopydantic_model:: zyte_spider_templates.params.GeolocationParam + :exclude-members: model_computed_fields + +.. autoenum:: zyte_spider_templates.params.Geolocation + +.. autopydantic_model:: zyte_spider_templates.params.MaxRequestsParam + :exclude-members: model_computed_fields + +.. autopydantic_model:: zyte_spider_templates.params.UrlParam + :exclude-members: model_computed_fields + +.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategyParam + :exclude-members: model_computed_fields + +.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam + :exclude-members: model_computed_fields + +.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType + +.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam + :exclude-members: model_computed_fields + +.. autopydantic_model:: zyte_spider_templates.spiders.article.ArticleCrawlStrategyParam + :exclude-members: model_computed_fields + +.. autoenum:: zyte_spider_templates.spiders.article.ArticleCrawlStrategy + + +.. _middlewares: + +Middlewares +=========== + +.. autoclass:: zyte_spider_templates.CrawlingLogsMiddleware +.. autoclass:: zyte_spider_templates.TrackNavigationDepthSpiderMiddleware +.. autoclass:: zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware +.. autoclass:: zyte_spider_templates.OffsiteRequestsPerSeedMiddleware +.. autoclass:: zyte_spider_templates.OnlyFeedsMiddleware +.. autoclass:: zyte_spider_templates.TrackSeedsSpiderMiddleware +.. autoclass:: zyte_spider_templates.IncrementalCrawlMiddleware diff --git a/docs/reference/index.rst b/docs/reference/index.rst index a7862f1..e69de29 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -1,61 +0,0 @@ -========= -Reference -========= - -Spiders -======= - -.. 
autoclass:: zyte_spider_templates.BaseSpider - -.. autoclass:: zyte_spider_templates.EcommerceSpider - -.. autoclass:: zyte_spider_templates.GoogleSearchSpider - - -Pages -===== - -.. autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage - - -.. _parameter-mixins: - -Parameter mixins -================ - -.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsInputParam - :exclude-members: model_computed_fields - -.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsMethodParam - :exclude-members: model_computed_fields - -.. autoenum:: zyte_spider_templates.params.CustomAttrsMethod - -.. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam - :exclude-members: model_computed_fields - -.. autoenum:: zyte_spider_templates.params.ExtractFrom - -.. autopydantic_model:: zyte_spider_templates.params.GeolocationParam - :exclude-members: model_computed_fields - -.. autoenum:: zyte_spider_templates.params.Geolocation - -.. autopydantic_model:: zyte_spider_templates.params.MaxRequestsParam - :exclude-members: model_computed_fields - -.. autopydantic_model:: zyte_spider_templates.params.UrlParam - :exclude-members: model_computed_fields - -.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategyParam - :exclude-members: model_computed_fields - -.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy - -.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam - :exclude-members: model_computed_fields - -.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType - -.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam - :exclude-members: model_computed_fields diff --git a/docs/reference/reqmeta.rst b/docs/reference/reqmeta.rst new file mode 100644 index 0000000..53ae847 --- /dev/null +++ b/docs/reference/reqmeta.rst @@ -0,0 +1,112 @@ +.. _meta: + +================= +Request.meta keys +================= + +Keys that can be defined in :attr:`Request.meta ` for +zyte-spider-templates. + +.. reqmeta:: seed + +seed +==== + +Default: ``The seed URL (or value) from which the request originated.`` + +The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware` and +:class:`~zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware`. + +The `seed` meta key is used to track and identify the origin of a request. It +is initially set for each request that originates from the start request and +can be used to manage domain constraints for subsequent requests. This key can +also be set to an arbitrary value by the user to identify the seed source. + +Here's an example: + +.. code-block:: python + + meta = { + "seed": "http://example.com", + } + +.. reqmeta:: is_seed_request + +is_seed_request +=============== + +Default: ``False`` + +The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware`. + +The `is_seed_request` meta key is a boolean flag that identifies whether the +request is a start request (i.e., originating from the initial seed URL). When +set to True, the middleware extracts seed domains from the response. + +Example: + :: + + meta = { + 'is_seed_request': True, + } + +.. reqmeta:: seed_domains + +seed_domains +============ + +Default: ``Initial URL and redirected URLs`` + +The key is used for :class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware`. + +The `seed_domains` meta key is a list of domains that the middleware uses to +check whether a request belongs to these domains or not. 
By default, this list +includes the initial URL's domain and domains of any redirected URLs `(if there +was a redirection)`. This list can also be set by the user in the spider to +specify additional domains for which the middleware should allow requests. + +Here's an example: + +.. code-block:: python + + meta = {"seed_domains": ["example.com", "another-example.com"]} + +.. reqmeta:: increase_navigation_depth + +increase_navigation_depth +========================= + +Default: ``True`` + +The key is used for :class:`~zyte_spider_templates.TrackNavigationDepthSpiderMiddleware`. + +The `increase_navigation_depth` meta key is a boolean flag that determines whether the +navigation_depth for a request should be increased. By default, the middleware increases +navigation_depth for all requests. Specific spiders can override this behavior for certain +types of requests, such as pagination or RSS feeds, by explicitly setting the meta key. + +Example: + :: + + meta = { + 'increase_navigation_depth': False, + } + +.. reqmeta:: only_feeds + +only_feeds +========== +Default: ``False`` + +The key is used for :class:`~zyte_spider_templates.OnlyFeedsMiddleware`. + +The `only_feeds` meta key is a boolean flag that identifies whether the +spider should discover all links on the website or extract links from RSS/Atom feeds only. + +Example: + :: + + meta = { + 'page_params': {'only_feeds': True} + } + +diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst new file mode 100644 index 0000000..9fc83f2 --- /dev/null +++ b/docs/reference/settings.rst @@ -0,0 +1,194 @@ +.. _settings: + +======== +Settings +======== + +.. setting:: NAVIGATION_DEPTH_LIMIT + +NAVIGATION_DEPTH_LIMIT +====================== + +Default: ``0`` + +The maximum navigation depth to crawl. If ``0``, no limit is imposed. + +We increase *navigation_depth* for requests navigating to a subcategory originating from +its parent category, including a request targeting a category starting at the website home page. +We don't increase *navigation_depth* for requests accessing item details (e.g., an article) or for +additional pages of a visited webpage. For example, if you set ``NAVIGATION_DEPTH_LIMIT`` to ``1``, +only item details and pagination links from your start URLs are followed. + +.. note:: + Currently, only the :ref:`Article spider template <article>
` implements proper + navigation_depth support. Other spider templates treat all follow-up requests as + increasing navigation_depth. + +Setting a navigation_depth limit can prevent a spider from delving too deeply into +subcategories. This is especially useful if you only need data from the +top-level categories or specific subcategories. + +When :ref:`customizing a spider template `, set the +:reqmeta:`increase_navigation_depth` request metadata key to override whether a request is +considered as increasing navigation depth (``True``) or not (``False``): + +.. code-block:: python + + Request("https://example.com", meta={"increase_navigation_depth": False}) + +If you want to limit all link following, including pagination and item details, +consider using the :setting:`DEPTH_LIMIT ` setting instead. + +Implemented by :class:`~zyte_spider_templates.TrackNavigationDepthSpiderMiddleware`. + +.. setting:: MAX_REQUESTS_PER_SEED + +MAX_REQUESTS_PER_SEED +===================== + +.. tip:: When using the :ref:`article spider template
<article>`, you may use + the + :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.max_requests_per_seed` + command-line parameter instead of this setting. + +Default: ``0`` + +Limit the number of follow-up requests per initial URL to the specified amount. +Non-positive integers (i.e. 0 and below) impose no limit and disable this middleware. + +The limit is the total limit for all direct and indirect follow-up requests +of each initial URL. + +Implemented by +:class:`~zyte_spider_templates.MaxRequestsPerSeedDownloaderMiddleware`. + +.. setting:: OFFSITE_REQUESTS_PER_SEED_ENABLED + +OFFSITE_REQUESTS_PER_SEED_ENABLED +================================= + +Default: ``True`` + +Setting this value to ``True`` enables the +:class:`~zyte_spider_templates.OffsiteRequestsPerSeedMiddleware` while ``False`` +completely disables it. + +The middleware ensures that *most* requests belong to the domain of the +seed URLs. It allows offsite requests only if they were obtained +from a response that belongs to the domain of the seed URLs. Any other requests +obtained thereafter from a response in a domain outside of the seed URLs are +not allowed. + +This prevents the spider from completely crawling other domains while ensuring +that aggregator websites *(e.g. a news website with articles from other domains)* +are supported, as the spider can access pages from other domains. + +Disabling the middleware means offsite requests are no longer filtered, +which may lead to other domains being crawled completely, unless +``allowed_domains`` is set in the spider. + +.. note:: + + If a seed URL gets redirected to a different domain, both the domain from + the original request and the domain from the redirected response will be + used as references. + + If the seed URL is `https://books.toscrape.com`, all subsequent requests to + `books.toscrape.com` and its subdomains are allowed, but requests to + `toscrape.com` are not. Conversely, if the seed URL is `https://toscrape.com`, + requests to both `toscrape.com` and `books.toscrape.com` are allowed. + +.. setting:: ONLY_FEEDS_ENABLED + +ONLY_FEEDS_ENABLED +================== + +.. note:: + + Only works for the :ref:`article spider template <article>
`. + +Default: ``False`` + +Whether to extract links from Atom and RSS news feeds only (``True``) or +to also use extracted links from ``ArticleNavigation.subCategories`` (``False``). + +Implemented by :class:`~zyte_spider_templates.OnlyFeedsMiddleware`. + +.. setting:: INCREMENTAL_CRAWL_BATCH_SIZE + +INCREMENTAL_CRAWL_BATCH_SIZE +============================ + +Default: ``50`` + +The maximum number of seen URLs to read from or write to the corresponding +:ref:`Zyte Scrapy Cloud collection ` per request during an incremental +crawl (see :setting:`INCREMENTAL_CRAWL_ENABLED`). + +This setting determines the batch size for interactions with the Collection. +If the response from a webpage contains more than 50 URLs, they will be split +into smaller batches for processing. Conversely, if fewer than 50 URLs are present, +all URLs will be handled in a single request to the Collection. + +Adjusting this value can optimize the performance of a crawl by balancing the number +of requests sent to the Collection with processing efficiency. + +.. note:: + + Setting it too large (e.g. > 100) will cause issues due to the large query length. + Setting it too small (less than 10) will remove the benefit of using a batch. + +Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. + + +.. setting:: INCREMENTAL_CRAWL_COLLECTION_NAME + +INCREMENTAL_CRAWL_COLLECTION_NAME +================================= + +.. note:: + + :ref:`virtual spiders ` are spiders based on :ref:`spider templates `. + The explanation of INCREMENTAL_CRAWL_COLLECTION_NAME below applies to both types of spiders. + +.. tip:: When using the :ref:`article spider template <article>
`, you may use + the + :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.incremental_collection_name` + command-line parameter instead of this setting. + + +Default: `<spider_name>_incremental`. +The current spider's name here will be the virtual spider's name, if it's a virtual spider; +otherwise, :data:`Spider.name `. + +Name of the :ref:`Zyte Scrapy Cloud collection ` used during +an incremental crawl (see :setting:`INCREMENTAL_CRAWL_ENABLED`). + +By default, a collection named after the spider is used, meaning that matching URLs from +previous runs of the same spider are skipped, provided those previous runs had +the :setting:`INCREMENTAL_CRAWL_ENABLED` setting set to ``True`` or the spider +argument `incremental` set to `true`. + +Using a different collection name makes sense, for example, in the following cases: +- Different spiders share a collection. +- The same spider uses different collections (e.g., for development runs vs. production runs). + +Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. + + +.. setting:: INCREMENTAL_CRAWL_ENABLED + +INCREMENTAL_CRAWL_ENABLED +========================= + +.. tip:: When using the :ref:`article spider template <article>
`, you may use + the + :attr:`~zyte_spider_templates.spiders.article.ArticleSpiderParams.incremental` + command-line parameter instead of this setting. + +Default: ``False`` + +If set to ``True``, items seen in previous crawls with the same +:setting:`INCREMENTAL_CRAWL_COLLECTION_NAME` value are skipped. + +Implemented by :class:`~zyte_spider_templates.IncrementalCrawlMiddleware`. diff --git a/docs/setup.rst b/docs/setup.rst index 6c35da4..7ff1488 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -40,47 +40,32 @@ Configuration In your Scrapy project settings (usually in ``settings.py``): -- Update :setting:`SPIDER_MODULES ` to include - ``"zyte_spider_templates.spiders"``. - -- `Configure scrapy-poet`_, and update :ref:`SCRAPY_POET_DISCOVER - ` to include - ``"zyte_spider_templates.pages"``. +#. `Configure scrapy-poet`_. .. _Configure scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project -For Zyte API features, including AI-powered parsing, `configure -scrapy-zyte-api`_ with `scrapy-poet integration`_. - -.. _configure scrapy-zyte-api: https://github.com/scrapy-plugins/scrapy-zyte-api#quick-start -.. _scrapy-poet integration: https://github.com/scrapy-plugins/scrapy-zyte-api#scrapy-poet-integration - -The following additional settings are recommended: - -- Set :setting:`CLOSESPIDER_TIMEOUT_NO_ITEM - ` to 600, to force the spider to stop - if no item has been found for 10 minutes. +#. For Zyte API features, including AI-powered parsing, :ref:`configure + scrapy-zyte-api `. -- Set :setting:`SCHEDULER_DISK_QUEUE ` to - ``"scrapy.squeues.PickleFifoDiskQueue"`` and - :setting:`SCHEDULER_MEMORY_QUEUE ` to - ``"scrapy.squeues.FifoMemoryQueue"``, for better request priority handling. +#. Configure :class:`zyte_common_items.ZyteItemAdapter`: -- Update :setting:`SPIDER_MIDDLEWARES ` to include - ``"zyte_spider_templates.middlewares.CrawlingLogsMiddleware": 1000``, to - log crawl data in JSON format for debugging purposes. - -- Ensure that :class:`zyte_common_items.ZyteItemAdapter` is also configured:: + .. code-block:: python + :caption: ``settings.py`` from itemadapter import ItemAdapter from zyte_common_items import ZyteItemAdapter ItemAdapter.ADAPTER_CLASSES.appendleft(ZyteItemAdapter) -- Update :setting:`SPIDER_MIDDLEWARES ` to include - ``"zyte_spider_templates.middlewares.AllowOffsiteMiddleware": 500`` and - ``"scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None``. This allows for - crawling item links outside of the domain. +#. Add the zyte-spider-templates add-on to your :setting:`ADDONS + ` setting: + + .. code-block:: python + :caption: ``settings.py`` + + ADDONS = { + "zyte_spider_templates.Addon": 1000, + } For an example of a properly configured ``settings.py`` file, see `the one in zyte-spider-templates-project`_. diff --git a/docs/templates/article.rst b/docs/templates/article.rst new file mode 100644 index 0000000..a20b195 --- /dev/null +++ b/docs/templates/article.rst @@ -0,0 +1,35 @@ +.. _article: + +===================================== +Article spider template (``article``) +===================================== + +Basic use +========= + +.. code-block:: shell + + scrapy crawl article -a url="https://www.zyte.com/blog/" + +Parameters +========== + +.. 
autopydantic_model:: zyte_spider_templates.spiders.article.ArticleSpiderParams + :inherited-members: BaseModel + :exclude-members: model_computed_fields, single_input + +Settings +======== + +The following :ref:`zyte-spider-templates settings ` may be useful +for the article spider template: + +:setting:`NAVIGATION_DEPTH_LIMIT` + Limit the crawling depth of subcategories. + +:setting:`OFFSITE_REQUESTS_PER_SEED_ENABLED` + Skip follow-up requests if their URL points to a domain different from the + domain of their initial URL. + +:setting:`ONLY_FEEDS_ENABLED` + Extract links only from Atom and RSS news feeds. diff --git a/docs/templates/e-commerce.rst b/docs/templates/e-commerce.rst index 5fd5dd6..c3ebf96 100644 --- a/docs/templates/e-commerce.rst +++ b/docs/templates/e-commerce.rst @@ -16,4 +16,13 @@ Parameters .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams :inherited-members: BaseModel - :exclude-members: model_computed_fields \ No newline at end of file + :exclude-members: model_computed_fields, single_input + +Settings +======== + +The following :ref:`zyte-spider-templates settings ` may be useful +for the e-commerce spider template: + +:setting:`MAX_REQUESTS_PER_SEED` + Limit the number of follow-up requests per initial URL. diff --git a/docs/templates/index.rst b/docs/templates/index.rst index ea86c6d..0f2ce76 100644 --- a/docs/templates/index.rst +++ b/docs/templates/index.rst @@ -32,3 +32,6 @@ Spider template list :ref:`Google Search ` Get Google search results. + +:ref:`Article
` + Get articles from websites. \ No newline at end of file diff --git a/setup.py b/setup.py index 869b1f5..95696e6 100644 --- a/setup.py +++ b/setup.py @@ -12,17 +12,22 @@ packages=find_packages(), include_package_data=True, install_requires=[ + "duplicate-url-discarder>=0.2.0", + "duplicate-url-discarder-rules>=2024.11.05", "extruct>=0.18.0", + "feedparser>=6.0.11", "form2request>=0.2.0", "formasaurus>=0.10.0", "jmespath>=0.9.5", "pydantic>=2.1", - "requests>=1.0.0", + "requests>=2.31.0", + "scrapinghub >= 2.4.0", "scrapy>=2.11.0", "scrapy-poet>=0.24.0", "scrapy-spider-metadata>=0.2.0", "scrapy-zyte-api[provider]>=0.23.0", "web-poet>=0.17.1", + "xtractmime>=0.2.1", "zyte-common-items>=0.26.2", ], classifiers=[ diff --git a/tests/incremental/test_collection_fp_manager.py b/tests/incremental/test_collection_fp_manager.py new file mode 100644 index 0000000..902631a --- /dev/null +++ b/tests/incremental/test_collection_fp_manager.py @@ -0,0 +1,207 @@ +from unittest.mock import MagicMock, patch + +import pytest +from scrapy.statscollectors import StatsCollector +from scrapy.utils.request import RequestFingerprinter + +from tests import get_crawler +from zyte_spider_templates._incremental.manager import CollectionsFingerprintsManager +from zyte_spider_templates.spiders.article import ArticleSpider + + +@pytest.fixture +def mock_crawler(): + return MagicMock() + + +def crawler_for_incremental(): + url = "https://example.com" + crawler = get_crawler() + crawler.request_fingerprinter = RequestFingerprinter() + crawler.stats = StatsCollector(crawler) + crawler.spider = ArticleSpider.from_crawler(crawler, url=url) + crawler.settings["ZYTE_PROJECT_ID"] = "000000" + return crawler + + +@pytest.mark.parametrize("batch_size", [50, 2]) +@pytest.mark.parametrize( + "fingerprints, keys_in_collection, fingerprints_batch, expected_result", + [ + ([], [], {"fp1", "fp2", "fp3"}, set()), + (["fp1", "fp2", "fp3"], [], set(), set()), + (["fp1", "fp2", "fp3"], ["fp1"], set(), {"fp1"}), + (["fp1", "fp2", "fp3"], ["fp1", "fp2"], set(), {"fp1", "fp2"}), + (["fp1", "fp2", "fp3"], ["fp1", "fp2", "fp3"], set(), {"fp1", "fp2", "fp3"}), + ( + ["fp1", "fp2", "fp3"], + ["fp1", "fp2"], + {("fp3", "url3")}, + {"fp1", "fp2", "fp3"}, + ), + (["fp1", "fp2", "fp3"], [], {("fp3", "url3")}, {"fp3"}), + ], +) +@patch("scrapinghub.ScrapinghubClient") +@pytest.mark.asyncio +async def test_get_existing_fingerprints( + mock_scrapinghub_client, + batch_size, + fingerprints, + keys_in_collection, + fingerprints_batch, + expected_result, +): + mock_client = MagicMock() + mock_scrapinghub_client.return_value = mock_client + + mock_collection = MagicMock() + mock_collection.count.return_value = 0 + mock_client.get_project.return_value.collections.get_store.return_value = ( + mock_collection + ) + + mock_crawler = MagicMock() + mock_crawler.settings.getint.return_value = batch_size + + mock_manager = CollectionsFingerprintsManager(mock_crawler) + mock_manager.get_keys_from_collection = MagicMock(return_value=keys_in_collection) # type: ignore + mock_manager.batch = fingerprints_batch + + assert ( + await mock_manager.get_existing_fingerprints_async(fingerprints) + == expected_result + ) + + +@pytest.mark.parametrize( + "fingerprints, expected_keys", + [ + ({"fp1", "fp2", "fp3"}, {"fp1", "fp2", "fp3"}), + ({}, set()), + ], +) +@patch("scrapinghub.ScrapinghubClient") +def test_get_keys_from_collection(mock_crawler, fingerprints, expected_keys): + mock_collection = MagicMock() + mock_collection.list.return_value = [ + {"_key": key, "value": {}} 
for key in expected_keys + ] + mock_crawler.settings.getint.return_value = 50 + manager = CollectionsFingerprintsManager(mock_crawler) + manager.collection = mock_collection # type: ignore + assert manager.get_keys_from_collection(fingerprints) == expected_keys + + +@pytest.mark.parametrize( + "keys, expected_items_written", + [ + ( + [("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], + [("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], + ), + ([], []), + ], +) +@patch("scrapinghub.ScrapinghubClient") +def test_save_to_collection(mock_crawler, keys, expected_items_written): + mock_writer = MagicMock() + mock_writer.write.return_value = expected_items_written + mock_crawler.settings.getint.return_value = 50 + manager = CollectionsFingerprintsManager(mock_crawler) + manager.writer = mock_writer # type: ignore + manager.save_to_collection(keys) + mock_writer.write.assert_called_once_with( + [{"_key": key, "value": value} for key, value in keys] + ) + + +@pytest.mark.parametrize( + "fingerprints, expected_batch, batch_size", + [ + ( + [(f"fp{i}", f"url{i}") for i in range(1, 5)], + {("fp4", "url4")}, + 3, + ), # No default min + ([], set(), 20), + ([("fp1", "url1")] * 19, {("fp1", "url1")}, 20), + ( + [(f"fp{i}", f"url{i}") for i in range(1, 103)], + {(f"fp{i}", f"url{i}") for i in range(1, 103)}, + 150, + ), # No default max + ( + [(f"fp{i}", f"url{i}") for i in range(1, 53)], + [("fp51", "url51"), ("fp52", "url52")], + 0, + ), # 50 by default + ], +) +@patch("scrapinghub.ScrapinghubClient") +def test_save_fingerprints( + mock_scrapinghub_client, fingerprints, expected_batch, batch_size +): + crawler = crawler_for_incremental() + if batch_size != 0: + crawler.settings.set("INCREMENTAL_CRAWL_BATCH_SIZE", batch_size) + fp_manager = CollectionsFingerprintsManager(crawler) + fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch) # type: ignore + fp_manager.add_to_batch(fingerprints) + assert fp_manager.batch == set(sorted(expected_batch, key=lambda x: int(x[0][2:]))) + + if len(fingerprints) >= fp_manager.batch_size: + fp_manager.save_batch.assert_called_once() + else: + fp_manager.save_batch.assert_not_called() + + +@pytest.mark.parametrize( + "fingerprints_batch, expected_batch_size", + [ + ([], 0), + ([("fp1", "url1"), ("fp2", "url2"), ("fp3", "url3")], 0), + ], +) +@patch("scrapinghub.ScrapinghubClient") +def test_save_batch(mock_crawler, fingerprints_batch, expected_batch_size): + crawler = crawler_for_incremental() + fp_manager = CollectionsFingerprintsManager(crawler) + fp_manager.batch = set(fingerprints_batch) + fp_manager.save_batch() + assert len(fp_manager.batch) == expected_batch_size + + +@pytest.mark.parametrize( + "project_id, collection_name, expected_collection", + [ + ("project1", "collection1", MagicMock()), + ("project2", "collection2", MagicMock()), + ], +) +@patch("scrapinghub.ScrapinghubClient") +def test_init_collection( + mock_scrapinghub_client, + mock_crawler, + project_id, + collection_name, + expected_collection, +): + mock_scrapinghub_instance = MagicMock() + mock_get_project = MagicMock() + mock_get_project.collections.get_store.return_value = expected_collection + mock_scrapinghub_instance.get_project.return_value = mock_get_project + mock_scrapinghub_client.return_value = mock_scrapinghub_instance + mock_crawler.settings.getint.return_value = 50 + manager = CollectionsFingerprintsManager(mock_crawler) + manager.init_collection(project_id, collection_name) + assert manager.collection == expected_collection + + 
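# A minimal usage sketch of the manager exercised by the tests above, based only
# on the API visible in this file (crawler_for_incremental(),
# CollectionsFingerprintsManager, add_to_batch(), save_batch()) and the
# INCREMENTAL_CRAWL_BATCH_SIZE setting documented in this patch. The batch size,
# function name, and URLs are illustrative assumptions, not part of the patch.
@patch("scrapinghub.ScrapinghubClient")
def example_fingerprint_batching(mock_scrapinghub_client):
    crawler = crawler_for_incremental()
    crawler.settings.set("INCREMENTAL_CRAWL_BATCH_SIZE", 2)
    fp_manager = CollectionsFingerprintsManager(crawler)
    # (fingerprint, URL) pairs are buffered in fp_manager.batch; once the buffer
    # reaches the configured batch size, save_batch() writes it to the Zyte
    # Scrapy Cloud collection used for incremental crawling (see
    # test_save_fingerprints above for the exact flushing behaviour).
    fp_manager.add_to_batch(
        [("fp1", "https://example.com/a"), ("fp2", "https://example.com/b")]
    )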
+@patch("scrapinghub.ScrapinghubClient") +def test_spider_closed(mock_scrapinghub_client): + crawler = crawler_for_incremental() + fp_manager = CollectionsFingerprintsManager(crawler) + fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch) # type: ignore + fp_manager.spider_closed() + fp_manager.save_batch.assert_called_once() diff --git a/tests/incremental/test_incremental_manager.py b/tests/incremental/test_incremental_manager.py new file mode 100644 index 0000000..2498464 --- /dev/null +++ b/tests/incremental/test_incremental_manager.py @@ -0,0 +1,366 @@ +from unittest.mock import patch + +import pytest +from scrapy.statscollectors import StatsCollector +from scrapy.utils.request import RequestFingerprinter +from zyte_common_items import Article + +from tests import get_crawler +from zyte_spider_templates import ArticleSpider +from zyte_spider_templates._incremental.manager import ( + CollectionsFingerprintsManager, + IncrementalCrawlingManager, + Request, +) + + +def crawler_for_incremental(): + url = "https://example.com" + crawler = get_crawler() + crawler.settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7" + crawler.request_fingerprinter = RequestFingerprinter(crawler) + crawler.stats = StatsCollector(crawler) + crawler.spider = ArticleSpider.from_crawler(crawler, url=url) + crawler.settings["ZYTE_PROJECT_ID"] = "000000" + return crawler + + +@patch("scrapinghub.ScrapinghubClient") +@pytest.mark.parametrize( + "input_request, input_result, expected_result, duplicated_fingerprints_result, expected_stats", + [ + ( + Request(url="https://example.com/article.html"), + [], + [], + set(), + {"incremental_crawling/filtered_items_and_requests": 0}, + ), # no results + ( + Request(url="https://example.com/article.html"), + [Article(url="https://example.com/article.html")], + [Article(url="https://example.com/article.html")], + set(), + { + "incremental_crawling/filtered_items_and_requests": 0, + "incremental_crawling/fingerprint_url_to_batch": 1, + "incremental_crawling/add_to_batch": 1, + }, + ), # Only one Item in the result without redirected URL + ( + Request(url="https://example.com/article.html"), + [Article(url="https://example.com/article1.html")], + [Article(url="https://example.com/article1.html")], + set(), + { + "incremental_crawling/redirected_urls": 1, + "incremental_crawling/filtered_items_and_requests": 0, + "incremental_crawling/fingerprint_url_to_batch": 2, + "incremental_crawling/add_to_batch": 1, + }, + ), # Only one Item with redirected URL in the result + ( + Request(url="https://example.com/article.html"), + [ + Article( + url="https://example.com/article1.html", + canonicalUrl="https://example.com/article1.html", + ) + ], + [Article(url="https://example.com/article1.html")], + set(), + { + "incremental_crawling/redirected_urls": 1, + "incremental_crawling/filtered_items_and_requests": 0, + "incremental_crawling/fingerprint_url_to_batch": 2, + "incremental_crawling/add_to_batch": 1, + }, + ), # Only one Item with redirected URL and the same canonicalURL in the result + ( + Request(url="https://example.com/article.html"), + [ + Article( + url="https://example.com/article1.html", + canonicalUrl="https://example.com/article2.html", + ) + ], + [Article(url="https://example.com/article1.html")], + set(), + { + "incremental_crawling/redirected_urls": 1, + "incremental_crawling/filtered_items_and_requests": 0, + "incremental_crawling/fingerprint_url_to_batch": 3, + "incremental_crawling/add_to_batch": 1, + }, + ), # Only one Item with redirected URL and 
the different canonicalURL in the result + ( + Request(url="https://example.com/article.html"), + [ + Article( + url="https://example.com/article.html", + canonicalUrl="https://example.com/article1.html", + ) + ], + [Article(url="https://example.com/article.html")], + set(), + { + "incremental_crawling/filtered_items_and_requests": 0, + "incremental_crawling/fingerprint_url_to_batch": 2, + "incremental_crawling/add_to_batch": 1, + }, + ), # Only one Item with only canonicalURL different in the result + ( + Request(url="https://example.com/article.html"), + [ + Article( + url="https://example.com/article.html", + canonicalUrl="https://example.com/article.html", + ) + ], + [Article(url="https://example.com/article.html")], + set(), + { + "incremental_crawling/filtered_items_and_requests": 0, + "incremental_crawling/fingerprint_url_to_batch": 1, + "incremental_crawling/add_to_batch": 1, + }, + ), # Only one Item with all equal URLS in the result + ( + Request(url="https://example.com/list.html"), + [Request(url="https://example.com/article1.html")], + [Request(url="https://example.com/article1.html")], + set(), + { + "incremental_crawling/filtered_items_and_requests": 0, + "incremental_crawling/requests_to_check": 1, + }, + ), # Only one Request in the result, no Items, no existing fingerprints in the cache + ( + Request(url="https://example.com/list.html"), + [ + Request(url="https://example.com/article1.html"), + Request(url="https://example.com/article2.html"), + Request(url="https://example.com/article3.html"), + ], + [ + Request(url="https://example.com/article2.html"), + Request(url="https://example.com/article3.html"), + ], + { + ( + "c300aee49364a341855bb1b08fa010497d4220016642", + "https://example.com/article1.html", + ) + }, + { + "incremental_crawling/filtered_items_and_requests": 1, + "incremental_crawling/requests_to_check": 3, + }, + ), # Three Requests in the result, no Items, one existing fingerprint in the cache + ( + Request(url="https://example.com/list.html"), + [ + Request(url="https://example.com/article1.html"), + Request(url="https://example.com/article2.html"), + Request(url="https://example.com/article3.html"), + ], + [ + Request(url="https://example.com/article2.html"), + ], + { + ( + "c300aee49364a341855bb1b08fa010497d4220016642", + "https://example.com/article1.html", + ), + ( + "c3001e046b02318004f724350aa3c9d3c6693a0ee4a9", + "https://example.com/article3.html", + ), + }, + { + "incremental_crawling/filtered_items_and_requests": 2, + "incremental_crawling/requests_to_check": 3, + }, + ), # Three Requests in the result, no Items, two existing fingerprints in the cache + ( + Request(url="https://example.com/list.html"), + [ + Request(url="https://example.com/article1.html"), + Request(url="https://example.com/article2.html"), + Request(url="https://example.com/article3.html"), + ], + [], + { + ( + "c300aee49364a341855bb1b08fa010497d4220016642", + "https://example.com/article1.html", + ), + ( + "c3001e046b02318004f724350aa3c9d3c6693a0ee4a9", + "https://example.com/article3.html", + ), + ( + "c30004342f6c6f4a8f25c2d615f524af1fe266894be8", + "https://example.com/article2.html", + ), + }, + { + "incremental_crawling/filtered_items_and_requests": 3, + "incremental_crawling/requests_to_check": 3, + }, + ), # Three Requests in the result, no Items, three existing fingerprints in the cache + ( + Request(url="https://example.com/article.html"), + [ + Request(url="https://example.com/article1.html"), + Request(url="https://example.com/article2.html"), + 
Request(url="https://example.com/article3.html"), + Article(url="https://example.com/article.html"), + ], + [ + Request(url="https://example.com/article2.html"), + Request(url="https://example.com/article3.html"), + Article(url="https://example.com/article.html"), + ], + { + ( + "c300aee49364a341855bb1b08fa010497d4220016642", + "https://example.com/article1.html", + ) + }, + { + "incremental_crawling/filtered_items_and_requests": 1, + "incremental_crawling/requests_to_check": 3, + "incremental_crawling/fingerprint_url_to_batch": 1, + "incremental_crawling/add_to_batch": 1, + }, + ), # Three Requests and one Item without redirected URL in the result, one existing fingerprint in the cache + ( + Request(url="https://example.com/list.html"), + [ + Request(url="https://example.com/article1.html"), + Request(url="https://example.com/article2.html"), + Request(url="https://example.com/article3.html"), + Article(url="https://example.com/article.html"), + ], + [ + Request(url="https://example.com/article2.html"), + Request(url="https://example.com/article3.html"), + Article(url="https://example.com/article.html"), + ], + { + ( + "c300aee49364a341855bb1b08fa010497d4220016642", + "https://example.com/article1.html", + ) + }, + { + "incremental_crawling/redirected_urls": 1, + "incremental_crawling/filtered_items_and_requests": 1, + "incremental_crawling/requests_to_check": 3, + "incremental_crawling/fingerprint_url_to_batch": 2, + "incremental_crawling/add_to_batch": 1, + }, + ), # Three Requests and one Item with redirected URL in the result, one existing fingerprint in the cache + ], +) +@pytest.mark.asyncio +async def test_process_incremental( + mock_scrapinghub_client, + input_request, + input_result, + expected_result, + duplicated_fingerprints_result, + expected_stats, +): + crawler = crawler_for_incremental() + fp_manager = CollectionsFingerprintsManager(crawler) + manager = IncrementalCrawlingManager(crawler, fp_manager) + fp_manager.batch = duplicated_fingerprints_result + + processed_result = await manager.process_incremental_async( + input_request, input_result.copy() + ) + assert len(expected_result) == len(processed_result) + for expected, processed in zip(expected_result, processed_result): + assert expected.url == processed.url # type: ignore + + assert crawler.stats.get_stats() == expected_stats + + +@patch("scrapinghub.ScrapinghubClient") +@pytest.mark.asyncio +async def test_process_incremental_several_items( + mock_scrapinghub_client, +): + crawler = crawler_for_incremental() + + fp_manager = CollectionsFingerprintsManager(crawler) + manager = IncrementalCrawlingManager(crawler, fp_manager) + + input_request = Request(url="https://example.com/article.html") + input_result = [ + Request(url="https://example.com/article1.html"), + Article(url="https://example.com/article.html"), + Article(url="https://example.com/article.html"), + ] + with pytest.raises(NotImplementedError): + await manager.process_incremental_async(input_request, input_result.copy()) + + +@patch("scrapinghub.ScrapinghubClient") +@pytest.mark.parametrize( + "request_url, item, expected", + [ + ( + "https://example.com/article.html", + Article(url="https://example.com/article.html"), + {"https://example.com/article.html": "request_url"}, + ), + ( + "https://example.com/article.html", + Article(url="https://example.com/article1.html"), + { + "https://example.com/article.html": "request_url", + "https://example.com/article1.html": "url", + }, + ), + ( + "https://example.com/article.html", + Article( + 
url="https://example.com/article.html", + canonicalUrl="https://example.com/article.html", + ), + {"https://example.com/article.html": "request_url"}, + ), + ( + "https://example.com/article.html", + Article( + url="https://example.com/article1.html", + canonicalUrl="https://example.com/article1.html", + ), + { + "https://example.com/article.html": "request_url", + "https://example.com/article1.html": "url", + }, + ), + ( + "https://example.com/article.html", + Article( + url="https://example.com/article1.html", + canonicalUrl="https://example.com/article2.html", + ), + { + "https://example.com/article.html": "request_url", + "https://example.com/article1.html": "url", + "https://example.com/article2.html": "canonicalUrl", + }, + ), + ], +) +def test_get_unique_urls(mock_scrapinghub_client, request_url, item, expected): + crawler = crawler_for_incremental() + + fp_manager = CollectionsFingerprintsManager(crawler) + manager = IncrementalCrawlingManager(crawler, fp_manager) + assert manager._get_unique_urls(request_url, item) == expected diff --git a/tests/incremental/test_middleware.py b/tests/incremental/test_middleware.py new file mode 100644 index 0000000..6944f4b --- /dev/null +++ b/tests/incremental/test_middleware.py @@ -0,0 +1,96 @@ +from unittest.mock import patch + +import pytest +from scrapy.exceptions import CloseSpider, NotConfigured +from scrapy.http import Request, Response +from scrapy.settings import Settings +from scrapy.statscollectors import StatsCollector +from scrapy.utils.request import RequestFingerprinter + +from tests import get_crawler +from zyte_spider_templates import IncrementalCrawlMiddleware +from zyte_spider_templates._incremental.manager import IncrementalCrawlingManager +from zyte_spider_templates.spiders.article import ArticleSpider + + +def crawler_for_incremental(): + url = "https://example.com" + crawler = get_crawler() + crawler.request_fingerprinter = RequestFingerprinter() + crawler.stats = StatsCollector(crawler) + crawler.spider = ArticleSpider.from_crawler(crawler, url=url) + crawler.settings["ZYTE_PROJECT_ID"] = "000000" + return crawler + + +def test_middleware_init_not_configured(): + crawler = crawler_for_incremental() + crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": False}) + + with pytest.raises(NotConfigured) as exc_info: + IncrementalCrawlMiddleware(crawler) + assert str(exc_info.value) == ( + "IncrementalCrawlMiddleware is not enabled. Set the " + "INCREMENTAL_CRAWL_ENABLED setting to True to enable it." 
+ ) + + +@patch("scrapinghub.ScrapinghubClient") +def test_middleware_init_configured(mock_scrapinghub_client): + crawler = crawler_for_incremental() + crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) + + middleware = IncrementalCrawlMiddleware(crawler) + assert isinstance(middleware.inc_manager, IncrementalCrawlingManager) + + +@patch("scrapinghub.ScrapinghubClient") +def test_prepare_manager_with_collection_fp_success(mock_scrapinghub_client): + crawler = crawler_for_incremental() + crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) + + manager = IncrementalCrawlMiddleware.prepare_incremental_manager(crawler) + assert isinstance(manager, IncrementalCrawlingManager) + + +def test_prepare_manager_with_collection_fp_failure(caplog): + crawler = crawler_for_incremental() + crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) + + caplog.clear() + with pytest.raises(CloseSpider) as exc_info: + IncrementalCrawlMiddleware.prepare_incremental_manager(crawler) + assert exc_info.value.reason == "incremental_crawling_middleware_collection_issue" + assert caplog.messages[-1].startswith( + "IncrementalCrawlMiddleware is enabled, but something went wrong with Collections." + ) + + +@patch("scrapinghub.ScrapinghubClient") +@pytest.mark.asyncio +async def test_middleware_process_spider_output(mock_scrapinghub_client): + crawler = crawler_for_incremental() + crawler.spider.settings = Settings({"INCREMENTAL_CRAWL_ENABLED": True}) + + middleware = IncrementalCrawlMiddleware(crawler) + request = Request(url=crawler.spider.url) + response = Response(url=crawler.spider.url, request=request) + input_result = [ + Request(url="https://example.com/1"), + Request(url="https://example.com/2"), + Request(url="https://example.com/3"), + ] + + async def async_generator(): + for item in input_result: + yield item + + processed_result_list = [] + + async for processed_item in middleware.process_spider_output( + response, async_generator(), crawler.spider + ): + processed_result_list.append(processed_item) + + for res_ex, res_proc in zip(input_result, processed_result_list): + assert res_ex == res_proc diff --git a/tests/pages/test_article_navigation_heuristics.py b/tests/pages/test_article_navigation_heuristics.py new file mode 100644 index 0000000..9bf8c3c --- /dev/null +++ b/tests/pages/test_article_navigation_heuristics.py @@ -0,0 +1,214 @@ +from unittest.mock import patch + +import pytest +from web_poet import ( + AnyResponse, + HttpResponse, + HttpResponseHeaders, + PageParams, + RequestUrl, + Stats, +) +from zyte_common_items import ProbabilityMetadata, ProbabilityRequest + +from zyte_spider_templates.pages.article_heuristics import ( + HeuristicsArticleNavigationPage, +) + + +@pytest.mark.asyncio +async def test_article_page(): + body = b""" + + +
+

Categories

+
+ UX + CSS +
+

+
+

Articles

+ + + Next Page + +

+ + + + """ + response = AnyResponse(HttpResponse("https://example.com", body)) + + rss_content = b""" + + + Sample RSS Feed + http://example.com/feed/rss.xml + This is a sample RSS feed + + Item 1 + http://example.com/item1 + Description of Item 1 + + + Item 2 + http://example.com/item2 + Description of Item 2 + + + + """ + rss_response = AnyResponse( + HttpResponse( + "https://example.com/feed/rss.xml", + rss_content, + headers=HttpResponseHeaders({"Content-Type": "text/xml"}), + ) + ) + + urls_subcategories = [ + {"url": "https://example.com/category/UX", "name": "UX"}, + {"url": "https://example.com/category/CSS", "name": "CSS"}, + {"url": "https://example.com/2024/05/modern-css", "name": "Modern CSS"}, + {"url": "https://example.com/2024/04/how-run-ux", "name": "How to run UX"}, + {"url": "https://example.com/page-2", "name": "Next Page"}, + {"url": "https://another-example.com", "name": "Link to other domain"}, + ] + requests_subcategories = [ + ProbabilityRequest( + url=subcat["url"], + name=f"[heuristics][articleNavigation][subCategories] {subcat['name']}", + headers=None, + metadata=ProbabilityMetadata(probability=0.5), + ) + for subcat in urls_subcategories + ] + + urls_feed = [ + {"url": "https://example.com/feed/rss.xml"}, + ] + requests_feed = [ + ProbabilityRequest( + url=feed["url"], + name="[heuristics][articleNavigation][feed] ", + headers=None, + metadata=ProbabilityMetadata(probability=1.0), + ) + for feed in urls_feed + ] + + feed_items = ["http://example.com/item1", "http://example.com/item2"] + + urls_items = [ + {"url": "https://example.com/category/UX", "name": "UX"}, + {"url": "https://example.com/category/CSS", "name": "CSS"}, + {"url": "https://example.com/2024/05/modern-css", "name": "Modern CSS"}, + {"url": "https://example.com/2024/04/how-run-ux", "name": "How to run UX"}, + {"url": "https://example.com/page-2", "name": "Next Page"}, + {"url": "https://another-example.com", "name": "Link to other domain"}, + ] + requests_items = [ + ProbabilityRequest( + url=item["url"], + name=f"[heuristics][articleNavigation][article] {item['name']}", + headers=None, + metadata=ProbabilityMetadata(probability=0.5), + ) + for item in urls_items + ] + + request_url = RequestUrl(response.url) + rss_url = RequestUrl(rss_response.url) + + # final_navigation_page = True + page_params = PageParams({"skip_subcategories": True}) + page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) + item = await page.to_item() + + assert page.skip_subcategories() + assert item.subCategories[0].url == "https://example.com/feed/rss.xml" + assert [item.url for item in item.items] == [item["url"] for item in urls_items] + + # final_navigation_page = False + page_params = PageParams({"skip_subcategories": False}) + page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) + item = await page.to_item() + + assert not page.skip_subcategories() + assert item.subCategories == requests_feed + requests_subcategories + assert item.items == requests_items + + # no final_navigation_page (False by default) + page_params = PageParams() + page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) + item = await page.to_item() + + assert not page.skip_subcategories() + assert item.subCategories == requests_feed + requests_subcategories + assert item.items == requests_items + + # only_feeds = True, request to page + page_params = PageParams({"only_feeds": True}) + page = HeuristicsArticleNavigationPage(request_url, response, Stats(), 
page_params) + item = await page.to_item() + + assert page.is_only_feeds() + assert item.subCategories[0].url == str(rss_url) + assert [item.url for item in item.items] == [] + + # only_feeds = True, request to feed + page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) + with patch.object( + HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True + ): + item = await page.to_item() + assert page.is_only_feeds() + assert item.subCategories == [] + assert [item.url for item in item.items] == feed_items + + # only_feeds = False, request to page + page_params = PageParams({"only_feeds": False}) + page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) + item = await page.to_item() + + assert not page.is_only_feeds() + assert item.subCategories == requests_feed + requests_subcategories + assert item.items == requests_items + + # only_feeds = False, request to feed + page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) + with patch.object( + HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True + ): + item = await page.to_item() + assert not page.is_only_feeds() + assert item.subCategories == [] + assert [item.url for item in item.items] == feed_items + + # no only_feeds (False by default) + page_params = PageParams() + page = HeuristicsArticleNavigationPage(request_url, response, Stats(), page_params) + item = await page.to_item() + + assert not page.is_only_feeds() + assert item.subCategories == requests_feed + requests_subcategories + assert item.items == requests_items + + # no only_feeds (False by default), request to feed + page = HeuristicsArticleNavigationPage(rss_url, rss_response, Stats(), page_params) + with patch.object( + HeuristicsArticleNavigationPage, "_is_response_feed", return_value=True + ): + item = await page.to_item() + assert not page.is_only_feeds() + assert item.subCategories == [] + assert [item.url for item in item.items] == feed_items diff --git a/tests/pages/test_product_navigation_heuristics.py b/tests/pages/test_product_navigation_heuristics.py index 9fd4250..5e4727e 100644 --- a/tests/pages/test_product_navigation_heuristics.py +++ b/tests/pages/test_product_navigation_heuristics.py @@ -1,5 +1,4 @@ import pytest -from pytest_twisted import ensureDeferred from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl from zyte_common_items import ProbabilityRequest, ProductNavigation @@ -8,7 +7,7 @@ ) -@ensureDeferred +@pytest.mark.asyncio async def test_unknown_product_page(): body = b""" @@ -103,7 +102,7 @@ async def test_unknown_product_page(): assert page._urls_for_category() == all_valid_urls -@ensureDeferred +@pytest.mark.asyncio async def test_crawl_nofollow_links(): page_params = PageParams({"full_domain": "example.com"}) body = b""" @@ -128,6 +127,7 @@ async def test_crawl_nofollow_links(): assert [req.url for req in page.subCategories] == ["https://example.com/can-follow"] +@pytest.mark.deprication_warning def test_deprecated_page_objects(): with pytest.warns(DeprecationWarning, match="page_objects"): from zyte_spider_templates.page_objects import ( # noqa: F401 diff --git a/tests/test_addon.py b/tests/test_addon.py new file mode 100644 index 0000000..0f9b04c --- /dev/null +++ b/tests/test_addon.py @@ -0,0 +1,156 @@ +import pytest +import scrapy +from duplicate_url_discarder_rules import RULE_PATHS +from packaging import version +from scrapy.utils.test import get_crawler +from zyte_common_items.pipelines import 
DropLowProbabilityItemPipeline + +from zyte_spider_templates import ( + AllowOffsiteMiddleware, + CrawlingLogsMiddleware, + IncrementalCrawlMiddleware, + MaxRequestsPerSeedDownloaderMiddleware, + OffsiteRequestsPerSeedMiddleware, + OnlyFeedsMiddleware, + TrackNavigationDepthSpiderMiddleware, + TrackSeedsSpiderMiddleware, +) + +_crawler = get_crawler() +BASELINE_SETTINGS = _crawler.settings.copy_to_dict() + +try: + from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware +except ImportError: + from scrapy.spidermiddlewares.offsite import ( # type: ignore[assignment] + OffsiteMiddleware, + ) + + +# https://github.com/scrapy-plugins/scrapy-zyte-api/blob/a1d81d11854b420248f38e7db49c685a8d46d943/tests/test_addon.py#L109 +def _test_setting_changes(initial_settings, expected_settings): + settings = { + **initial_settings, + "ADDONS": { + "zyte_spider_templates.Addon": 1000, + }, + } + crawler = get_crawler(settings_dict=settings) + crawler._apply_settings() + actual_settings = crawler.settings.copy_to_dict() + + # Test separately settings that copy_to_dict messes up. + for setting in ( + "DOWNLOADER_MIDDLEWARES", + "SCRAPY_POET_PROVIDERS", + "SPIDER_MIDDLEWARES", + "ITEM_PIPELINES", + ): + if setting not in crawler.settings: + assert setting not in expected_settings + continue + assert crawler.settings.getdict(setting) == expected_settings.pop(setting) + del actual_settings[setting] + + for key in BASELINE_SETTINGS: + if key in actual_settings and actual_settings[key] == BASELINE_SETTINGS[key]: + del actual_settings[key] + del actual_settings["ADDONS"] + + assert actual_settings == expected_settings + + +@pytest.mark.parametrize( + ("initial_settings", "expected_settings"), + ( + ( + {}, + { + "CLOSESPIDER_TIMEOUT_NO_ITEM": 600, + "DOWNLOADER_MIDDLEWARES": { + MaxRequestsPerSeedDownloaderMiddleware: 100, + OffsiteMiddleware: None, + AllowOffsiteMiddleware: 500, + }, + "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue", + "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue", + "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue", + "ITEM_PROBABILITY_THRESHOLDS": { + "zyte_common_items.items.Article": 0.1, + "zyte_common_items.items.Product": 0.1, + }, + "DUD_LOAD_RULE_PATHS": RULE_PATHS, + "SCRAPY_POET_DISCOVER": [ + "zyte_spider_templates.pages", + ], + "SPIDER_MIDDLEWARES": { + IncrementalCrawlMiddleware: 45, + OffsiteRequestsPerSeedMiddleware: 49, + OnlyFeedsMiddleware: 108, + TrackNavigationDepthSpiderMiddleware: 110, + TrackSeedsSpiderMiddleware: 550, + CrawlingLogsMiddleware: 1000, + }, + "ITEM_PIPELINES": { + DropLowProbabilityItemPipeline: 0, + }, + "SPIDER_MODULES": [ + "zyte_spider_templates.spiders", + ], + }, + ), + ), +) +@pytest.mark.skipif( + version.parse(scrapy.__version__) < version.parse("2.11.2"), + reason="Test applicable only for Scrapy versions >= 2.11.2", +) +def test_poet_setting_changes_since_scrapy_2_11_2(initial_settings, expected_settings): + _test_setting_changes(initial_settings, expected_settings) + + +@pytest.mark.parametrize( + ("initial_settings", "expected_settings"), + ( + ( + {}, + { + "CLOSESPIDER_TIMEOUT_NO_ITEM": 600, + "DOWNLOADER_MIDDLEWARES": {MaxRequestsPerSeedDownloaderMiddleware: 100}, + "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue", + "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue", + "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue", + "ITEM_PROBABILITY_THRESHOLDS": { + "zyte_common_items.items.Article": 0.1, + "zyte_common_items.items.Product": 0.1, + 
}, + "DUD_LOAD_RULE_PATHS": RULE_PATHS, + "SCRAPY_POET_DISCOVER": [ + "zyte_spider_templates.pages", + ], + "SPIDER_MIDDLEWARES": { + IncrementalCrawlMiddleware: 45, + OffsiteRequestsPerSeedMiddleware: 49, + OnlyFeedsMiddleware: 108, + TrackNavigationDepthSpiderMiddleware: 110, + OffsiteMiddleware: None, + AllowOffsiteMiddleware: 500, + TrackSeedsSpiderMiddleware: 550, + CrawlingLogsMiddleware: 1000, + }, + "ITEM_PIPELINES": { + DropLowProbabilityItemPipeline: 0, + }, + "SPIDER_MODULES": [ + "zyte_spider_templates.spiders", + ], + }, + ), + ), +) +@pytest.mark.skipif( + version.parse(scrapy.__version__) >= version.parse("2.11.2"), + reason="Test applicable only for Scrapy versions < 2.11.2", +) +def test_poet_setting_changes(initial_settings, expected_settings): + _test_setting_changes(initial_settings, expected_settings) diff --git a/tests/test_article.py b/tests/test_article.py new file mode 100644 index 0000000..19a01c1 --- /dev/null +++ b/tests/test_article.py @@ -0,0 +1,606 @@ +from typing import Tuple, Type, cast +from unittest.mock import patch + +import pytest +import requests +import scrapy +from pydantic import ValidationError +from scrapy.statscollectors import StatsCollector +from scrapy_poet import DummyResponse +from scrapy_spider_metadata import get_spider_metadata +from zyte_common_items import ( + Article, + ArticleNavigation, + ProbabilityMetadata, + ProbabilityRequest, + Request, +) + +from zyte_spider_templates._geolocations import ( + GEOLOCATION_OPTIONS, + GEOLOCATION_OPTIONS_WITH_CODE, + Geolocation, +) +from zyte_spider_templates.params import ExtractFrom +from zyte_spider_templates.spiders.article import ( + ArticleCrawlStrategy, + ArticleSpider, + RequestType, +) + +from . import get_crawler +from .utils import assertEqualSpiderMetadata + + +@pytest.mark.parametrize( + "input_data, expected_exception", + ( + ({"url": "https://example.com"}, (None, "")), + ({"urls_file": "https://example.com/list.txt"}, (None, "")), + ( + { + "url": "https://example.com", + "crawl_strategy": ArticleCrawlStrategy.full, + }, + (None, ""), + ), + ({"url": "https://example.com", "crawl_strategy": "full"}, (None, "")), + ({"url": "https://example.com", "max_requests_per_seed": 1000}, (None, "")), + ({"url": "https://example.com", "max_requests_per_seed": 0}, (None, "")), + ( + {"url": "https://example.com", "max_requests_per_seed": -1}, + ( + ValidationError, + ("max_requests_per_seed\n Input should be greater than or equal to 0"), + ), + ), + ( + {"url": "https://example.com", "extract_from": ExtractFrom.browserHtml}, + (None, ""), + ), + ({"url": "https://example.com", "extract_from": "browserHtml"}, (None, "")), + ( + { + "some_field": "https://a.example.com\nhttps://b.example.com\nhttps://c.example.com" + }, + ( + ValidationError, + "Value error, No input parameter defined. 
Please, define one of: url, urls...", + ), + ), + ( + {"url": {"url": "https://example.com"}}, + (ValidationError, "url\n Input should be a valid string"), + ), + ( + {"url": ["https://example.com"]}, + (ValidationError, "url\n Input should be a valid string"), + ), + ( + {"urls": "https://example.com", "crawl_strategy": "unknown"}, + (ValidationError, "crawl_strategy\n Input should be 'full'"), + ), + ( + {"url": "https://example.com", "urls_file": "https://example.com/list.txt"}, + (ValidationError, "Value error, Expected a single input parameter, got 2:"), + ), + ({"url": "wrong_url"}, (ValidationError, "url\n String should match pattern")), + ), +) +def test_parameters(input_data, expected_exception: Tuple[Type[Exception], str]): + exception: Type[Exception] = expected_exception[0] + if exception: + with pytest.raises(exception) as e: + ArticleSpider(**input_data) + assert expected_exception[1] in str(e) + else: + ArticleSpider(**input_data) + + +def test_crawl_strategy_direct_item(): + crawler = get_crawler() + spider = ArticleSpider.from_crawler( + crawler, + url="https://example.com", + crawl_strategy="direct_item", + ) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].callback == cast(ArticleSpider, spider).parse_dynamic + assert start_requests[0].url == "https://example.com" + assert start_requests[0].meta["request_type"] == RequestType.ARTICLE + assert start_requests[0].meta["crawling_logs"]["name"] == "[article]" + assert start_requests[0].meta["crawling_logs"]["page_type"] == "article" + assert start_requests[0].meta["crawling_logs"]["probability"] == 1.0 + + +def test_arguments(): + crawler = get_crawler() + base_kwargs = {"url": "https://example.com"} + ArticleSpider.from_crawler(crawler, **base_kwargs) + + for param, arg, setting, old_setting_value, getter_name, new_setting_value in ( + ("max_requests", "123", "ZYTE_API_MAX_REQUESTS", None, "getint", 123), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + None, + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + '{"browserHtml": true}', + "getdict", + {"browserHtml": True, "geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + '{"geolocation": "IE"}', + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + None, + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + '{"browserHtml": true}', + "getdict", + {"browserHtml": True, "geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + '{"geolocation": "IE"}', + "getdict", + {"geolocation": "DE"}, + ), + ( + "extract_from", + "browserHtml", + "ZYTE_API_PROVIDER_PARAMS", + None, + "getdict", + { + "articleOptions": {"extractFrom": "browserHtml"}, + "articleNavigationOptions": {"extractFrom": "browserHtml"}, + }, + ), + ( + "extract_from", + "httpResponseBody", + "ZYTE_API_PROVIDER_PARAMS", + {"geolocation": "US"}, + "getdict", + { + "articleOptions": {"extractFrom": "httpResponseBody"}, + "articleNavigationOptions": {"extractFrom": "httpResponseBody"}, + "geolocation": "US", + }, + ), + ( + "extract_from", + None, + "ZYTE_API_PROVIDER_PARAMS", + {"geolocation": "US"}, + "getdict", + {"geolocation": "US"}, + ), + ): + kwargs = {param: arg} + settings = {} + if old_setting_value is not None: + settings[setting] = old_setting_value + crawler = get_crawler(settings=settings) + 
ArticleSpider.from_crawler(crawler, **kwargs, **base_kwargs) + getter = getattr(crawler.settings, getter_name) + assert getter(setting) == new_setting_value + + +def test_init_input_with_urls_file(): + crawler = get_crawler() + url = "https://example.com" + + with patch("zyte_spider_templates.spiders.article.requests.get") as mock_get: + response = requests.Response() + response._content = ( + b"https://a.example\n \nhttps://b.example\nhttps://c.example\n\n" + ) + mock_get.return_value = response + spider = ArticleSpider.from_crawler(crawler, urls_file=url) + mock_get.assert_called_with(url) + + start_requests = list(spider.start_requests()) + assert len(start_requests) == 3 + assert start_requests[0].url == "https://a.example" + assert start_requests[1].url == "https://b.example" + assert start_requests[2].url == "https://c.example" + + +def test_init_input_without_urls_file(): + crawler = get_crawler() + base_kwargs = {"url": "https://example.com"} + spider = ArticleSpider.from_crawler(crawler, **base_kwargs) + cast(ArticleSpider, spider)._init_input() + + assert spider.start_urls == ["https://example.com"] + + +def test_metadata(): + actual_metadata = get_spider_metadata(ArticleSpider, normalize=True) + expected_metadata = { + "template": True, + "title": "Article", + "description": "Template for spiders that extract article data from news or blog websites.", + "param_schema": { + "groups": [ + { + "description": "Input data that determines the " + "start URLs of the crawl.", + "id": "inputs", + "title": "Inputs", + "widget": "exclusive", + } + ], + "properties": { + "url": { + "default": "", + "description": "Initial URL for the " + "crawl. Enter the full " + "URL including " + "http(s), you can copy " + "and paste it from " + "your browser. " + "Example: " + "https://toscrape.com/", + "exclusiveRequired": True, + "group": "inputs", + "pattern": "^https?://[^:/\\s]+(:\\d{1,5})?(/[^\\s]*)*(#[^\\s]*)?$", + "title": "URL", + "type": "string", + }, + "urls": { + "anyOf": [ + {"items": {"type": "string"}, "type": "array"}, + {"type": "null"}, + ], + "default": None, + "description": ( + "Initial URLs for the crawl, separated by new lines. Enter the " + "full URL including http(s), you can copy and paste it from your " + "browser. Example: https://toscrape.com/" + ), + "exclusiveRequired": True, + "group": "inputs", + "title": "URLs", + "widget": "textarea", + }, + "urls_file": { + "default": "", + "description": ( + "URL that point to a plain-text file with a list of URLs to " + "crawl, e.g. https://example.com/url-list.txt. The linked file " + "must contain 1 URL per line." + ), + "exclusiveRequired": True, + "group": "inputs", + "pattern": "^https?://[^:/\\s]+(:\\d{1,5})?(/[^\\s]*)*(#[^\\s]*)?$", + "title": "URLs file", + "type": "string", + }, + "incremental": { + "default": False, + "description": ( + "Skip items with URLs already stored in the specified Zyte Scrapy Cloud Collection. " + "This feature helps avoid reprocessing previously crawled items and requests by comparing " + "their URLs against the stored collection." + ), + "title": "Incremental", + "type": "boolean", + }, + "incremental_collection_name": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "default": None, + "description": "Name of the Zyte Scrapy Cloud Collection used during an incremental crawl." 
+ "By default, a Collection named after the spider (or virtual spider) is used, " + "meaning that matching URLs from previous runs of the same spider are skipped, " + "provided those previous runs had `incremental` argument set to `true`." + "Using a different collection name makes sense, for example, in the following cases:" + "- different spiders share a collection." + "- the same spider uses different collections (e.g., for development runs vs. production runs).", + "title": "Incremental Collection Name", + }, + "crawl_strategy": { + "default": "full", + "description": ( + "Determines how input URLs and follow-up URLs are crawled." + ), + "enumMeta": { + "direct_item": { + "description": ( + "Treat input URLs as direct links to " + "articles, and extract an article from each." + ), + "title": "Direct URLs to Articles", + }, + "full": { + "description": ( + "Follow most links within each domain from the list of URLs in an " + "attempt to discover and extract as many articles as possible." + ), + "title": "Full", + }, + }, + "title": "Crawl Strategy", + "enum": ["full", "direct_item"], + "type": "string", + }, + "geolocation": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "description": ("Country of the IP addresses to use."), + "enumMeta": { + code: { + "title": GEOLOCATION_OPTIONS_WITH_CODE[code], + } + for code in sorted(Geolocation) + }, + "title": "Geolocation", + "enum": list( + sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__) + ), + }, + "max_requests": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 100, + "description": ( + "The maximum number of Zyte API requests allowed for the crawl.\n" + "\n" + "Requests with error responses that cannot be retried or exceed " + "their retry limit also count here, but they incur in no costs " + "and do not increase the request count in Scrapy Cloud." + ), + "title": "Max Requests", + "widget": "request-limit", + }, + "max_requests_per_seed": { + "anyOf": [{"minimum": 0, "type": "integer"}, {"type": "null"}], + "default": None, + "description": ( + "The maximum number of follow-up requests allowed per " + "initial URL. Unlimited if not set." + ), + "title": "Max requests per seed", + }, + "extract_from": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "default": None, + "description": ( + "Whether to perform extraction using a browser request " + "(browserHtml) or an HTTP request (httpResponseBody)." + ), + "enumMeta": { + "browserHtml": { + "description": "Use browser rendering. Better quality, but slower and more expensive.", + "title": "browserHtml", + }, + "httpResponseBody": { + "description": "Use raw responses. 
Fast and cheap.", + "title": "httpResponseBody", + }, + }, + "title": "Extraction source", + "enum": ["httpResponseBody", "browserHtml"], + }, + }, + "title": "ArticleSpiderParams", + "type": "object", + }, + } + assertEqualSpiderMetadata(actual_metadata, expected_metadata) + + geolocation = actual_metadata["param_schema"]["properties"]["geolocation"] + assert geolocation["enum"][0] == "AF" + assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"} + assert set(geolocation["enum"]) == set(geolocation["enumMeta"]) + + +def test_crawl(): + url = "https://example.com/article_list" + article_url = "https://example.com/article_page" + item_urls = [ + ("https://example.com/link_1", "article", 0.5), + ("https://example.com/link_2", "article", 0.5), + ("https://example.com/link_4", "feed items", 0.99), + ] + subcategory_urls = [ + ("https://example.com/link_1", "subCategories", 0.5), + ("https://example.com/link_2", "subCategories", 0.5), + ("https://example.com/link_3", "feed", 1.0), + ] + + article_navigation_items = { + "items": [ + ProbabilityRequest( + url=item_url, + name=f"[heuristics][articleNavigation][{heuristic_name}] {item_url.split('/')[-1]}", + metadata=ProbabilityMetadata(probability=probability), + ) + for item_url, heuristic_name, probability in item_urls + ], + "subCategories": [ + ProbabilityRequest( + url=subcategory_url, + name=f"[heuristics][articleNavigation][{heuristic_name}] {subcategory_url.split('/')[-1]}", + metadata=ProbabilityMetadata(probability=probability), + ) + for subcategory_url, heuristic_name, probability in subcategory_urls + ], + } + + crawler = get_crawler() + crawler.stats = StatsCollector(crawler) + spider = ArticleSpider.from_crawler(crawler, url=url) + + # start_requests -> get_seed_request + requests = list(spider.start_requests()) + requests[0].meta["crawling_logs"] = { + "name": "[seed]", + "page_type": "articleNavigation", + "probability": None, + } + requests[0].meta["page_type"] = "articleNavigation" + requests[0].meta["request_type"] = RequestType.SEED + + # parse_navigation + request = scrapy.Request(url=url) + response = DummyResponse(url=request.url, request=request) + navigation = ArticleNavigation( + url="", + items=article_navigation_items["items"], + subCategories=article_navigation_items["subCategories"], + ) + requests = list( + cast(ArticleSpider, spider).parse_dynamic( + response, {ArticleNavigation: navigation} + ) + ) + assert requests[2].url == "https://example.com/link_4" + assert requests[2].meta["request_type"] == RequestType.ARTICLE + assert ( + requests[2].meta["crawling_logs"]["name"] + == "[heuristics][articleNavigation][feed items] link_4" + ) + assert requests[2].meta["crawling_logs"]["page_type"] == "article" + assert requests[2].meta["crawling_logs"]["probability"] == 0.99 + assert requests[2].callback == cast(ArticleSpider, spider).parse_dynamic + + assert requests[5].url == "https://example.com/link_3" + assert requests[5].meta["request_type"] == RequestType.NAVIGATION + assert ( + requests[5].meta["crawling_logs"]["name"] + == "[heuristics][articleNavigation][feed] link_3" + ) + assert requests[5].meta["crawling_logs"]["page_type"] == "subCategories" + assert requests[5].meta["crawling_logs"]["probability"] == 1.0 + assert requests[5].callback == cast(ArticleSpider, spider).parse_dynamic + + assert requests[0].url == "https://example.com/link_1" + assert requests[0].meta["request_type"] == RequestType.ARTICLE_AND_NAVIGATION + assert ( + requests[0].meta["crawling_logs"]["name"] == "[article or subCategories] 
link_1" + ) + assert ( + requests[0].meta["crawling_logs"]["page_type"] == "articleNavigation-heuristics" + ) + assert requests[0].meta["crawling_logs"]["probability"] == 0.5 + assert requests[0].callback == cast(ArticleSpider, spider).parse_dynamic + + assert requests[1].url == "https://example.com/link_2" + assert requests[1].meta["request_type"] == RequestType.ARTICLE_AND_NAVIGATION + assert ( + requests[1].meta["crawling_logs"]["name"] == "[article or subCategories] link_2" + ) + assert ( + requests[1].meta["crawling_logs"]["page_type"] == "articleNavigation-heuristics" + ) + assert requests[1].meta["crawling_logs"]["probability"] == 0.5 + assert requests[1].callback == cast(ArticleSpider, spider).parse_dynamic + + # parse_article + request = scrapy.Request(url=url) + response = DummyResponse(url=request.url, request=request) + article = Article(url=article_url) + assert ( + article + == list( + cast(ArticleSpider, spider).parse_dynamic(response, {Article: article}) + )[0] + ) + + # parse article_and_navigation + request = scrapy.Request(url=url) + response = DummyResponse(url=request.url, request=request) + article = Article(url=article_url) + navigation = ArticleNavigation( + url="", + items=article_navigation_items["items"], + subCategories=article_navigation_items["subCategories"], + ) + requests = list( + cast(ArticleSpider, spider).parse_dynamic( + response, {Article: article, ArticleNavigation: navigation} + ) + ) + + assert requests[0] == article + assert requests[1].url == "https://example.com/link_1" + assert requests[2].url == "https://example.com/link_2" + assert requests[3].url == "https://example.com/link_4" + assert requests[4].url == "https://example.com/link_1" + assert requests[5].url == "https://example.com/link_2" + assert requests[6].url == "https://example.com/link_3" + + # parse_article_and_navigation with article, with next_page and with items + request = scrapy.Request(url=url) + response = DummyResponse(url=request.url, request=request) + article = Article(url=article_url) + navigation = ArticleNavigation( + url="", + items=article_navigation_items["items"], + subCategories=article_navigation_items["subCategories"], + nextPage=Request(url="https://example.com/next_page", name="nextPage"), + ) + requests = list( + cast(ArticleSpider, spider).parse_dynamic( + response, {Article: article, ArticleNavigation: navigation} + ) + ) + + assert requests[0] == article + assert requests[1].url == "https://example.com/next_page" + assert requests[1].meta["request_type"] == RequestType.NEXT_PAGE + assert requests[2].url == "https://example.com/link_1" + assert requests[3].url == "https://example.com/link_2" + assert requests[4].url == "https://example.com/link_4" + assert requests[5].url == "https://example.com/link_1" + assert requests[6].url == "https://example.com/link_2" + assert requests[7].url == "https://example.com/link_3" + + # parse_article_and_navigation with article, with next_page and without items + request = scrapy.Request(url=url) + response = DummyResponse(url=request.url, request=request) + article = Article(url=article_url) + navigation = ArticleNavigation( + url="", + items=[], + subCategories=article_navigation_items["subCategories"], + nextPage=Request(url="https://example.com/next_page", name="nextPage"), + ) + requests = list( + cast(ArticleSpider, spider).parse_dynamic( + response, {Article: article, ArticleNavigation: navigation} + ) + ) + + assert requests[0] == article + assert requests[1].url == "https://example.com/link_1" + assert 
requests[2].url == "https://example.com/link_2" + assert requests[3].url == "https://example.com/link_3" diff --git a/tests/test_feeds.py b/tests/test_feeds.py new file mode 100644 index 0000000..86b2c22 --- /dev/null +++ b/tests/test_feeds.py @@ -0,0 +1,113 @@ +from typing import List, Union + +import pytest +from web_poet import ( + AnyResponse, + BrowserHtml, + BrowserResponse, + HttpResponse, + HttpResponseBody, + ResponseUrl, +) + +from zyte_spider_templates.feeds import get_feed_urls, parse_feed, unique_urls + + +@pytest.fixture +def sample_urls() -> List[str]: + return [ + "http://example.com", + "http://example.com/", + "https://example.com", + "https://example.com/", + "http://example.com/page", + "http://example.com/page/", + ] + + +def test_unique_urls(sample_urls): + unique_list = unique_urls(sample_urls) + assert len(unique_list) == 4 + + +def test_unique_urls_order(sample_urls): + unique_list = unique_urls(sample_urls) + expected_order = [ + "http://example.com", + "https://example.com", + "http://example.com/page", + "http://example.com/page/", + ] + assert unique_list == expected_order + + +@pytest.fixture +def sample_response_feed() -> Union[AnyResponse, HttpResponse, BrowserResponse]: + html_content = """ + + + + + + + RSS Feed + Atom Feed + + + """ + return HttpResponse( + url=ResponseUrl("http://example.com"), + body=HttpResponseBody(html_content.encode(encoding="utf-8")), + ) + + +def test_get_feed_urls(sample_response_feed): + feed_urls = get_feed_urls(sample_response_feed) + assert len(feed_urls) == 3 + assert "http://example.com/rss.xml" in feed_urls + assert "http://example.com/atom.xml" in feed_urls + assert "http://example.com/feed/rss.xml" in feed_urls + + +@pytest.fixture +def sample_response_feeds() -> Union[AnyResponse, HttpResponse, BrowserResponse]: + rss_content = """ + + + Sample RSS Feed + http://example.com/feed/rss.xml + This is a sample RSS feed + + Item 1 + http://example.com/item1 + Description of Item 1 + + + Item 2 + http://example.com/item2 + Description of Item 2 + + + Item 3 + http://example.com/item2 + Description of Item 3 + + + + """ + return HttpResponse( + url=ResponseUrl("http://example.com/feed/rss.xml"), + body=HttpResponseBody(rss_content.encode(encoding="utf-8")), + ) + + +@pytest.mark.parametrize("is_browser_response", [False, True]) +def test_parse_feed(sample_response_feeds, is_browser_response): + if is_browser_response: + sample_response_feeds = BrowserResponse( + url=ResponseUrl("http://example.com"), + html=BrowserHtml(str(sample_response_feeds.text)), + ) + feed_urls = parse_feed(sample_response_feeds) + expected_urls = ["http://example.com/item1", "http://example.com/item2"] + assert feed_urls == expected_urls diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index 11e76b4..7f86a6f 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -1,6 +1,15 @@ import pytest +from scrapy.link import Link +from web_poet import BrowserHtml, BrowserResponse, ResponseUrl -from zyte_spider_templates.heuristics import is_homepage, might_be_category +from zyte_spider_templates.heuristics import ( + classify_article_crawling_links, + classify_article_feed_links, + is_comments_article_feed, + is_feed_content, + is_homepage, + might_be_category, +) @pytest.mark.parametrize( @@ -117,3 +126,225 @@ def test_is_homepage_localization_bad(url): """ assert not is_homepage(url) assert not is_homepage(url + "/") + + +@pytest.mark.parametrize( + "url, expected_result", + [ + ("http://example.com/comments/feed", True), + 
("http://example.com/?feed=comments-rss2", True), + ("http://example.com/article/comments/feed", True), + ("http://example.com/article/?feed=comments-rss2", True), + ("http://example.com/feed", False), + ("http://example.com/?feed=rss2", False), + ("http://example.com/article/feed", False), + ("http://example.com/article/?feed=rss2", False), + ], +) +def test_is_comments_article_feed(url, expected_result): + assert is_comments_article_feed(url) == expected_result + + +@pytest.mark.parametrize( + "links, expected_allowed_urls, expected_disallowed_urls", + [ + ( + [ + Link(url="http://example.com/article1", text="Article 1"), + Link(url="http://example.com/image.jpg", text="Image"), + Link(url="http://example.com/page.html", text="Page"), + Link(url="http://example.com/search", text="Search Page"), + Link(url="http://t.co", text="Social Media"), + ], + ["http://example.com/article1", "http://example.com/page.html"], + [ + "http://example.com/image.jpg", + "http://t.co", + "http://example.com/search", + ], + ) + ], +) +def test_classify_article_crawling_links( + links, expected_allowed_urls, expected_disallowed_urls +): + allowed_links, disallowed_links = classify_article_crawling_links(links) + + assert len(allowed_links) == len(expected_allowed_urls) + assert len(disallowed_links) == len(expected_disallowed_urls) + + for url in expected_allowed_urls: + assert any(link.url == url for link in allowed_links) + + for url in expected_disallowed_urls: + assert any(link.url == url for link in disallowed_links) + + +@pytest.mark.parametrize( + "links, expected_allowed_urls, expected_disallowed_urls", + [ + ( + [ + Link(url="http://example.com/article1", text="Article 1"), + Link(url="http://example.com/feed/rss.xml", text="RSS Feed"), + Link(url="http://example.com/feed/atom.xml", text="Atom Feed"), + Link(url="http://example.com/comments/feed", text="Comments Feed"), + Link(url="http://example.com/page.html", text="Page"), + ], + [ + "http://example.com/article1", + "http://example.com/feed/rss.xml", + "http://example.com/feed/atom.xml", + "http://example.com/page.html", + ], + ["http://example.com/comments/feed"], + ) + ], +) +def test_classify_article_feed_links( + links, expected_allowed_urls, expected_disallowed_urls +): + allowed_links, disallowed_links = classify_article_feed_links(links) + + assert len(allowed_links) == len(expected_allowed_urls) + assert len(disallowed_links) == len(expected_disallowed_urls) + + assert set(link.url for link in allowed_links) == set(expected_allowed_urls) + assert set(link.url for link in disallowed_links) == set(expected_disallowed_urls) + + +def test_is_feed_content_rss(): + rss_content = """ + + + Example Feed + http://example.com/ + Example feed description + + Example entry + http://example.com/entry + Example entry description + + + """ + assert is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(rss_content) + ) + ) + + empty_rss_content = """ + + + + + + + """ + assert is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(empty_rss_content) + ) + ) + + wrong_rss_content = rss_content.replace("channel", "some_channel") + assert not is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(wrong_rss_content) + ) + ) + + +def test_is_feed_content_rdf(): + rdf_content = """ + + + Example Feed + http://example.com/ + Example feed description + + + + + + + + + Example entry + http://example.com/entry + Example entry description + + """ + assert 
is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(rdf_content) + ) + ) + + empty_rdf_content = """ + + + + + + + """ + assert is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(empty_rdf_content) + ) + ) + + wrong_rdf_content = rdf_content.replace("channel", "some_channel") + assert not is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(wrong_rdf_content) + ) + ) + + +def test_is_feed_content_atom(): + atom_content = """ + + Example Feed + + 2024-08-01T00:00:00Z + + John Doe + + urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 + + Example entry + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2024-08-01T00:00:00Z + Example entry description + + """ + assert is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(atom_content) + ) + ) + + empty_atom_content = """ + + + + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + """ + assert is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(empty_atom_content) + ) + ) + + wrong_atom_content = atom_content.replace("id", "some_id") + assert not is_feed_content( + BrowserResponse( + ResponseUrl("https://www.example.com"), BrowserHtml(wrong_atom_content) + ) + ) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 9b808bb..9aa4a7b 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -1,14 +1,26 @@ +import logging +from collections import defaultdict +from typing import Iterable, Union +from unittest.mock import MagicMock + import pytest from freezegun import freeze_time from scrapy import Spider +from scrapy.exceptions import IgnoreRequest, NotConfigured from scrapy.http import Request, Response +from scrapy.settings import Settings from scrapy.statscollectors import StatsCollector from scrapy.utils.misc import create_instance from scrapy.utils.test import get_crawler +from scrapy_poet import DynamicDeps +from zyte_common_items import Article, Item, Product from zyte_spider_templates.middlewares import ( AllowOffsiteMiddleware, CrawlingLogsMiddleware, + MaxRequestsPerSeedDownloaderMiddleware, + OffsiteRequestsPerSeedMiddleware, + TrackSeedsSpiderMiddleware, ) @@ -118,6 +130,39 @@ def test_crawling_logs_middleware(): }, }, ) + article_request = Request( + "https://example.com/article_1", + priority=10, + meta={ + "crawling_logs": { + "name": "Article 1", + "probability": 0.1, + "page_type": "article", + }, + }, + ) + article_navigation_request = Request( + "https://example.com/article_navigation_1", + priority=10, + meta={ + "crawling_logs": { + "name": "Article Navigation 1", + "probability": 0.1, + "page_type": "articleNavigation", + }, + }, + ) + article_navigation_heuristics_request = Request( + "https://example.com/article_and_navigation_1", + priority=10, + meta={ + "crawling_logs": { + "name": "Article And Navigation 1", + "probability": 0.1, + "page_type": "articleNavigation-heuristics", + }, + }, + ) custom_request = Request( "https://example.com/custom-page-type", meta={ @@ -142,6 +187,12 @@ def test_crawling_logs_middleware(): product_navigation_heuristics_request ) custom_request_fp = request_fingerprint(custom_request) + article_request_fp = request_fingerprint(article_request) + article_navigation_request_fp = request_fingerprint(article_navigation_request) + article_navigation_heuristics_request_fp = request_fingerprint( + article_navigation_heuristics_request + ) + custom_request_fp = 
request_fingerprint(custom_request) unknown_request_fp = request_fingerprint(unknown_request) def results_gen(): @@ -150,6 +201,9 @@ def results_gen(): yield subcategory_request yield product_navigation_request yield product_navigation_heuristics_request + yield article_request + yield article_navigation_request + yield article_navigation_heuristics_request yield custom_request yield unknown_request @@ -162,6 +216,9 @@ def results_gen(): "- subCategories: 1\n" "- productNavigation: 1\n" "- productNavigation-heuristics: 1\n" + "- article: 1\n" + "- articleNavigation: 1\n" + "- articleNavigation-heuristics: 1\n" "- some other page_type: 1\n" "- unknown: 1\n" "Structured Logs:\n" @@ -225,6 +282,36 @@ def results_gen(): f' "request_fingerprint": "{product_navigation_heuristics_request_fp}"\n' " }\n" " ],\n" + ' "article": [\n' + " {\n" + ' "name": "Article 1",\n' + ' "probability": 0.1,\n' + ' "page_type": "article",\n' + ' "request_url": "https://example.com/article_1",\n' + ' "request_priority": 10,\n' + f' "request_fingerprint": "{article_request_fp}"\n' + " }\n" + " ],\n" + ' "articleNavigation": [\n' + " {\n" + ' "name": "Article Navigation 1",\n' + ' "probability": 0.1,\n' + ' "page_type": "articleNavigation",\n' + ' "request_url": "https://example.com/article_navigation_1",\n' + ' "request_priority": 10,\n' + f' "request_fingerprint": "{article_navigation_request_fp}"\n' + " }\n" + " ],\n" + ' "articleNavigation-heuristics": [\n' + " {\n" + ' "name": "Article And Navigation 1",\n' + ' "probability": 0.1,\n' + ' "page_type": "articleNavigation-heuristics",\n' + ' "request_url": "https://example.com/article_and_navigation_1",\n' + ' "request_priority": 10,\n' + f' "request_fingerprint": "{article_navigation_heuristics_request_fp}"\n' + " }\n" + " ],\n" ' "some other page_type": [\n' " {\n" ' "name": "Custom Page",\n' @@ -280,8 +367,994 @@ class TestSpider(Spider): middleware = AllowOffsiteMiddleware(stats) middleware.spider_opened(spider) - result = list(middleware.process_spider_output(Response(""), [req], spider)) - if allowed: - assert result == [req] + assert middleware.should_follow(req, spider) == allowed + + +@pytest.fixture +def mock_crawler(): + mock_settings = MagicMock(spec=Settings) + mock_crawler = MagicMock(spec=["spider", "settings"]) + mock_crawler.settings = mock_settings + return mock_crawler + + +@pytest.mark.parametrize("max_requests_per_seed", [10, "10", -10, False, None]) +def test_middleware_init(mock_crawler, max_requests_per_seed): + class TestSpider(Spider): + name = "test" + allowed_domains = ("example.com",) + settings = Settings({"MAX_REQUESTS_PER_SEED": max_requests_per_seed}) + + crawler = get_crawler(TestSpider) + crawler.spider = TestSpider() + if max_requests_per_seed not in [-10, False, None]: + middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler) + assert middleware.max_requests_per_seed == int(max_requests_per_seed) else: - assert result == [] + with pytest.raises(NotConfigured): + MaxRequestsPerSeedDownloaderMiddleware(crawler) + + +@pytest.mark.parametrize( + "seed, requests_per_seed, max_requests_per_seed, expected_result", + [ + ( + "http://example.com", + 2, + 5, + False, + ), # Request count below max_requests_per_seed + ( + "http://example.com", + 2, + 2, + True, + ), # Request count equal to max_requests_per_seed + ( + "http://example.com", + 5, + 2, + True, + ), # Request count above to max_requests_per_seed + ], +) +def test_max_requests_per_seed_reached( + mock_crawler, seed, requests_per_seed, max_requests_per_seed, expected_result 
+): + mock_crawler.spider.settings = Settings( + {"MAX_REQUESTS_PER_SEED": max_requests_per_seed} + ) + downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(mock_crawler) + downloader_middleware.requests_per_seed = defaultdict() + downloader_middleware.requests_per_seed[seed] = requests_per_seed + + assert downloader_middleware.max_requests_per_seed_reached(seed) == expected_result + + +def _get_seed_crawler(): + class TestSpiderSeed(Spider): + name = "test_seed" + + crawler = get_crawler(TestSpiderSeed) + crawler.spider = TestSpiderSeed() + crawler.spider.settings = Settings({"MAX_REQUESTS_PER_SEED": 2}) + return crawler + + +def test_process_request(): + request_url_1 = "https://example.com/1" + request_url_2 = "https://example.com/2" + request_url_3 = "https://example.com/3" + + crawler = _get_seed_crawler() + spider_middleware = TrackSeedsSpiderMiddleware(crawler) + downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler) + request_gen: Iterable[Union[Request, Item]] + request: Union[Request, Item] + + request = Request(url=request_url_1) + request_gen = spider_middleware.process_start_requests([request], crawler.spider) + request = list(request_gen)[0] + assert request.meta["seed"] == request_url_1 + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {request_url_1: 1} + + response = Response(url=request_url_1, request=request) + request = Request(url=request_url_2) + request_gen = spider_middleware.process_spider_output( + response, [request], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert request.meta["seed"] == request_url_1 + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {request_url_1: 2} + + # After reaching the max request for the given seed, requests would be filtered. + response = Response(url=request_url_2, request=request) + request = Request(url=request_url_3) + request_gen = spider_middleware.process_spider_output( + response, [request], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert request.meta["seed"] == request_url_1 + with pytest.raises(IgnoreRequest): + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {request_url_1: 2} + + +def test_process_request_seed_override(): + """This tests the scenario when the 'seed' in the request.meta is overridden in the + start_requests() method. 
+    """
+
+    request_url_1 = "https://example.com/1"
+    request_url_2 = "https://example.com/2"
+    request_url_3 = "https://example.com/3"
+
+    crawler = _get_seed_crawler()
+    spider_middleware = TrackSeedsSpiderMiddleware(crawler)
+    downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler)
+    request_gen: Iterable[Union[Request, Item]]
+    request: Union[Request, Item]
+
+    seed = "some non-url key"
+
+    request = Request(url=request_url_1, meta={"seed": seed})
+    request_gen = spider_middleware.process_start_requests([request], crawler.spider)
+    request = list(request_gen)[0]
+    assert request.meta["seed"] == seed
+    downloader_middleware.process_request(request, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed: 1}
+
+    response = Response(url=request_url_1, request=request)
+    request = Request(url=request_url_2)
+    request_gen = spider_middleware.process_spider_output(
+        response, [request], crawler.spider
+    )
+    request = list(request_gen)[0]
+    assert isinstance(request, Request)
+    assert request.meta["seed"] == seed
+    downloader_middleware.process_request(request, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed: 2}
+
+    # After reaching the max request for the given seed, requests would be filtered.
+
+    response = Response(url=request_url_2, request=request)
+    request = Request(url=request_url_3)
+    request_gen = spider_middleware.process_spider_output(
+        response, [request], crawler.spider
+    )
+    request = list(request_gen)[0]
+    assert isinstance(request, Request)
+    assert request.meta["seed"] == seed
+    with pytest.raises(IgnoreRequest):
+        downloader_middleware.process_request(request, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed: 2}
+
+
+def test_process_request_seed_override_2():
+    """Similar to test_process_request_seed_override(), but the 'seed' value is
+    overridden in one of the callbacks instead of in start_requests().
+    """
+
+    request_url_1 = "https://example.com/1"
+    request_url_2 = "https://example.com/2"
+    request_url_3 = "https://example.com/3"
+    request_url_4 = "https://example.com/4"
+
+    crawler = _get_seed_crawler()
+    spider_middleware = TrackSeedsSpiderMiddleware(crawler)
+    downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler)
+    request_gen: Iterable[Union[Request, Item]]
+    request: Union[Request, Item]
+
+    seed_1 = "some non-url key"
+    seed_2 = "another non-url key"
+
+    request_1 = Request(url=request_url_1, meta={"seed": seed_1})
+    request_gen = spider_middleware.process_start_requests([request_1], crawler.spider)
+    request = list(request_gen)[0]
+    assert request.meta["seed"] == seed_1
+    downloader_middleware.process_request(request, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed_1: 1}
+
+    # This request coming from a callback uses a new seed value.
+ + response = Response(url=request_url_1, request=request_1) + request_2 = Request(url=request_url_2, meta={"seed": seed_2}) + request_gen = spider_middleware.process_spider_output( + response, [request_2], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert request.meta["seed"] == seed_2 + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed_1: 1, seed_2: 1} + + response = Response(url=request_url_2, request=request_2) + request_3 = Request(url=request_url_3) + request_gen = spider_middleware.process_spider_output( + response, [request_3], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert request.meta["seed"] == seed_2 + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed_1: 1, seed_2: 2} + + # After reaching the max request for the 2nd seed, requests would be filtered. + + response = Response(url=request_url_3, request=request_3) + request_4 = Request(url=request_url_4) + request_gen = spider_middleware.process_spider_output( + response, [request_4], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert request.meta["seed"] == seed_2 + with pytest.raises(IgnoreRequest): + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed_1: 1, seed_2: 2} + + +def test_process_request_seed_override_multiple(): + """Similar to test_process_request_seed_override() but multiple start requests + point to the same seed. + """ + + request_url_1 = "https://us.example.com/1" + request_url_2 = "https://fr.example.com/1" + request_url_3 = "https://us.example.com/2" + + crawler = _get_seed_crawler() + spider_middleware = TrackSeedsSpiderMiddleware(crawler) + downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler) + request_gen: Iterable[Union[Request, Item]] + request: Union[Request, Item] + + seed = "some non-url key" + + request = Request(url=request_url_1, meta={"seed": seed}) + request_gen = spider_middleware.process_start_requests([request], crawler.spider) + request = list(request_gen)[0] + assert request.meta["seed"] == seed + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed: 1} + + request = Request(url=request_url_2, meta={"seed": seed}) + request_gen = spider_middleware.process_start_requests([request], crawler.spider) + request = list(request_gen)[0] + assert request.meta["seed"] == seed + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed: 2} + + # After reaching the max request for the given seed, requests would be filtered. 
+ + response = Response(url=request_url_1, request=request) + request = Request(url=request_url_3) + request_gen = spider_middleware.process_spider_output( + response, [request], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert request.meta["seed"] == seed + with pytest.raises(IgnoreRequest): + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed: 2} + + +def test_process_request_seed_override_downstream(): + """This tests the scenario when the 'seed' in the request.meta is overridden in the + start_requests() method, but an unrelated request downstream from another domain + uses the same custom 'seed' value. + """ + + request_url_1 = "https://example.com/1" + request_url_2 = "https://another-example.com/1" + request_url_3 = "https://example.com/2" + + crawler = _get_seed_crawler() + spider_middleware = TrackSeedsSpiderMiddleware(crawler) + downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler) + request_gen: Iterable[Union[Request, Item]] + request: Union[Request, Item] + + seed = "some non-url key" + + request = Request(url=request_url_1, meta={"seed": seed}) + request_gen = spider_middleware.process_start_requests([request], crawler.spider) + request = list(request_gen)[0] + assert request.meta["seed"] == seed + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed: 1} + + response = Response(url=request_url_1, request=request) + request = Request(url=request_url_2, meta={"seed": seed}) + request_gen = spider_middleware.process_spider_output( + response, [request], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert request.meta["seed"] == seed + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed: 2} + + # After reaching the max request for the given seed, requests would be filtered. + + response = Response(url=request_url_2, request=request) + request = Request(url=request_url_3) + request_gen = spider_middleware.process_spider_output( + response, [request], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert request.meta["seed"] == seed + with pytest.raises(IgnoreRequest): + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {seed: 2} + + +def test_process_request_seed_override_remove(): + """This tests the scenario when the 'seed' in the request.meta is overridden in the + start_requests() method, but one of the request explicitly sets the 'seed' to None. 
+    """
+
+    request_url_1 = "https://example.com/1"
+    request_url_2 = "https://example.com/2"  # This one removes the 'seed' in meta
+    request_url_3 = "https://example.com/3"
+    request_url_4 = "https://example.com/4"
+    request_url_5 = "https://example.com/5"
+
+    crawler = _get_seed_crawler()
+    spider_middleware = TrackSeedsSpiderMiddleware(crawler)
+    downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler)
+    request_gen: Iterable[Union[Request, Item]]
+
+    seed = "some non-url key"
+
+    request_1: Request = Request(url=request_url_1, meta={"seed": seed})
+    request_gen = spider_middleware.process_start_requests([request_1], crawler.spider)
+    request = list(request_gen)[0]
+    assert isinstance(request, Request)
+    assert request.meta["seed"] == seed
+    downloader_middleware.process_request(request, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed: 1}
+
+    response = Response(url=request_url_1, request=request_1)
+    request_2: Union[Request, Item] = Request(url=request_url_2, meta={"seed": None})
+    request_gen = spider_middleware.process_spider_output(
+        response, [request_2], crawler.spider
+    )
+    request_2 = list(request_gen)[0]
+    assert isinstance(request_2, Request)
+    assert request_2.meta["seed"] is None
+    downloader_middleware.process_request(request_2, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed: 1}
+
+    # A request coming from the request which sets 'seed' to None won't have a sticky
+    # 'seed' value.
+
+    response = Response(url=request_url_1, request=request_2)
+    request_3: Union[Request, Item] = Request(url=request_url_3)
+    request_gen = spider_middleware.process_spider_output(
+        response, [request_3], crawler.spider
+    )
+    request_3 = list(request_gen)[0]
+    assert isinstance(request_3, Request)
+    assert "seed" not in request_3.meta
+    downloader_middleware.process_request(request_3, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed: 1}
+
+    # However, a request coming from the untampered 'seed' value would still have the
+    # sticky 'seed' value.
+
+    response = Response(url=request_url_1, request=request_1)
+    request_4: Union[Request, Item] = Request(url=request_url_4)
+    request_gen = spider_middleware.process_spider_output(
+        response, [request_4], crawler.spider
+    )
+    request_4 = list(request_gen)[0]
+    assert isinstance(request_4, Request)
+    assert request_4.meta["seed"] == seed
+    downloader_middleware.process_request(request_4, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed: 2}
+
+    # Eventually, the sticky 'seed' would result in filtered requests
+
+    response = Response(url=request_url_5, request=request_4)
+    request_5: Union[Request, Item] = Request(url=request_url_5)
+    request_gen = spider_middleware.process_spider_output(
+        response, [request_5], crawler.spider
+    )
+    request_5 = list(request_gen)[0]
+    assert isinstance(request_5, Request)
+    assert request_5.meta["seed"] == seed
+    with pytest.raises(IgnoreRequest):
+        downloader_middleware.process_request(request_5, crawler.spider)
+    assert downloader_middleware.requests_per_seed == {seed: 2}
+
+
+def test_process_request_seed_none():
+    """The user sets the 'seed' meta to None in the start_requests() method.
+
+    This essentially disables the middleware, and the 'MAX_REQUESTS_PER_SEED' setting has
+    no effect.
+ """ + + request_url_1 = "https://example.com/1" + request_url_2 = "https://example.com/2" + request_url_3 = "https://example.com/3" + + crawler = _get_seed_crawler() + spider_middleware = TrackSeedsSpiderMiddleware(crawler) + downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler) + request_gen: Iterable[Union[Request, Item]] + request: Union[Request, Item] + + request = Request(url=request_url_1, meta={"seed": None}) + request_gen = spider_middleware.process_start_requests([request], crawler.spider) + request = list(request_gen)[0] + assert request.meta["seed"] is None + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {} + + response = Response(url=request_url_1, request=request) + request = Request(url=request_url_2) + request_gen = spider_middleware.process_spider_output( + response, [request], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert "seed" not in request.meta + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {} + + # Unlike the other tests above, the 3rd one pushes through since the + # 'MAX_REQUESTS_PER_SEED' setting takes no effect. + + response = Response(url=request_url_2, request=request) + request = Request(url=request_url_3) + request_gen = spider_middleware.process_spider_output( + response, [request], crawler.spider + ) + request = list(request_gen)[0] + assert isinstance(request, Request) + assert "seed" not in request.meta + downloader_middleware.process_request(request, crawler.spider) + assert downloader_middleware.requests_per_seed == {} + + +def test_offsite_requests_per_seed_middleware_not_configured(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler() + crawler.spider = TestSpider() + crawler.spider.settings = Settings({"OFFSITE_REQUESTS_PER_SEED_ENABLED": False}) + with pytest.raises(NotConfigured): + OffsiteRequestsPerSeedMiddleware(crawler) + + +def test_offsite_requests_per_seed_middleware(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler() + crawler.spider = TestSpider() + crawler.stats = StatsCollector(crawler) + crawler.spider.settings = Settings({"OFFSITE_REQUESTS_PER_SEED_ENABLED": True}) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + + # no result + request = Request(url="https://example.com/1") + response = Response(url=request.url, request=request) + result = list(middleware.process_spider_output(response, [], crawler.spider)) + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + assert result == [] + assert middleware.allowed_domains_per_seed == {} + assert "seed_url" not in request.meta + + # is_seed_request: True, domain allowed + seed = "https://example.com/1" + request = Request( + url="https://example.com/1", meta={"is_seed_request": True, "seed": seed} + ) + item = Article(url="https://example.com/article") + response = Response(url=request.url, request=request) + result = list( + middleware.process_spider_output(response, [request, item], crawler.spider) + ) + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == { + "https://example.com/1": {"example.com"} + } + assert request.meta["seed"] == 
request.url + + # "seed" in meta, domain allowed + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + middleware.allowed_domains_per_seed = defaultdict(set, {seed: {"example.com"}}) + request = Request(url="https://example.com/2", meta={"seed": seed}) + + response = Response(url=request.url, request=request) + result = list( + middleware.process_spider_output(response, [request, item], crawler.spider) + ) + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == {seed: {"example.com"}} + assert request.meta["seed"] == seed + + # "seed" in meta, domain disallowed + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + request = Request(url="https://example_1.com/1", meta={"seed": seed}) + response = Response(url=request.url, request=request) + result = list( + middleware.process_spider_output(response, [request, item], crawler.spider) + ) + assert crawler.stats.get_value("offsite_requests_per_seed/domains") == 1 + assert crawler.stats.get_value("offsite_requests_per_seed/filtered") == 1 + assert result == [item] + assert middleware.allowed_domains_per_seed == {} + assert request.meta["seed"] == seed + + # seed not in meta + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + request = Request(url="https://example.com/1") + + response = Response(url=request.url, request=request) + result = list( + middleware.process_spider_output(response, [request, item], crawler.spider) + ) + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == {} + assert "seed" not in request.meta + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + + # "seed" in meta, seed_domains are in meta, seed_domain allowed + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + request = Request( + url="https://example_1.com/1", + meta={"seed": seed, "seed_domains": {"example_1.com"}}, + ) + response = Response(url=request.url, request=request) + result = list( + middleware.process_spider_output(response, [request, item], crawler.spider) + ) + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == {seed: {"example_1.com"}} + assert request.meta["seed"] == seed + + # "seed" in meta, seed_domains are in meta, seed_domain disallowed + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + request = Request( + url="https://example_1.com/1", + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + response = Response(url=request.url, request=request) + result = list( + middleware.process_spider_output(response, [request, item], crawler.spider) + ) + assert crawler.stats.get_value("offsite_requests_per_seed/domains") == 1 + assert crawler.stats.get_value("offsite_requests_per_seed/filtered") == 1 + assert result == [item] + assert 
middleware.allowed_domains_per_seed == {seed: {"example.com"}} + assert request.meta["seed"] == seed + + # Offsite request - 1st offsite request is NOT filtered out to extract article + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + seed_request = Request( + url=seed, + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + seed_response = Response(url=seed, request=seed_request) + request = Request( + url="https://another-example.com", + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + item = Article(url="https://another-example.com") + result = list( + middleware.process_spider_output(seed_response, [request, item], crawler.spider) + ) + assert crawler.stats.get_value("offsite_requests_per_seed/domains") is None + assert crawler.stats.get_value("offsite_requests_per_seed/filtered") is None + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == {seed: {"example.com"}} + assert request.meta["seed"] == seed + + # Offsite request - consequent offsite request are filtered out + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + prev_request = Request( + url="https://another-example.com", + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + prev_response = Response(url="https://another-example.com", request=prev_request) + request = Request( + url="https://another-example.com/page-2", + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + result = list( + middleware.process_spider_output(prev_response, [request, item], crawler.spider) + ) + assert crawler.stats.get_value("offsite_requests_per_seed/domains") == 1 + assert crawler.stats.get_value("offsite_requests_per_seed/filtered") == 1 + assert result == [item] + assert middleware.allowed_domains_per_seed == {seed: {"example.com"}} + assert request.meta["seed"] == seed + + +@pytest.mark.asyncio +async def result_as_async_gen(middleware, response, result, spider): + async def async_generator(): + for r in result: + yield r + + processed_result = [] + async for processed_request in middleware.process_spider_output_async( + response, async_generator(), spider + ): + processed_result.append(processed_request) + return processed_result + + +@pytest.mark.asyncio +async def test_offsite_requests_per_seed_middleware_async(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler() + crawler.spider = TestSpider() + crawler.stats = StatsCollector(crawler) + crawler.spider.settings = Settings({"OFFSITE_REQUESTS_PER_SEED_ENABLED": True}) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + + # no result + request = Request(url="https://example.com/1") + response = Response(url=request.url, request=request) + result = await result_as_async_gen(middleware, response, [], crawler.spider) + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + assert result == [] + assert middleware.allowed_domains_per_seed == {} + assert "seed_url" not in request.meta + + # is_seed_request: True, domain allowed + seed = "https://example.com/1" + request = Request( + url="https://example.com/1", meta={"is_seed_request": True, "seed": seed} + ) + item = Article(url="https://example.com/article") + response = Response(url=request.url, request=request) + result = await result_as_async_gen( + middleware, response, 
[request, item], crawler.spider + ) + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == { + "https://example.com/1": {"example.com"} + } + assert request.meta["seed"] == request.url + + # "seed" in meta, domain allowed + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + middleware.allowed_domains_per_seed = defaultdict(set, {seed: {"example.com"}}) + request = Request(url="https://example.com/2", meta={"seed": seed}) + + response = Response(url=request.url, request=request) + result = await result_as_async_gen( + middleware, response, [request, item], crawler.spider + ) + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == {seed: {"example.com"}} + assert request.meta["seed"] == seed + + # "seed" in meta, domain disallowed + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + request = Request(url="https://example_1.com/1", meta={"seed": seed}) + response = Response(url=request.url, request=request) + result = await result_as_async_gen( + middleware, response, [request, item], crawler.spider + ) + assert crawler.stats.get_value("offsite_requests_per_seed/domains") == 1 + assert crawler.stats.get_value("offsite_requests_per_seed/filtered") == 1 + assert result == [item] + assert middleware.allowed_domains_per_seed == {} + assert request.meta["seed"] == seed + + # seed not in meta + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + request = Request(url="https://example.com/1") + + response = Response(url=request.url, request=request) + result = await result_as_async_gen( + middleware, response, [request, item], crawler.spider + ) + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == {} + assert "seed" not in request.meta + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + + # "seed" in meta, seed_domains are in meta, seed_domain allowed + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + request = Request( + url="https://example_1.com/1", + meta={"seed": seed, "seed_domains": {"example_1.com"}}, + ) + response = Response(url=request.url, request=request) + result = await result_as_async_gen( + middleware, response, [request, item], crawler.spider + ) + assert not crawler.stats.get_value("offsite_requests_per_seed/domains") + assert not crawler.stats.get_value("offsite_requests_per_seed/filtered") + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == {seed: {"example_1.com"}} + assert request.meta["seed"] == seed + + # "seed" in meta, seed_domains are in meta, seed_domain disallowed + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + request = Request( + url="https://example_1.com/1", + meta={"seed": seed, "seed_domains": 
{"example.com"}}, + ) + response = Response(url=request.url, request=request) + result = await result_as_async_gen( + middleware, response, [request, item], crawler.spider + ) + assert crawler.stats.get_value("offsite_requests_per_seed/domains") == 1 + assert crawler.stats.get_value("offsite_requests_per_seed/filtered") == 1 + assert result == [item] + assert middleware.allowed_domains_per_seed == {seed: {"example.com"}} + assert request.meta["seed"] == seed + + # Offsite request - 1st offsite request is NOT filtered out to extract article + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + seed_request = Request( + url=seed, + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + seed_response = Response(url=seed, request=seed_request) + request = Request( + url="https://another-example.com", + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + item = Article(url="https://another-example.com") + result = await result_as_async_gen( + middleware, seed_response, [request, item], crawler.spider + ) + assert crawler.stats.get_value("offsite_requests_per_seed/domains") is None + assert crawler.stats.get_value("offsite_requests_per_seed/filtered") is None + assert result[0] == request + assert result[1] == item + assert middleware.allowed_domains_per_seed == {seed: {"example.com"}} + assert request.meta["seed"] == seed + + # Offsite request - consequent offsite request are filtered out + crawler.stats = StatsCollector(crawler) + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + seed = "https://example.com/1" + prev_request = Request( + url="https://another-example.com", + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + prev_response = Response(url="https://another-example.com", request=prev_request) + request = Request( + url="https://another-example.com/page-2", + meta={"seed": seed, "seed_domains": {"example.com"}}, + ) + result = await result_as_async_gen( + middleware, prev_response, [request, item], crawler.spider + ) + assert crawler.stats.get_value("offsite_requests_per_seed/domains") == 1 + assert crawler.stats.get_value("offsite_requests_per_seed/filtered") == 1 + assert result == [item] + assert middleware.allowed_domains_per_seed == {seed: {"example.com"}} + assert request.meta["seed"] == seed + + +@pytest.mark.parametrize( + "meta, expected_is_seed_request, expected_seed", + [ + ({"is_seed_request": True}, True, "https://example.com/1"), + ({"is_seed_request": False, "seed": "test_seed"}, False, "test_seed"), + ({}, True, "https://example.com/1"), + ], +) +def test_track_seeds_process_start_requests( + meta, expected_is_seed_request, expected_seed +): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler() + crawler.spider = TestSpider() + crawler.stats = StatsCollector(crawler) + middleware = TrackSeedsSpiderMiddleware(crawler) + start_request_url = "https://example.com/1" + start_request = Request(url=start_request_url, meta=meta) + result = list(middleware.process_start_requests([start_request], TestSpider())) + assert result[0].meta["is_seed_request"] == expected_is_seed_request + assert result[0].meta["seed"] == expected_seed + + +@pytest.mark.parametrize( + "input_response, expected_urls", + ( + ( + Response( + url="https://a.example", + request=Request( + url="https://a.example", + cb_kwargs={ + 0: DynamicDeps({Article: Article(url="https://a.example")}) # type: ignore[dict-item] + }, + ), + ), + {"a.example"}, + ), + ( + Response( + 
url="https://a.example", + request=Request( + url="https://a.example", + cb_kwargs={ + 0: DynamicDeps({Article: Article(url="https://b.example")}) # type: ignore[dict-item] + }, + ), + ), + {"a.example", "b.example"}, + ), + ( + Response( + url="https://a.example", + request=Request( + url="https://a.example", + cb_kwargs={ + 0: DynamicDeps( + { + Article: Article( + url="https://b.example", + canonicalUrl="https://b.example", + ) + } + ) # type: ignore[dict-item] + }, + ), + ), + {"a.example", "b.example"}, + ), + ( + Response( + url="https://a.example", + request=Request( + url="https://a.example", + cb_kwargs={ + 0: DynamicDeps( + { + Article: Article( + url="https://b.example", + canonicalUrl="https://c.example", + ) + } + ) # type: ignore[dict-item] + }, + ), + ), + {"a.example", "b.example", "c.example"}, + ), + ( + Response( + url="https://a.example", + request=Request( + url="https://b.example", + cb_kwargs={ + 0: DynamicDeps( + { + Article: Article( + url="https://c.example", + canonicalUrl="https://d.example", + ) + } + ) # type: ignore[dict-item] + }, + ), + ), + {"b.example", "c.example", "d.example"}, + ), + ( + Response( + url="https://a.example", + request=Request( + url="https://b.example", + cb_kwargs={ + 0: DynamicDeps( + { + Product: Product( + url="https://c.example", + canonicalUrl="https://d.example", + ) + } + ) # type: ignore[dict-item] + }, + ), + ), + {"b.example"}, + ), + ), +) +def test_get_allowed_domains(input_response, expected_urls, caplog): + class TestSpider(Spider): + name = "test" + + caplog.clear() + crawler = get_crawler() + crawler.spider = TestSpider() + crawler.stats = StatsCollector(crawler) + crawler.spider.settings = Settings({"OFFSITE_REQUESTS_PER_SEED_ENABLED": True}) + logging.getLogger().setLevel(logging.DEBUG) + + middleware = OffsiteRequestsPerSeedMiddleware(crawler) + result = middleware._get_allowed_domains(input_response) + + assert result == expected_urls + item = input_response.request.cb_kwargs[0] + if Article not in input_response.request.cb_kwargs[0]: + assert caplog.messages[-1] == f"This type of item: {type(item)} is not allowed" + + +def test_from_crawler(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler() + crawler.spider = TestSpider() + crawler.spider.settings = Settings({"OFFSITE_REQUESTS_PER_SEED_ENABLED": True}) + + assert isinstance( + OffsiteRequestsPerSeedMiddleware.from_crawler(crawler=crawler), + OffsiteRequestsPerSeedMiddleware, + ) diff --git a/tests/test_search.py b/tests/test_search.py index c4554a8..ad01dc3 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -1,5 +1,4 @@ import pytest -from pytest_twisted import ensureDeferred from web_poet import AnyResponse, BrowserResponse, HttpResponse, PageParams from zyte_spider_templates.pages.search_request_template import ( @@ -617,7 +616,7 @@ ), ), ) -@ensureDeferred +@pytest.mark.asyncio async def test_search_request_template(html, page_params, expected, caplog): caplog.clear() caplog.at_level("ERROR") @@ -646,7 +645,7 @@ async def test_search_request_template(html, page_params, expected, caplog): assert expected.get("body", b"") == (search_request.body or b"") -@ensureDeferred +@pytest.mark.asyncio async def test_search_request_template_browser(caplog): """Do not suggest using a browser request if that is already the case.""" caplog.clear() diff --git a/tests/test_utils.py b/tests/test_utils.py index 515693e..0faaf86 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,10 @@ import pytest -from 
zyte_spider_templates.utils import get_domain, load_url_list +from zyte_spider_templates.utils import ( + get_domain, + get_domain_fingerprint, + load_url_list, +) URL_TO_DOMAIN = ( ("https://example.com", "example.com"), @@ -47,3 +51,22 @@ def test_load_url_list(input_urls, expected): return with pytest.raises(expected): load_url_list(input_urls) + + +@pytest.mark.parametrize( + "url, expected_fingerprint", + [ + # No subdomain + ("https://example.com", "c300"), + # One subdomain + ("https://sub.example.com", "c35d"), + # Multiple subdomains + ("https://sub1.sub2.example.com", "c3c9"), + # No TLD (localhost or internal addresses) + ("http://localhost", "3300"), + # Complex TLD (e.g., .co.uk) and subdomains + ("https://sub.example.co.uk", "c35d"), + ], +) +def test_get_domain_fingerprint(url, expected_fingerprint): + assert get_domain_fingerprint(url) == expected_fingerprint diff --git a/tox.ini b/tox.ini index 576d792..27dcd0e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,19 +1,21 @@ [tox] -envlist = min,py38,py39,py310,py311,py312,mypy,linters,twine +envlist = min,py39,py310,py311,py312,mypy,linters,twine [testenv] deps = pytest + pytest-asyncio pytest-cov - pytest-twisted freezegun commands = py.test \ --cov-report=html:coverage-html \ + --doctest-modules \ --cov-report=html \ --cov-report=xml \ --cov=zyte_spider_templates \ - --reactor=asyncio \ + -vv \ + -m "not deprication_warning" \ {posargs:zyte_spider_templates tests} [testenv:min] @@ -25,12 +27,14 @@ deps = formasaurus==0.10.0 jmespath==0.9.5 pydantic==2.1 - requests==1.0.0 + requests==2.31.0 + scrapinghub==2.4.0 scrapy==2.11.0 scrapy-poet==0.24.0 scrapy-spider-metadata==0.2.0 scrapy-zyte-api[provider]==0.23.0 web-poet==0.17.1 + xtractmime==0.2.1 zyte-common-items==0.26.2 [testenv:mypy] diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py index 75bfbde..7986e92 100644 --- a/zyte_spider_templates/__init__.py +++ b/zyte_spider_templates/__init__.py @@ -1,3 +1,16 @@ +from ._incremental.middleware import IncrementalCrawlMiddleware +from .middlewares import ( + AllowOffsiteMiddleware, + CrawlingLogsMiddleware, + MaxRequestsPerSeedDownloaderMiddleware, + OffsiteRequestsPerSeedMiddleware, + OnlyFeedsMiddleware, + TrackNavigationDepthSpiderMiddleware, + TrackSeedsSpiderMiddleware, +) +from .spiders.article import ArticleSpider from .spiders.base import BaseSpider, BaseSpiderParams from .spiders.ecommerce import EcommerceSpider from .spiders.serp import GoogleSearchSpider + +from ._addon import Addon # isort: skip diff --git a/zyte_spider_templates/_addon.py b/zyte_spider_templates/_addon.py new file mode 100644 index 0000000..3f63d36 --- /dev/null +++ b/zyte_spider_templates/_addon.py @@ -0,0 +1,170 @@ +from logging import getLogger +from typing import Any, List, Optional, Type + +from duplicate_url_discarder_rules import RULE_PATHS +from scrapy.settings import BaseSettings +from scrapy.utils.misc import load_object +from zyte_common_items.pipelines import DropLowProbabilityItemPipeline + +from zyte_spider_templates import ( + AllowOffsiteMiddleware, + CrawlingLogsMiddleware, + IncrementalCrawlMiddleware, + MaxRequestsPerSeedDownloaderMiddleware, + OffsiteRequestsPerSeedMiddleware, + OnlyFeedsMiddleware, + TrackNavigationDepthSpiderMiddleware, + TrackSeedsSpiderMiddleware, +) + +logger = getLogger(__name__) + + +def _extend_module_list(settings: BaseSettings, setting: str, item: str) -> None: + spider_modules: List[str] = settings.getlist(setting) + if item not in spider_modules: + spider_modules_priority = 
settings.getpriority(setting) + settings.set( + setting, + spider_modules + [item], + priority=spider_modules_priority, # type: ignore[arg-type] + ) + + +def _replace_builtin( + settings: BaseSettings, setting: str, builtin_cls: Type, new_cls: Type +) -> None: + setting_value = settings[setting] + if not setting_value: + logger.warning( + f"Setting {setting!r} is empty. Could not replace the built-in " + f"{builtin_cls} entry with {new_cls}. Add {new_cls} manually to " + f"silence this warning." + ) + return None + + if new_cls in setting_value: + return None + for cls_or_path in setting_value: + if isinstance(cls_or_path, str): + _cls = load_object(cls_or_path) + if _cls == new_cls: + return None + + builtin_entry: Optional[Any] = None + for _setting_value in (setting_value, settings[f"{setting}_BASE"]): + if builtin_cls in setting_value: + builtin_entry = builtin_cls + pos = _setting_value[builtin_entry] + break + for cls_or_path in setting_value: + if isinstance(cls_or_path, str): + _cls = load_object(cls_or_path) + if _cls == builtin_cls: + builtin_entry = cls_or_path + pos = _setting_value[builtin_entry] + break + if builtin_entry: + break + + if not builtin_entry: + logger.warning( + f"Settings {setting!r} and {setting + '_BASE'!r} are both " + f"missing built-in entry {builtin_cls}. Cannot replace it with {new_cls}. " + f"Add {new_cls} manually to silence this warning." + ) + return None + + if pos is None: + logger.warning( + f"Built-in entry {builtin_cls} of setting {setting!r} is disabled " + f"(None). Cannot replace it with {new_cls}. Add {new_cls} " + f"manually to silence this warning. If you had replaced " + f"{builtin_cls} with some other entry, you might also need to " + f"disable that other entry for things to work as expected." + ) + return + + settings[setting][builtin_entry] = None + settings[setting][new_cls] = pos + + +# https://github.com/scrapy-plugins/scrapy-zyte-api/blob/a1d81d11854b420248f38e7db49c685a8d46d943/scrapy_zyte_api/addon.py#L12 +def _setdefault(settings: BaseSettings, setting: str, cls: Type, pos: int) -> None: + setting_value = settings[setting] + if not setting_value: + settings[setting] = {cls: pos} + return None + if cls in setting_value: + return None + for cls_or_path in setting_value: + if isinstance(cls_or_path, str): + _cls = load_object(cls_or_path) + if _cls == cls: + return None + settings[setting][cls] = pos + + +class Addon: + def update_settings(self, settings: BaseSettings) -> None: + for setting, value in ( + ("CLOSESPIDER_TIMEOUT_NO_ITEM", 600), + ("SCHEDULER_DISK_QUEUE", "scrapy.squeues.PickleFifoDiskQueue"), + ("SCHEDULER_MEMORY_QUEUE", "scrapy.squeues.FifoMemoryQueue"), + ("SCHEDULER_PRIORITY_QUEUE", "scrapy.pqueues.DownloaderAwarePriorityQueue"), + ( + "ITEM_PROBABILITY_THRESHOLDS", + { + "zyte_common_items.items.Article": 0.1, + "zyte_common_items.items.Product": 0.1, + }, + ), + ("DUD_LOAD_RULE_PATHS", RULE_PATHS), + ): + settings.set(setting, value, priority="addon") + + _extend_module_list( + settings, "SCRAPY_POET_DISCOVER", "zyte_spider_templates.pages" + ) + _extend_module_list(settings, "SPIDER_MODULES", "zyte_spider_templates.spiders") + + _setdefault( + settings, + "DOWNLOADER_MIDDLEWARES", + MaxRequestsPerSeedDownloaderMiddleware, + 100, + ) + _setdefault(settings, "SPIDER_MIDDLEWARES", IncrementalCrawlMiddleware, 45) + _setdefault( + settings, "SPIDER_MIDDLEWARES", OffsiteRequestsPerSeedMiddleware, 49 + ) + _setdefault(settings, "SPIDER_MIDDLEWARES", TrackSeedsSpiderMiddleware, 550) + _setdefault(settings, 
"SPIDER_MIDDLEWARES", OnlyFeedsMiddleware, 108) + _setdefault( + settings, "SPIDER_MIDDLEWARES", TrackNavigationDepthSpiderMiddleware, 110 + ) + _setdefault(settings, "SPIDER_MIDDLEWARES", CrawlingLogsMiddleware, 1000) + _setdefault(settings, "ITEM_PIPELINES", DropLowProbabilityItemPipeline, 0) + + try: + from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware + except ImportError: + from scrapy.spidermiddlewares.offsite import ( # type: ignore[assignment] + OffsiteMiddleware, + ) + + _setdefault(settings, "SPIDER_MIDDLEWARES", OffsiteMiddleware, 500) + _replace_builtin( + settings, + "SPIDER_MIDDLEWARES", + OffsiteMiddleware, + AllowOffsiteMiddleware, + ) + else: + _setdefault(settings, "DOWNLOADER_MIDDLEWARES", OffsiteMiddleware, 500) + _replace_builtin( + settings, + "DOWNLOADER_MIDDLEWARES", + OffsiteMiddleware, + AllowOffsiteMiddleware, + ) diff --git a/zyte_spider_templates/_incremental/__init__.py b/zyte_spider_templates/_incremental/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/zyte_spider_templates/_incremental/manager.py b/zyte_spider_templates/_incremental/manager.py new file mode 100644 index 0000000..275c0b3 --- /dev/null +++ b/zyte_spider_templates/_incremental/manager.py @@ -0,0 +1,276 @@ +import asyncio +import logging +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Set, Tuple, Union + +import scrapinghub +from itemadapter import ItemAdapter +from scrapinghub.client.exceptions import Unauthorized +from scrapy import signals +from scrapy.crawler import Crawler +from scrapy.http.request import Request +from zyte_common_items import Item + +from zyte_spider_templates.utils import ( + get_project_id, + get_request_fingerprint, + get_spider_name, +) + +logger = logging.getLogger(__name__) + +INCREMENTAL_SUFFIX = "_incremental" +COLLECTION_API_URL = "https://storage.scrapinghub.com/collections" + +THREAD_POOL_EXECUTOR = ThreadPoolExecutor(max_workers=10) + + +class CollectionsFingerprintsManager: + def __init__(self, crawler: Crawler) -> None: + self.writer = None + self.collection = None + self.crawler = crawler + + self.batch: Set[Tuple[str, str]] = set() + self.batch_size = crawler.settings.getint("INCREMENTAL_CRAWL_BATCH_SIZE", 50) + + project_id = get_project_id(crawler) + collection_name = self.get_collection_name(crawler) + + self.init_collection(project_id, collection_name) + self.api_url = f"{COLLECTION_API_URL}/{project_id}/s/{collection_name}" + + logger.info( + f"Configuration of CollectionsFingerprintsManager for IncrementalCrawlMiddleware:\n" + f"batch_size: {self.batch_size},\n" + f"project: {project_id},\n" + f"collection_name: {collection_name}" + ) + + crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) + + def get_collection_name(self, crawler): + return ( + crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME") + or f"{get_spider_name(crawler)}{INCREMENTAL_SUFFIX}" + ) + + def init_collection(self, project_id, collection_name) -> None: + # auth is taken from SH_APIKEY or SHUB_JOBAUTH + client = scrapinghub.ScrapinghubClient() + collection = client.get_project(project_id).collections.get_store( + collection_name + ) + try: + # Trying to get a random key to make sure the collection exists. 
+ collection.list(key=["init_key"]) + except scrapinghub.client.exceptions.NotFound as e: + if f"unknown collection {collection_name}" in str(e): + logger.info( + f"The collection: {collection_name} for {project_id=} doesn't exist" + f" and will be created automatically" + ) + # This trick forces the creation of a collection. + collection.set({"_key": "init", "value": "1"}) + collection.delete("init") + else: + logger.error(f"The error {e} for {project_id=}") + raise RuntimeError("incremental_crawling__not_found_exception") + except Unauthorized: + logger.error("The api key (SH_APIKEY or SHUB_JOBAUTH) is not valid.") + raise ValueError("incremental_crawling__api_key_not_vaild") + + self.collection = collection + self.writer = self.collection.create_writer() # type: ignore + + def save_to_collection(self, items_to_save) -> None: + """Saves the current batch of fingerprints to the collection.""" + items = [{"_key": key, "value": value} for key, value in items_to_save] + self.writer.write(items) # type: ignore + self.writer.flush() # type: ignore + + async def get_keys_from_collection_async(self, keys: Set[str]) -> Set[str]: + """Asynchronously fetches a set of keys from the collection using an executor to run in separate threads.""" + return await asyncio.get_event_loop().run_in_executor( + THREAD_POOL_EXECUTOR, lambda: self.get_keys_from_collection(keys) + ) + + async def read_batches(self, fingerprints: List[str], batch_start: int) -> Set[str]: + """Reads a specific batch of fingerprints and fetches corresponding keys asynchronously.""" + return await self.get_keys_from_collection_async( + set(fingerprints[batch_start : batch_start + self.batch_size]) + ) + + def get_keys_from_collection(self, keys: Set[str]) -> Set[str]: + """Synchronously fetches a set of keys from the collection.""" + return {item.get("_key", "") for item in self.collection.list(key=keys)} # type: ignore + + async def get_existing_fingerprints_async( + self, fingerprints: List[str] + ) -> Set[str]: + """Asynchronously checks for duplicate fingerprints in both the collection and the local buffer. 
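Editor's sketch (not part of the patch): the manager above only uses a handful of python-scrapinghub calls; the snippet below isolates them with placeholder project id and collection name, mirroring what save_to_collection() and get_keys_from_collection() do.

import scrapinghub

# Placeholder project id and collection name, for illustration only.
project_id = 12345
collection_name = "article_incremental"

# Authentication is taken from SH_APIKEY or SHUB_JOBAUTH, as in init_collection() above.
client = scrapinghub.ScrapinghubClient()
store = client.get_project(project_id).collections.get_store(collection_name)

# Write one fingerprint/URL pair and read it back by key.
store.set({"_key": "example-fingerprint", "value": "https://example.com/article"})
seen_keys = {item.get("_key", "") for item in store.list(key=["example-fingerprint"])}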
+ Async interaction with the collection could be replaced by + https://github.com/scrapinghub/python-scrapinghub/issues/169 in the future""" + + fingerprints_size = len(fingerprints) + + if fingerprints_size == 0: + return set() + + duplicated_fingerprints = set() + + tasks = [ + self.read_batches(fingerprints, i) + for i in range(0, fingerprints_size, self.batch_size) + ] + for future in asyncio.as_completed(tasks): + try: + batch_keys = await future + duplicated_fingerprints.update(batch_keys) + except Exception as e: + logging.error(f"Error while processing batch: {e}") + + # Check duplicates in the local buffer + local_duplicates = set(fingerprints) & {fp for fp, _ in self.batch} + duplicated_fingerprints.update(local_duplicates) + + return duplicated_fingerprints + + def add_to_batch(self, fp_url_map: Set[Tuple[str, str]]) -> None: + """ + Add the list of provided fingerprints and corresponding URLs per one item to the batch + """ + for fp_url in fp_url_map: + logger.debug(f"Adding fingerprint and URL ({fp_url}) to batch.") + self.crawler.stats.inc_value( # type: ignore[union-attr] + "incremental_crawling/fingerprint_url_to_batch" + ) + self.batch.add(fp_url) + if len(self.batch) >= self.batch_size: + self.save_batch() + self.crawler.stats.inc_value("incremental_crawling/add_to_batch") # type: ignore[union-attr] + + def save_batch(self) -> None: + if not self.batch: + return + logger.debug( + f"Saving {len(self.batch)} fingerprints to the Collection. " + f"The fingerprints are: {self.batch}." + ) + self.crawler.stats.inc_value("incremental_crawling/batch_saved") # type: ignore[union-attr] + self.save_to_collection(items_to_save=self.batch) + self.batch.clear() + + def spider_closed(self) -> None: + """Save fingerprints and corresponding URLs remaining in the batch, before spider closes.""" + self.save_batch() + + +class IncrementalCrawlingManager: + def __init__(self, crawler: Crawler, fm: CollectionsFingerprintsManager) -> None: + self.crawler = crawler + self.fm = fm + + async def process_incremental_async( + self, request: Request, result: List + ) -> List[Union[Request, Item]]: + """ + Processes the spider's parsing callbacks when IncrementalCrawlMiddleware is enabled. + + The function handles both requests and items returned by the spider. + - If an item is found: + - It saves the `request.url` and `item.url/item.canonicalURL` (if they differ) to the collection. + - If the result is a Request: + - It checks whether the request was processed previously. + - If it was processed, the request is removed from the result. + - If it was not, the request remains in the result. + """ + item: Optional[Item] = None + to_check = defaultdict(list) + fingerprint_to_url_map: Set[Tuple[str, str]] = set() + for i, element in enumerate(result): + if isinstance(element, Request): + # The requests are only checked to see if the links exist in the Collection + fp = get_request_fingerprint(self.crawler, element) + to_check[fp].append(i) + self.crawler.stats.inc_value("incremental_crawling/requests_to_check") # type: ignore[union-attr] + else: + if item: + raise NotImplementedError( + f"Unexpected number of returned items for {request.url}. " + f"None or one was expected." + ) + + item = element + unique_urls = self._get_unique_urls(request.url, item) + for url, url_field in unique_urls.items(): + fp = get_request_fingerprint(self.crawler, request.replace(url=url)) + if url_field != "request_url": + to_check[fp].append(i) + + # Storing the fingerprint-to-URL mapping for the item only. 
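Editor's sketch (not part of the patch): the settings read by the incremental-crawl components above; the collection name is a placeholder.

# settings.py -- enabling incremental crawling (sketch).
INCREMENTAL_CRAWL_ENABLED = True
# Optional; defaults to "<spider name>_incremental" when unset.
INCREMENTAL_CRAWL_COLLECTION_NAME = "my_articles_incremental"  # placeholder name
# Optional; number of fingerprints read/written per Collections interaction (default 50).
INCREMENTAL_CRAWL_BATCH_SIZE = 50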
+ # This will be used when storing the item in the Collection. + fingerprint_to_url_map.add((fp, url)) + + if url_field == "url": + self.crawler.stats.inc_value( # type: ignore[union-attr] + "incremental_crawling/redirected_urls" + ) + logger.debug( + f"Request URL for the item {request.url} was redirected to {url}." + ) + + # Prepare list of duplications + duplicated_fingerprints = await self.fm.get_existing_fingerprints_async( + list(to_check.keys()) + ) + + if duplicated_fingerprints: + logging.debug( + f"Skipping {len(duplicated_fingerprints)} Request fingerprints that were processed previously." + ) + + n_dups = 0 + for dupe_fp in duplicated_fingerprints: + # Marking duplicates for removal as None + for index in to_check[dupe_fp]: + result[index] = None + n_dups += 1 + + filtered_result = [x for x in result if x is not None] + + self.crawler.stats.inc_value( # type: ignore[union-attr] + "incremental_crawling/filtered_items_and_requests", n_dups + ) + # Check for any new fingerprints and their corresponding URLs for the item + fingerprint_url_map_new = { + (fp, url) + for fp, url in fingerprint_to_url_map + if fp not in duplicated_fingerprints + } + # Add any new fingerprints and their corresponding URLs to the batch for future saving + if fingerprint_url_map_new: + self.fm.add_to_batch(fingerprint_url_map_new) + return filtered_result + + def _get_unique_urls( + self, request_url: str, item: Optional[Item], discard_request_url: bool = False + ) -> Dict[str, Optional[str]]: + """Retrieves a dictionary of unique URLs associated with an item.""" + + urls: Dict[str, Optional[str]] = {request_url: "request_url"} + if not item: + return urls + + url_fields = ["url", "canonicalUrl"] + + adapter = ItemAdapter(item) + for url_field in url_fields: + if (url := adapter[url_field]) and url not in urls: + urls[url] = url_field + + if discard_request_url: + urls.pop(request_url) + + return urls diff --git a/zyte_spider_templates/_incremental/middleware.py b/zyte_spider_templates/_incremental/middleware.py new file mode 100644 index 0000000..06bc8e6 --- /dev/null +++ b/zyte_spider_templates/_incremental/middleware.py @@ -0,0 +1,70 @@ +import logging +from typing import AsyncGenerator, Union + +from scrapinghub.client.exceptions import Unauthorized +from scrapy.crawler import Crawler +from scrapy.exceptions import CloseSpider, NotConfigured +from scrapy.http import Request +from zyte_common_items import Item + +from .manager import CollectionsFingerprintsManager, IncrementalCrawlingManager + +logger = logging.getLogger(__name__) + + +class IncrementalCrawlMiddleware: + """:ref:`Downloader middleware ` to skip + items seen in previous crawls. + + To enable this middleware, set the :setting:`INCREMENTAL_CRAWL_ENABLED` + setting to ``True``. + + This middleware keeps a record of URLs of crawled items in the :ref:`Zyte Scrapy Cloud + collection ` specified in the :setting:`INCREMENTAL_CRAWL_COLLECTION_NAME` + setting, and skips items, responses and requests with matching URLs. + + Use :setting:`INCREMENTAL_CRAWL_BATCH_SIZE` to fine-tune interactions with + the collection for performance. + """ + + def __init__(self, crawler: Crawler): + assert crawler.spider + if not crawler.spider.settings.getbool("INCREMENTAL_CRAWL_ENABLED", False): + raise NotConfigured( + "IncrementalCrawlMiddleware is not enabled. Set the " + "INCREMENTAL_CRAWL_ENABLED setting to True to enable it." 
+ ) + self.inc_manager: IncrementalCrawlingManager = self.prepare_incremental_manager( + crawler + ) + + @staticmethod + def prepare_incremental_manager(crawler): + try: + collection_fp = CollectionsFingerprintsManager(crawler) + except (AttributeError, Unauthorized, RuntimeError, ValueError) as exc_info: + logger.error( + f"IncrementalCrawlMiddleware is enabled, but something went wrong with Collections.\n" + f"The reason: {exc_info}" + ) + raise CloseSpider("incremental_crawling_middleware_collection_issue") + + return IncrementalCrawlingManager(crawler, collection_fp) + + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls(crawler) + + async def process_spider_output( + self, response, result, spider + ) -> AsyncGenerator[Union[Request, Item], None]: + result_list = [] + async for item_or_request in result: + result_list.append(item_or_request) + + unique_items_or_requests = await self.inc_manager.process_incremental_async( + response.request, result_list + ) + + for item_or_request in unique_items_or_requests: + yield item_or_request diff --git a/zyte_spider_templates/feeds.py b/zyte_spider_templates/feeds.py new file mode 100644 index 0000000..eeec03e --- /dev/null +++ b/zyte_spider_templates/feeds.py @@ -0,0 +1,56 @@ +from typing import List, Set, Union + +import feedparser +from scrapy.utils.python import unique +from w3lib.html import strip_html5_whitespace +from w3lib.url import canonicalize_url +from web_poet import AnyResponse, BrowserResponse, HttpResponse, RequestUrl, ResponseUrl + + +def unique_urls(urls: List[str]) -> List[str]: + return unique(urls, key=canonicalize_url) + + +def get_feed_urls( + response: Union[AnyResponse, HttpResponse, BrowserResponse] +) -> Set[str]: + """Find all RSS or Atom feeds from a page""" + feed_urls = set() + + for link in response.xpath("//link[@type]"): + link_type: str = strip_html5_whitespace(link.attrib["type"]) + link_href: Union[str, RequestUrl, ResponseUrl] = strip_html5_whitespace( + link.attrib.get("href", "") + ) + if link_href: + link_href = response.urljoin(link_href) + rss_url = atom_url = None + if "rss+xml" in link_type: + rss_url = link_href + elif "atom+xml" in link_type: + atom_url = link_href + feed_url = rss_url or atom_url + if feed_url: + feed_urls.add(str(feed_url)) + + for link in response.xpath("//a/@href").getall(): + link_href = strip_html5_whitespace(link) + if link_href.endswith("rss.xml"): + feed_urls.add(str(response.urljoin(link_href))) + + return feed_urls + + +def parse_feed( + response: Union[AnyResponse, HttpResponse, BrowserResponse] +) -> List[str]: + response_text = ( + str(response.html) if isinstance(response, BrowserResponse) else response.text + ) + + feed = feedparser.parse(response_text) + urls = [ + strip_html5_whitespace(entry.get("link", "")) + for entry in feed.get("entries", []) + ] + return unique_urls([str(response.urljoin(url)) for url in urls if url]) diff --git a/zyte_spider_templates/heuristics.py b/zyte_spider_templates/heuristics.py index eba3639..0d7915b 100644 --- a/zyte_spider_templates/heuristics.py +++ b/zyte_spider_templates/heuristics.py @@ -1,12 +1,21 @@ import re +from typing import List, Tuple from urllib.parse import urlparse, urlsplit +from scrapy.link import Link +from scrapy.linkextractors import IGNORED_EXTENSIONS +from web_poet import BrowserResponse + from zyte_spider_templates._geolocations import GEOLOCATION_OPTIONS from zyte_spider_templates._lang_codes import LANG_CODES as _LANG_CODES COUNTRY_CODES = set([k.lower() for k in GEOLOCATION_OPTIONS]) 
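Editor's sketch (not part of the patch): a self-contained usage example of the feed helpers defined in feeds.py above, using inline HTML and RSS documents; the URLs are made up for illustration.

from web_poet import HttpResponse

from zyte_spider_templates.feeds import get_feed_urls, parse_feed

# Minimal HTML page advertising an RSS feed (illustrative URLs only).
html = (
    b'<html><head><link type="application/rss+xml" href="/feed.xml"></head>'
    b'<body><a href="/rss.xml">RSS</a></body></html>'
)
page = HttpResponse("https://news.example", html)
print(get_feed_urls(page))
# expected: {'https://news.example/feed.xml', 'https://news.example/rss.xml'}

# Minimal RSS feed with a single entry.
rss = (
    b'<?xml version="1.0"?><rss version="2.0"><channel>'
    b'<item><link>https://news.example/article-1</link></item>'
    b'</channel></rss>'
)
feed = HttpResponse("https://news.example/feed.xml", rss)
print(parse_feed(feed))
# expected: ['https://news.example/article-1']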
LANG_CODES = set(_LANG_CODES) +ATOM_PATTERN = re.compile(r"]*>.*?]*>.*?", re.IGNORECASE | re.DOTALL) +RDF_PATTERN = re.compile(r"]*>\s*]*>", re.IGNORECASE) +RSS_PATTERN = re.compile(r"]*>\s*]*>", re.IGNORECASE) + NO_CONTENT_KEYWORDS = ( "authenticate", @@ -46,6 +55,71 @@ r"/terms[_-]of[_-](service|use|conditions)", ) +NO_ARTICLES_CONTENT_PATHS = ( + "/archive", + "/about", + "/about-us", + "/account", + "/admin", + "/affiliate", + "/authenticate", + "/best-deals", + "/careers", + "/cart", + "/checkout", + "/contactez-nous", + "/cookie-policy", + "/my-account", + "/my-wishlist", + "/press", + "/pricing", + "/privacy-policy", + "/returns", + "/rss.xml", + "/search", + "/settings", + "/shipping", + "/subscribe", + "/terms-conditions", + "/tos", +) + + +SEED_URL_RE = re.compile(r"^https?:\/\/[^:\/\s]+(:\d{1,5})?(\/[^\s]*)*(#[^\s]*)?") + +NON_HTML_FILE_EXTENSION_RE = re.compile( + ".*(?:{}$)".format("|".join(re.escape("." + ext) for ext in IGNORED_EXTENSIONS)), + re.IGNORECASE, +) + +SOCIAL_DOMAINS = ( + "facebook.com", + "youtube.com", + "youtu.be", + "twitter.com", + "t.co", + "instagram.com", + "mail.yahoo.com", + "plus.google.com", + "play.google.com", + "www.google.com", + "itunes.apple.com", + "login.yahoo.com", + "consent.yahoo.com", + "outlook.live.com", + "linkedin.com", + "vk.com", + "www.odnoklassniki.ru", + "api.whatsapp.com", + "telegram.me", + "telegram.org", + # ads + "doubleclick.net", +) +domains = "|".join(re.escape(domain) for domain in SOCIAL_DOMAINS) +pattern = rf"(?:^(?:[./])(?:{domains})|\b(?:{domains}))$" +SOCIAL_DOMAINS_RE = re.compile(pattern) + def might_be_category(url: str) -> bool: """Returns True if the given url might be a category based on its path.""" @@ -104,3 +178,94 @@ def _url_has_locale_pair(url_path: str) -> bool: if y in LANG_CODES and x in COUNTRY_CODES: return True return False + + +def is_comments_article_feed(url: str) -> bool: + """ + Try to guess if a feed URL is for comments, not for articles. + """ + if "comments/feed" in url or "feed=comments-rss2" in url: + return True + return False + + +def is_non_html_file(url: str) -> bool: + """ + True for urls with extensions that clearly are not HTML. For example, + they are images, or a compressed file, etc. + >>> is_non_html_file("http://example.com/article") + False + >>> is_non_html_file("http://example.com/image.jpg") + True + """ + return bool(NON_HTML_FILE_EXTENSION_RE.match(url)) + + +def is_social_link(url: str) -> bool: + """ + True for urls corresponding to the typical social networks + >>> is_social_link("http://facebook.com") + True + >>> is_social_link("http://www.facebook.com") + True + >>> is_social_link("http://rrr.t.co") + True + >>> is_social_link("http://t.co") + True + >>> is_social_link("http://sport.co") + False + >>> is_social_link("http://sport.com") + False + >>> is_social_link("http://example.com") + False + """ + netloc = urlsplit(url).netloc + + if SOCIAL_DOMAINS_RE.search(netloc): + return True + return False + + +def classify_article_crawling_links(links: List[Link]) -> Tuple[List[Link], List[Link]]: + """In accordance with the rules, it divides the list of links into two new lists with allowed and disallowed links. 
+ Returns a tuple of these new lists.""" + allowed_links = [] + disallowed_links = [] + for link in links: + url = link.url + if ( + is_social_link(url) + or is_non_html_file(url) + or url.endswith(NO_ARTICLES_CONTENT_PATHS) + ): + disallowed_links.append(link) + continue + allowed_links.append(link) + + return allowed_links, disallowed_links + + +def classify_article_feed_links(links: List[Link]) -> Tuple[List[Link], List[Link]]: + """In accordance with the rules, it divides the list of urls into two new lists with allowed and disallowed urls. + Returns a tuple of these new lists.""" + allowed_links = [] + disallowed_links = [] + for link in links: + if is_comments_article_feed(link.url): + disallowed_links.append(link) + continue + allowed_links.append(link) + return allowed_links, disallowed_links + + +def is_feed_content(response: BrowserResponse) -> bool: + # RSS 0.91, 0.92, 2.0 + if RSS_PATTERN.search(response.html): + return True + # Atom feed + if ATOM_PATTERN.search(response.html): + return True + # RSS 1.0/RDF + if RDF_PATTERN.search(response.html): + return True + return False diff --git a/zyte_spider_templates/middlewares.py b/zyte_spider_templates/middlewares.py index 2cd8019..0692990 100644 --- a/zyte_spider_templates/middlewares.py +++ b/zyte_spider_templates/middlewares.py @@ -3,11 +3,35 @@ import warnings from collections import defaultdict from datetime import datetime -from typing import Any, Dict +from typing import ( + Any, + AsyncIterable, + Dict, + Generator, + Iterable, + List, + Optional, + Set, + Union, +) from warnings import warn from scrapy import Request, Spider -from scrapy.spidermiddlewares.offsite import OffsiteMiddleware +from scrapy.crawler import Crawler +from scrapy.dupefilters import RFPDupeFilter +from scrapy.exceptions import IgnoreRequest, NotConfigured +from scrapy.http import Response +from scrapy.utils.httpobj import urlparse_cached +from scrapy.utils.url import url_is_from_any_domain +from scrapy_poet import DynamicDeps +from zyte_common_items import Article, ArticleNavigation, Item + +try: + from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware +except ImportError: + from scrapy.spidermiddlewares.offsite import OffsiteMiddleware # type: ignore[assignment] + +from zyte_spider_templates.utils import get_domain logger = logging.getLogger(__name__) @@ -23,10 +47,10 @@ class CrawlingLogsMiddleware: JSON-formatted data so that it can easily be parsed later on. Some notes: - - ``scrapy.utils.request.request_fingerprint`` is used to match what - https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy uses. - This makes it easier to work with since we can easily match it with - the fingerprints logged in Scrapy Cloud's request data. + - ``scrapy.utils.request.request_fingerprint`` is used to match what + https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy uses. + This makes it easier to work with since we can easily match it with + the fingerprints logged in Scrapy Cloud's request data. 
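Editor's sketch (not part of the patch): how the two classifiers above split a link list; the URLs are illustrative.

from scrapy.link import Link

from zyte_spider_templates.heuristics import (
    classify_article_crawling_links,
    classify_article_feed_links,
)

links = [
    Link("https://news.example/politics/some-article"),
    Link("https://facebook.com/news.example"),    # social domain
    Link("https://news.example/banner.jpg"),      # non-HTML file
    Link("https://news.example/privacy-policy"),  # no-article-content path
]
allowed, disallowed = classify_article_crawling_links(links)
# allowed    -> only the /politics/some-article link
# disallowed -> the other three links

feeds = [
    Link("https://news.example/feed"),
    Link("https://news.example/comments/feed"),   # comments feed, skipped
]
feed_allowed, feed_disallowed = classify_article_feed_links(feeds)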
""" unknown_page_type = "unknown" @@ -117,9 +141,505 @@ def crawl_logs(self, response, result): class AllowOffsiteMiddleware(OffsiteMiddleware): - def _filter(self, request: Any, spider: Spider) -> bool: - if not isinstance(request, Request): - return True + def should_follow(self, request: Request, spider: Spider) -> bool: if request.meta.get("allow_offsite"): return True - return super()._filter(request, spider) + return super().should_follow(request, spider) + + +class MaxRequestsPerSeedDownloaderMiddleware: + """This middleware limits the number of requests that each seed request can subsequently + have. + + To enable this middleware, set the ``MAX_REQUESTS_PER_SEED`` setting to + the desired positive value. Non-positive integers (i.e. 0 and below) + imposes no limit and disables this middleware. + + By default, all start requests are considered seed requests, and all other + requests are not. + + Please note that you also need to enable TrackSeedsSpiderMiddleware to make this work. + """ + + def __init__(self, crawler: Crawler): + assert crawler.spider + max_requests_per_seed = max( + 0, crawler.spider.settings.getint("MAX_REQUESTS_PER_SEED", 0) + ) + if not max_requests_per_seed: + raise NotConfigured( + "MaxRequestsPerSeedDownloaderMiddleware is not configured. " + "Set MAX_REQUESTS_PER_SEED to enable it." + ) + self.crawler = crawler + self.requests_per_seed: defaultdict = defaultdict(int) + self.seeds_reached_limit: Set[str] = set() + self.max_requests_per_seed = max_requests_per_seed + + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls(crawler) + + def process_request(self, request, spider): + seed = request.meta.get("seed") + if seed is None: + return + if self.max_requests_per_seed_reached(seed): + self.seeds_reached_limit.add(seed) + logging.debug( + f"The request {request} is skipped as {self.max_requests_per_seed} " + f"max requests per seed have been reached for seed {seed}." 
+ ) + assert self.crawler.stats + self.crawler.stats.set_value( + "seeds/max_requests_reached", len(self.seeds_reached_limit) + ) + raise IgnoreRequest("max_requests_per_seed_reached") + self.requests_per_seed[seed] += 1 + return + + def max_requests_per_seed_reached(self, seed: str) -> bool: + return self.requests_per_seed.get(seed, 0) >= self.max_requests_per_seed + + +class TrackSeedsSpiderMiddleware: + def __init__(self, crawler: Crawler): + self.crawler = crawler + + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls(crawler) + + def process_start_requests( + self, start_requests: Iterable[Request], spider: Spider + ) -> Iterable[Request]: + for request in start_requests: + request.meta.setdefault("seed", request.url) + request.meta.setdefault("is_seed_request", True) + yield request + + def process_spider_output( + self, + response: Response, + result: Iterable[Union[Request, Item]], + spider: Spider, + ) -> Iterable[Union[Request, Item]]: + for item_or_request in result: + if not isinstance(item_or_request, Request): + yield item_or_request + continue + + yield from self._process_request(item_or_request, response) + + async def process_spider_output_async( + self, + response: Response, + result: AsyncIterable[Union[Request, Item]], + spider: Spider, + ) -> AsyncIterable[Union[Request, Item]]: + async for item_or_request in result: + if not isinstance(item_or_request, Request): + yield item_or_request + continue + + for processed_request in self._process_request(item_or_request, response): + yield processed_request + + def _process_request( + self, request: Request, response: Response + ) -> Iterable[Request]: + seed = request.meta.get("seed", response.meta.get("seed")) + if seed is None: + # we don't want to add a seed meta key with None if it is not in meta + yield request + return + + request.meta["seed"] = seed + yield request + + +class PageParamsMiddlewareBase: + def __init__(self, crawler): + self.crawler = crawler + + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def process_start_requests( + self, start_requests: List[Request], spider: Spider + ) -> Iterable[Request]: + for request in start_requests: + self._update_page_params(request) + yield request + + def process_spider_output( + self, response, result, spider + ) -> Iterable[Union[Request, Item]]: + for item_or_request in result: + if isinstance(item_or_request, Request): + self._update_page_params(item_or_request) + yield item_or_request + + async def process_spider_output_async( + self, response, result, spider + ) -> AsyncIterable[Union[Request, Item]]: + async for item_or_request in result: + if isinstance(item_or_request, Request): + self._update_page_params(item_or_request) + yield item_or_request + + def _update_page_params(self, request) -> None: + page_params = request.meta.setdefault("page_params", {}) + self.update_page_params(request, page_params) + + def update_page_params(self, request, page_params) -> None: + pass + + +class TrackNavigationDepthSpiderMiddleware(PageParamsMiddlewareBase): + """ + This middleware helps manage navigation depth by setting a `final_navigation_page` meta key + when the predefined depth limit (`NAVIGATION_DEPTH_LIMIT`) is reached. + + .. note:: + Navigation depth is typically increased for requests that navigate to a subcategory + originating from its parent category, such as a request targeting a category starting + from the website home page. 
However, it may not be necessary to increase navigation + depth, for example, for the next pagination requests. + Spiders can customize this behavior as needed by controlling when navigation depth is incremented. + """ + + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def __init__(self, crawler): + if max_navigation_depth := max( + crawler.spider.settings.getint("NAVIGATION_DEPTH_LIMIT", 0), 0 + ): + self.max_navigation_depth = max_navigation_depth + self.stats = crawler.stats + else: + raise NotConfigured( + "TrackNavigationDepthSpiderMiddleware is not configured. " + "Set `NAVIGATION_DEPTH_LIMIT` to 1 or more to enable it." + ) + + def update_page_params(self, request, page_params) -> None: + page_params["skip_subcategories"] = request.meta.get( + "final_navigation_page", page_params.get("skip_subcategories") + ) + + def process_start_requests( + self, start_requests: List[Request], spider: Spider + ) -> Iterable[Request]: + for request in super().process_start_requests(start_requests, spider): + # We treat the initial response as having a navigation_depth of 1. + self._update_request_with_navigation(request, navigation_depth=1) + self.stats.inc_value("navigation_depth/inits") + yield request + + def process_spider_output( + self, response, result, spider + ) -> Iterable[Union[Request, Item]]: + for item_or_request in super().process_spider_output(response, result, spider): + if not isinstance(item_or_request, Request): + yield item_or_request + continue + + if req := self._process_navigation_depth(item_or_request, response): + yield req + + async def process_spider_output_async( + self, response, result, spider + ) -> AsyncIterable[Union[Request, Item]]: + async for item_or_request in super().process_spider_output_async( + response, result, spider + ): + if not isinstance(item_or_request, Request): + yield item_or_request + continue + + if req := self._process_navigation_depth(item_or_request, response): + yield req + + def _update_request_with_navigation(self, request, navigation_depth): + if navigation_depth is None: + return + request.meta["navigation_depth"] = navigation_depth + request.meta["final_navigation_page"] = ( + navigation_depth >= self.max_navigation_depth + ) + + def _current_navigation_depth( + self, increase_navigation_depth, current_navigation_depth + ): + if increase_navigation_depth and current_navigation_depth is None: + current_navigation_depth = 1 + return current_navigation_depth + + def _process_navigation_depth(self, request, response) -> Optional[Request]: + increase_navigation_depth = request.meta.get("increase_navigation_depth", True) + current_navigation_depth = self._current_navigation_depth( + increase_navigation_depth, response.meta.get("navigation_depth") + ) + + if not increase_navigation_depth: + self._update_request_with_navigation(request, current_navigation_depth) + + self.stats.inc_value("navigation_depth/not_counted") + return request + + self.stats.inc_value(f"navigation_depth/count/{current_navigation_depth}") + + self.stats.max_value("navigation_depth/max_seen", current_navigation_depth) + self._update_request_with_navigation(request, current_navigation_depth + 1) + + return request + + +class OnlyFeedsMiddleware(PageParamsMiddlewareBase): + """ + This middleware helps control whether the spider should discover all links on the webpage + or extract links from RSS/Atom feeds only. 
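Editor's sketch (not part of the patch): the settings that switch on the two spider middlewares above; the depth value is illustrative.

# settings.py (sketch)
# Mark navigation requests as final once two navigation levels have been reached
# (TrackNavigationDepthSpiderMiddleware; 0 or unset disables it).
NAVIGATION_DEPTH_LIMIT = 2
# Restrict link discovery to RSS/Atom feeds only (OnlyFeedsMiddleware).
ONLY_FEEDS_ENABLED = True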
+ """ + + def __init__(self, crawler: Crawler): + super().__init__(crawler) + assert crawler.spider + if not crawler.spider.settings.getbool("ONLY_FEEDS_ENABLED"): # type: ignore[union-attr] + raise NotConfigured( + "OnlyFeedsMiddleware is not configured. Set " + "ONLY_FEEDS_ENABLED to True to enable it." + ) + + def update_page_params(self, request, page_params) -> None: + page_params["only_feeds"] = request.meta.get( + "only_feeds", page_params.get("only_feeds", True) + ) + + +class OffsiteRequestsPerSeedMiddleware: + """This middleware ensures that subsequent requests for each seed do not go outside + the original seed's domain. + + However, offsite requests are allowed only if it came from the original domain. Any + other offsite requests that follow from offsite responses will not be allowed. This + behavior allows to crawl articles from news aggregator websites while ensuring it + doesn't fully crawl other domains it discover. + + Disabling the middleware would not prevent offsite requests from being filtered + and might generally lead in other domains from being crawled completely, unless + ``allowed_domains`` is set in the spider. + + This middleware relies on :class:`~zyte_spider_templates.TrackSeedsSpiderMiddleware` + to set the `"seed"` and `"is_seed_request"` values in + :attr:`Request.meta `. Ensure that such middleware is + active and sets the said values before this middleware processes the spiders outputs. + + .. note:: + + If a seed URL gets redirected to a different domain, both the domain from + the original request and the domain from the redirected response will be + used as references. + + If the seed URL is `https://books.toscrape.com`, all subsequent requests to + `books.toscrape.com` and its subdomains are allowed, but requests to + `toscrape.com` are not. Conversely, if the seed URL is `https://toscrape.com`, + requests to both `toscrape.com` and `books.toscrape.com` are allowed. + """ + + def __init__(self, crawler: Crawler): + assert crawler.spider + if not crawler.spider.settings.getbool( # type: ignore[union-attr] + "OFFSITE_REQUESTS_PER_SEED_ENABLED", True + ): + raise NotConfigured( + "OffsiteRequestsPerSeedMiddleware is not enabled. Set the " + "OFFSITE_REQUESTS_PER_SEED_ENABLED setting to True to enable " + "it." 
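Editor's note (not part of the patch): the subdomain behaviour described in the docstring above comes from Scrapy's url_is_from_any_domain, which the middleware uses to match requests against the allowed domains per seed.

from scrapy.utils.url import url_is_from_any_domain

# Seed https://toscrape.com: subdomains of the seed domain are allowed.
url_is_from_any_domain("https://books.toscrape.com/catalogue", ["toscrape.com"])   # True
# Seed https://books.toscrape.com: the parent domain is not allowed.
url_is_from_any_domain("https://toscrape.com/", ["books.toscrape.com"])            # False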
+ ) + + self.stats = crawler.stats + self.allowed_domains_per_seed: Dict[str, Set[str]] = defaultdict(set) + self.domains_seen: Set[str] = set() + + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def process_spider_output( + self, + response: Response, + result: Iterable[Union[Request, Item]], + spider: Spider, + ) -> Iterable[Union[Request, Item]]: + self._fill_allowed_domains_per_seed_dict(response) + for item_or_request in result: + if not isinstance(item_or_request, Request): + yield item_or_request + continue + + if self.allow_request(item_or_request, response): + yield item_or_request + + async def process_spider_output_async( + self, + response: Response, + result: AsyncIterable[Union[Request, Item]], + spider: Spider, + ) -> AsyncIterable[Union[Request, Item]]: + self._fill_allowed_domains_per_seed_dict(response) + async for item_or_request in result: + if not isinstance(item_or_request, Request): + yield item_or_request + continue + + if self.allow_request(item_or_request, response): + yield item_or_request + + def allow_request(self, request: Request, response: Response) -> bool: + if request.dont_filter: + return True + + if self._is_domain_per_seed_allowed(request): + return True + elif self._is_domain_per_seed_allowed(response): + # At this point, we know that the request points to an offsite page. + # We don't want to immediately filter it as it might be an article from news + # aggregator websites. So, we simply check if the request came from the + # original website. Otherwise, it came from offsite pages and we avoid it. + return True + + domain = urlparse_cached(request).hostname + assert self.stats + if domain and domain not in self.domains_seen: + self.domains_seen.add(domain) + self.stats.inc_value("offsite_requests_per_seed/domains") + self.stats.inc_value("offsite_requests_per_seed/filtered") + + logger.debug(f"Filtered offsite request per seed to {domain}: {request}") + return False + + def _fill_allowed_domains_per_seed_dict(self, response: Response) -> None: + seed = response.meta.get("seed") + if seed is None: + return + + if not response.meta.get("is_seed_request"): + if domains_for_update := response.meta.get("seed_domains"): + self.allowed_domains_per_seed[seed].update(domains_for_update) + return + + domains_for_update = response.meta.get( + "seed_domains", self._get_allowed_domains(response) + ) + self.allowed_domains_per_seed[seed].update(domains_for_update) + + def _is_domain_per_seed_allowed( + self, req_or_resp: Union[Request, Response] + ) -> bool: + seed = req_or_resp.meta.get("seed") + if seed is None: + return True + + if allowed_domains := self.allowed_domains_per_seed.get(seed): + return url_is_from_any_domain(req_or_resp.url, allowed_domains) + + return False + + def _get_allowed_domains(self, response: Response) -> Set[str]: + """ + Returns the domains based on the URL attributes of items from a response and the originating request. + + In cases where the original request URL was redirected to a new domain, + the new domain would be included as well. 
+ """ + + def get_item_and_request_urls() -> Generator[str, None, None]: + """Since the redirected URL and canonicalUrl are only in the Item, + we try to extract it from the first item encountered.""" + for _, maybe_item in response.cb_kwargs.items(): + if isinstance(maybe_item, DynamicDeps): + for item_class in [Article, ArticleNavigation]: + if item := maybe_item.get(item_class): + for url_type in ("canonicalUrl", "url"): + if url := getattr(item, url_type, None): + yield url + break + else: + logger.debug( + f"This type of item: {type(maybe_item)} is not allowed" + ) + assert response.request + yield response.request.url + + return {get_domain(url) for url in get_item_and_request_urls()} + + +class DummyDupeFilter(RFPDupeFilter): + """ + This class overrides the `request_seen` method to return `False` for all requests, + disabling Scrapy's built-in duplicate filtering. Instead, deduplication + is performed in `DupeFilterDownloaderMiddleware` before requests are passed to other + middlewares. + """ + + def request_seen(self, request: Request) -> bool: + return False + + +class DupeFilterSpiderMiddleware: + """ + This middleware uses a custom duplicate filter to override Scrapy's default filtering, + leveraging the `DummyDupeFilter` to bypass global deduplication. Instead, + deduplication is managed within the middleware itself, filtering out duplicate requests + before they reach other middlewares. + """ + + dupe_filter: RFPDupeFilter = RFPDupeFilter() + + def __init__(self, crawler): + self.crawler = crawler + + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def process_start_requests( + self, start_requests: List[Request], spider: Spider + ) -> Iterable[Request]: + for request in start_requests: + if not self.url_already_seen(None, request): + yield request + + def process_spider_output( + self, response, result, spider + ) -> Iterable[Union[Request, Item]]: + for item_or_request in result: + if isinstance(item_or_request, Request): + if not self.url_already_seen(response, item_or_request): + yield item_or_request + else: + yield item_or_request + + async def process_spider_output_async( + self, response, result, spider + ) -> AsyncIterable[Union[Request, Item]]: + async for item_or_request in result: + if isinstance(item_or_request, Request): + if not self.url_already_seen(response, item_or_request): + yield item_or_request + else: + yield item_or_request + + def url_already_seen(self, response: Optional[Response], request: Request) -> bool: + """A custom replacement for the default duplicate filtering, tracking URLs seen in this run.""" + if not request.dont_filter and self.dupe_filter.request_seen(request): + logger.debug( + f"URL is duplicated {request.url}, for the response {response.url if response else 'start_request'}." 
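Editor's sketch (not part of the patch): the RFPDupeFilter behaviour that url_already_seen() above builds on, shown in isolation.

from scrapy import Request
from scrapy.dupefilters import RFPDupeFilter

df = RFPDupeFilter()
first = df.request_seen(Request("https://example.com/a"))   # falsy (None): first sighting
second = df.request_seen(Request("https://example.com/a"))  # True: duplicate fingerprint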
+ ) + self.crawler.stats.inc_value("dupe_filter_spider_mw/url_already_seen") + return True + return False diff --git a/zyte_spider_templates/pages/__init__.py b/zyte_spider_templates/pages/__init__.py new file mode 100644 index 0000000..2867be8 --- /dev/null +++ b/zyte_spider_templates/pages/__init__.py @@ -0,0 +1,2 @@ +from .article_heuristics import HeuristicsArticleNavigationPage +from .product_navigation_heuristics import HeuristicsProductNavigationPage diff --git a/zyte_spider_templates/pages/article_heuristics.py b/zyte_spider_templates/pages/article_heuristics.py new file mode 100644 index 0000000..68c39b6 --- /dev/null +++ b/zyte_spider_templates/pages/article_heuristics.py @@ -0,0 +1,189 @@ +import json +import logging +from typing import Iterable, List + +import attrs +import xtractmime +from scrapy.http import TextResponse +from scrapy.link import Link +from scrapy.linkextractors import LinkExtractor +from web_poet import AnyResponse, HttpResponse, PageParams, Stats, field, handle_urls +from web_poet.utils import cached_method +from zyte_common_items import ( + BaseArticleNavigationPage, + ProbabilityMetadata, + ProbabilityRequest, +) + +from zyte_spider_templates.feeds import get_feed_urls, parse_feed +from zyte_spider_templates.heuristics import ( + classify_article_crawling_links, + classify_article_feed_links, +) + +from ..heuristics import is_feed_content + +logger = logging.getLogger(__name__) + + +def is_feed_request(request: ProbabilityRequest) -> bool: + return bool( + request.name + and request.name.startswith("[heuristics][articleNavigation][feed]") + ) + + +@handle_urls("") +@attrs.define +class HeuristicsArticleNavigationPage(BaseArticleNavigationPage): + response: AnyResponse + stats: Stats + page_params: PageParams + _ARTICLE_HEURISTIC = {"name": "article", "dummy probability": 0.5} + _NAVIGATION_HEURISTIC = {"name": "subCategories", "dummy probability": 0.5} + _FEED_HEURISTIC = {"name": "feed", "dummy probability": 1.0} + _FEED_ITEMS_HEURISTIC = {"name": "feed items", "dummy probability": 0.99} + + @field + def url(self) -> str: + return str(self.response.url) + + @field + def subCategories(self) -> Iterable[ProbabilityRequest]: + if self._is_response_feed(): + return + + feeds = self._get_feed_links() + feed_urls = {link.url for link in feeds} + for link in feeds: + yield self._get_request(link, self._FEED_HEURISTIC) + + if self.skip_subcategories() or self.is_only_feeds(): + return + + sub_categories = [ + link + for link in self._get_article_or_navigation_links() + if link.url not in feed_urls + ] + for link in sub_categories: + yield self._get_request(link, self._NAVIGATION_HEURISTIC) + + @field + def items(self) -> Iterable[ProbabilityRequest]: + if self._is_response_feed(): + links = self._get_feed_items_links() + heuristic = self._FEED_ITEMS_HEURISTIC + elif not self.is_only_feeds(): + links = self._get_article_or_navigation_links() + heuristic = self._ARTICLE_HEURISTIC + else: + return + + for link in links: + yield self._get_request(link, heuristic) + + @cached_method + def _get_article_or_navigation_links(self) -> List[Link]: + """Extract links from an HTML web page.""" + response = TextResponse( + url=str(self.response.url), body=self.response.text.encode() + ) + link_extractor = LinkExtractor() + links = link_extractor.extract_links(response) + allowed_links, disallowed_links = classify_article_crawling_links(links) + + _log_and_stats( + self, + "heuristic_navigation_or_article", + links, + allowed_links, + disallowed_links, + ) + return allowed_links 
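Editor's sketch (not part of the patch): the page object above encodes the heuristic type in each request name, and is_feed_request() keys off that prefix; the URLs below are illustrative.

from zyte_common_items import ProbabilityMetadata, ProbabilityRequest

from zyte_spider_templates.pages.article_heuristics import is_feed_request

feed_req = ProbabilityRequest(
    url="https://news.example/feed.xml",
    name="[heuristics][articleNavigation][feed] Site feed",
    metadata=ProbabilityMetadata(probability=1.0),
)
article_req = ProbabilityRequest(
    url="https://news.example/article-1",
    name="[heuristics][articleNavigation][article] Some article",
    metadata=ProbabilityMetadata(probability=0.5),
)
assert is_feed_request(feed_req) is True
assert is_feed_request(article_req) is False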
+ + @cached_method + def _get_feed_items_links(self) -> List[Link]: + """Extract links from an RSS/Atom feed.""" + links = [Link(url) for url in parse_feed(self.response)] + allowed_links, disallowed_links = classify_article_crawling_links(links) + + _log_and_stats( + self, "heuristic_feed_items", links, allowed_links, disallowed_links + ) + return allowed_links + + @cached_method + def _get_feed_links(self) -> List[Link]: + """Extract links to RSS/Atom feeds from an HTML web page.""" + links = [Link(url) for url in get_feed_urls(self.response)] + allowed_links, disallowed_links = classify_article_feed_links(links) + + _log_and_stats(self, "heuristic_feed", links, allowed_links, disallowed_links) + return allowed_links + + @cached_method + def _is_response_feed(self) -> bool: + """Return True if a response is an RSS or Atom feed.""" + + content_type = "" + if isinstance(self.response.response, HttpResponse): + content_type = self.response.response.headers.get("Content-Type", "") + elif is_feed_content(self.response.response): + logger.warning( + "It is likely that the spider is using BrowserHtml to extract the RSS feed. " + "Please note that using HttpResponse is more efficient." + ) + return True + + mime_type = xtractmime.extract_mime( + self.response.text.encode(), + content_types=(content_type.encode(),), + ) + + return xtractmime.mimegroups.is_xml_mime_type( + mime_type + ) or xtractmime.mimegroups.is_json_mime_type(mime_type) + + def _get_request(self, link, heuristic) -> ProbabilityRequest: + return ProbabilityRequest( + url=link.url, + name=f"[heuristics][articleNavigation][{heuristic['name']}] {link.text.strip()}", + metadata=ProbabilityMetadata(probability=heuristic["dummy probability"]), + ) + + def skip_subcategories(self) -> bool: + return self.page_params.get("skip_subcategories", False) + + def is_only_feeds(self) -> bool: + return self.page_params.get("only_feeds", False) + + +def _log_and_stats(self, urls_type, links, allowed_links, disallowed_links): + _logs(self, urls_type, links, allowed_links, disallowed_links) + _stats(self, urls_type, links, allowed_links, disallowed_links) + + +def _stats(page, urls_type, urls, allowed_urls, disallowed_urls): + page.stats.inc(f"article_spider/{urls_type}/visited", 1) + page.stats.inc(f"article_spider/{urls_type}/no_links", 0 if urls else 1) + page.stats.inc(f"article_spider/{urls_type}/with_links", 1 if urls else 0) + page.stats.inc(f"article_spider/{urls_type}/links/total", len(urls)) + page.stats.inc(f"article_spider/{urls_type}/links/allow", len(allowed_urls)) + page.stats.inc(f"article_spider/{urls_type}/links/disallow", len(disallowed_urls)) + + +def _logs(page, urls_type, urls, allowed_urls, disallowed_urls): + page_name = page.item_cls.__name__ + data = { + "page": page_name, + "page url": page.url, + "urls type": urls_type, + "urls found": len(urls), + "allowed urls": len(allowed_urls), + "urls to skip": len(disallowed_urls), + "list of urls to skip": [ + url.url if isinstance(url, Link) else url for url in disallowed_urls + ], + } + logger.debug(f"Article Heuristic Logs:\n{json.dumps(data, indent=2)}") diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index d844fc2..418c0d5 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -10,6 +10,7 @@ ConfigDict, Field, Json, + NonNegativeInt, field_validator, model_validator, ) @@ -391,3 +392,14 @@ class CustomAttrsMethodParam(BaseModel): }, }, ) + + +class MaxRequestsPerSeedParam(BaseModel): + max_requests_per_seed: 
Optional[NonNegativeInt] = Field( + title="Max requests per seed", + description=( + "The maximum number of follow-up requests allowed per initial URL. " + "Unlimited if not set." + ), + default=None, + ) diff --git a/zyte_spider_templates/spiders/article.py b/zyte_spider_templates/spiders/article.py new file mode 100644 index 0000000..6b829d3 --- /dev/null +++ b/zyte_spider_templates/spiders/article.py @@ -0,0 +1,415 @@ +from __future__ import annotations + +from enum import Enum +from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional + +import attrs +import requests +import scrapy +from pydantic import BaseModel, ConfigDict, Field +from scrapy.crawler import Crawler +from scrapy.exceptions import CloseSpider +from scrapy_poet import DummyResponse, DynamicDeps +from scrapy_spider_metadata import Args +from web_poet import BrowserResponse, HttpResponse +from zyte_common_items import ( + Article, + ArticleNavigation, + ProbabilityMetadata, + ProbabilityRequest, +) + +from zyte_spider_templates.documentation import document_enum +from zyte_spider_templates.pages.article_heuristics import is_feed_request +from zyte_spider_templates.params import ( + INPUT_GROUP, + ExtractFrom, + ExtractFromParam, + GeolocationParam, + MaxRequestsParam, + MaxRequestsPerSeedParam, + UrlParam, + UrlsFileParam, + UrlsParam, +) +from zyte_spider_templates.spiders.base import ARG_SETTING_PRIORITY, BaseSpider + +from ..utils import load_url_list + +if TYPE_CHECKING: + # typing.Self requires Python 3.11 + from typing_extensions import Self + + +@attrs.define +class RequestTypeItemEnum: + name: str = attrs.field(default="no_name") + priority: int = attrs.field(default=0) + page_type: str = attrs.field(default="no_page_type") + inject: list = attrs.field(default=[]) + + +class RequestType(Enum): + SEED: RequestTypeItemEnum = RequestTypeItemEnum( + name="seed", + priority=40, + page_type="articleNavigation", + inject=[ArticleNavigation], + ) + ARTICLE: RequestTypeItemEnum = RequestTypeItemEnum( + name="article", priority=30, page_type="article", inject=[Article] + ) + ARTICLE_AND_NAVIGATION: RequestTypeItemEnum = RequestTypeItemEnum( + name="article_and_navigation", + priority=20, + page_type="articleNavigation-heuristics", + inject=[Article, ArticleNavigation], + ) + NAVIGATION: RequestTypeItemEnum = RequestTypeItemEnum( + name="navigation", + priority=10, + page_type="subCategories", + inject=[ArticleNavigation], + ) + NEXT_PAGE: RequestTypeItemEnum = RequestTypeItemEnum( + name="nextPage", priority=100, page_type="nextPage", inject=[ArticleNavigation] + ) + + +class IncrementalParam(BaseModel): + incremental: bool = Field( + description=( + "Skip items with URLs already stored in the specified Zyte Scrapy Cloud Collection. " + "This feature helps avoid reprocessing previously crawled items and requests by comparing " + "their URLs against the stored collection." + ), + default=False, + ) + incremental_collection_name: Optional[str] = Field( + description=( + "Name of the Zyte Scrapy Cloud Collection used during an incremental crawl." + "By default, a Collection named after the spider (or virtual spider) is used, " + "meaning that matching URLs from previous runs of the same spider are skipped, " + "provided those previous runs had `incremental` argument set to `true`." + "Using a different collection name makes sense, for example, in the following cases:" + "- different spiders share a collection." + "- the same spider uses different collections (e.g., for development runs vs. production runs)." 
+ ), + default=None, + ) + + +@document_enum +class ArticleCrawlStrategy(str, Enum): + full: str = "full" + """Follow most links within each domain from the list of URLs in an + attempt to discover and extract as many articles as possible.""" + + direct_item: str = "direct_item" + """Treat input URLs as direct links to articles, and extract an + article from each.""" + + +class ArticleCrawlStrategyParam(BaseModel): + crawl_strategy: ArticleCrawlStrategy = Field( + title="Crawl Strategy", + description="Determines how input URLs and follow-up URLs are crawled.", + default=ArticleCrawlStrategy.full, + json_schema_extra={ + "enumMeta": { + ArticleCrawlStrategy.full: { + "title": "Full", + "description": ( + "Follow most links within each domain from the list of URLs in an " + "attempt to discover and extract as many articles as possible." + ), + }, + ArticleCrawlStrategy.direct_item: { + "title": "Direct URLs to Articles", + "description": ( + "Treat input URLs as direct links to articles, and " + "extract an article from each." + ), + }, + }, + }, + ) + + +class ArticleSpiderParams( + ExtractFromParam, + MaxRequestsPerSeedParam, + MaxRequestsParam, + GeolocationParam, + ArticleCrawlStrategyParam, + IncrementalParam, + UrlsFileParam, + UrlsParam, + UrlParam, + BaseModel, +): + model_config = ConfigDict( + json_schema_extra={ + "groups": [ + INPUT_GROUP, + ], + }, + ) + + +class ArticleSpider(Args[ArticleSpiderParams], BaseSpider): + """Yield articles from one or more websites that contain articles. + + See :class:`~zyte_spider_templates.spiders.article.ArticleSpiderParams` + for supported parameters. + + .. seealso:: :ref:`article`. + """ + + name: str = "article" + + metadata: Dict[str, Any] = { + **BaseSpider.metadata, + "title": "Article", + "description": "Template for spiders that extract article data from news or blog websites.", + } + + @classmethod + def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: + spider = super(ArticleSpider, cls).from_crawler(crawler, *args, **kwargs) + spider._init_input() + spider._init_extract_from() + spider._init_incremental() + + if spider.args.max_requests_per_seed: + spider.settings.set( + "MAX_REQUESTS_PER_SEED", + spider.args.max_requests_per_seed, + priority=ARG_SETTING_PRIORITY, + ) + + return spider + + def _init_input(self): + urls_file = self.args.urls_file + if urls_file: + response = requests.get(urls_file) + urls = load_url_list(response.text) + self.logger.info(f"Loaded {len(urls)} initial URLs from {urls_file}.") + self.start_urls = urls + elif self.args.urls: + self.start_urls = self.args.urls + else: + self.start_urls = [self.args.url] + + def _init_extract_from(self): + if self.args.extract_from is not None: + self.settings.set( + "ZYTE_API_PROVIDER_PARAMS", + { + "articleOptions": {"extractFrom": self.args.extract_from}, + "articleNavigationOptions": {"extractFrom": self.args.extract_from}, + **self.settings.get("ZYTE_API_PROVIDER_PARAMS", {}), + }, + priority=ARG_SETTING_PRIORITY, + ) + + def _init_incremental(self): + self.settings.set( + "INCREMENTAL_CRAWL_ENABLED", + self.args.incremental, + priority=ARG_SETTING_PRIORITY, + ) + if self.args.incremental: + self.settings.set( + "NAVIGATION_DEPTH_LIMIT", + 1, + priority=ARG_SETTING_PRIORITY, + ) + self.logger.info( + "NAVIGATION_DEPTH_LIMIT=1 is set because the incremental crawling is enabled." 
+ ) + if self.args.incremental_collection_name: + self.settings.set( + "INCREMENTAL_CRAWL_COLLECTION_NAME", + self.args.incremental_collection_name, + priority=ARG_SETTING_PRIORITY, + ) + self.logger.info( + f"INCREMENTAL_CRAWL_COLLECTION_NAME={self.args.incremental_collection_name} " + ) + + def _update_inject_meta(self, meta: Dict[str, Any], is_feed: bool) -> None: + """ + The issue: `HeuristicsArticleNavigationPage` has only `AnyResponse` as a dependency, so + the current implementation of `ScrapyZyteApiProvider` always uses `HttpResponse` + to produce the ArticleNavigation item, regardless of the `extract_from` argument. + + This function forces `browserHtml` extraction when `extract_from=browserHtml` + for Article and ArticleNavigation pages, while continuing to use + `HttpResponse` for feeds. + """ + + if is_feed: + inject = meta["inject"].copy() + inject.append(HttpResponse) + meta["inject"] = inject + return None + + if self.args.extract_from == ExtractFrom.browserHtml: + inject = meta["inject"].copy() + inject.append(BrowserResponse) + meta["inject"] = inject + return None + + def _update_request_name(self, req: ProbabilityRequest) -> None: + replacements = { + "[heuristics][articleNavigation][article]": "[article or subCategories]", + "[heuristics][articleNavigation][feed items]": "[feed items or subCategories]", + } + for old_name, new_name in replacements.items(): + req.name = (req.name or "").replace(old_name, new_name) + + def start_requests(self) -> Iterable[scrapy.Request]: + if self.args.crawl_strategy == ArticleCrawlStrategy.full: + request_type = RequestType.SEED + probability = None + elif self.args.crawl_strategy == ArticleCrawlStrategy.direct_item: + request_type = RequestType.ARTICLE + probability = 1.0 + else: + self.logger.error( + f"The strategy `{self.args.crawl_strategy}` is not supported. " + f"Currently, only these strategies are supported: `full` and `direct_item`." 
+ ) + raise CloseSpider("not_supported_strategy_type") + + for url in self.start_urls: + meta = {"request_type": request_type} + yield self.get_parse_request( + ProbabilityRequest( + url=url, + name=f"[{request_type.value.name}]", + metadata=ProbabilityMetadata(probability=probability), + ), + meta=meta, + is_feed=False, + ) + + def parse_dynamic( + self, + response: DummyResponse, + dynamic: DynamicDeps, + ) -> Iterable[scrapy.Request]: + if Article in dynamic: + yield from self._parse_as_article(response, dynamic) + + if ArticleNavigation in dynamic: + yield from self._parse_as_navigation(response, dynamic) + + def _parse_as_article( + self, response: DummyResponse, dynamic: DynamicDeps + ) -> Iterable[scrapy.Request]: + yield dynamic[Article] + + def _parse_as_navigation( + self, response: DummyResponse, dynamic: DynamicDeps + ) -> Iterable[scrapy.Request]: + navigation = dynamic[ArticleNavigation] + + # Handle the nextPage link if it exists + if navigation.nextPage: + if not navigation.items: + self.logger.info( + f"Ignoring nextPage link {navigation.nextPage} since there " + f"are no article links found in {navigation.url}" + ) + else: + meta = { + "request_type": RequestType.NEXT_PAGE, + "increase_navigation_depth": False, + } + yield self.get_parse_request( + navigation.nextPage, meta=meta, is_feed=False + ) + + subcategories = navigation.subCategories or [] + items = navigation.items or [] + subcategories_urls = {req.url for req in subcategories} + items_urls = {req.url for req in items} + + # Preprocess the list of requests for final_navigation_page + if response.meta.get("final_navigation_page"): + self.logger.debug( + f"Navigation links from {response.url} response are not followed, because " + f"{response.meta.get('navigation_depth')} max navigation_depth has been reached." 
+ ) + self.crawler.stats.inc_value("navigation_depth/final_navigation_page") # type: ignore[union-attr] + subcategories_urls -= items_urls + + # Iterate over both subcategories and items + for req in items + subcategories: + # Determine request type and meta information + # `increase_navigation_depth` and `is_feed` flags are clearly defined for each request type + if req.url in subcategories_urls: + if req.url not in items_urls: + # Subcategory request only + is_feed = is_feed_request(req) + increase_navigation_depth = not is_feed + request_type = RequestType.NAVIGATION + else: + # Request for both subcategory and item + self._update_request_name(req) + is_feed = False + increase_navigation_depth = True + request_type = RequestType.ARTICLE_AND_NAVIGATION + else: + # Article request only + is_feed = False + increase_navigation_depth = False + request_type = RequestType.ARTICLE + + meta = { + "request_type": request_type, + # processed here to be able to customize this value for each request type + "increase_navigation_depth": increase_navigation_depth, + } + + yield self.get_parse_request(req, meta=meta, is_feed=is_feed) + + def get_parse_request( + self, + request: ProbabilityRequest, + meta: Optional[Dict[Any, Any]] = None, + is_feed: bool = False, + **kwargs, + ) -> scrapy.Request: + meta = meta or {} + request_type = meta["request_type"].value + meta.update( + { + "crawling_logs": { + "name": request.name, + "page_type": request_type.page_type, + "probability": request.get_probability(), + }, + "inject": request_type.inject, + }, + ) + self._update_inject_meta(meta, is_feed) + + return request.to_scrapy( + callback=self.parse_dynamic, + errback=self.errback_navigation, + priority=request_type.priority, + meta=meta, + **kwargs, + ) + + def errback_navigation(self, failure) -> None: + """Request error""" + comm_msg = "article_spider/request_error" + deps = failure.request.meta["inject"] + deps_msg = "-".join([d.__name__[0].lower() + d.__name__[1:] for d in deps]) + assert self.crawler.stats + self.crawler.stats.inc_value(f"{comm_msg}/{deps_msg}") diff --git a/zyte_spider_templates/utils.py b/zyte_spider_templates/utils.py index b00cea2..c17e4ed 100644 --- a/zyte_spider_templates/utils.py +++ b/zyte_spider_templates/utils.py @@ -1,8 +1,16 @@ +import hashlib +import logging +import os import re -from typing import List +from typing import List, Optional +import tldextract +from scrapy.crawler import Crawler +from scrapy.http import Request from scrapy.utils.url import parse_url +logger = logging.getLogger(__name__) + _URL_PATTERN = r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$" @@ -26,3 +34,83 @@ def load_url_list(urls: str) -> List[str]: f"URL list contained the following invalid URLs:\n{bad_url_list}" ) return result + + +def get_domain_fingerprint(url: str) -> str: + """ + Create a consistent 2-byte domain fingerprint by combining partial hashes + of the main domain (without TLD) and the subdomain components. 
+ """ + extracted = tldextract.extract(url) + main_domain = extracted.domain + subdomains = extracted.subdomain + + # Calculate partial hashes for each component + main_domain_hash = hashlib.sha1(main_domain.encode("utf-8")).hexdigest()[:2] + subdomain_hash = ( + hashlib.sha1(subdomains.encode("utf-8")).hexdigest()[:2] if subdomains else "00" + ) + + return main_domain_hash + subdomain_hash + + +def get_request_fingerprint(crawler: Crawler, request: Request) -> str: + """Create a fingerprint by including a domain-specific part.""" + + # Calculate domain fingerprint + domain_fingerprint = get_domain_fingerprint(request.url) + + # Calculate request fingerprint + request_fingerprint = crawler.request_fingerprinter.fingerprint(request).hex() # type: ignore[union-attr] + + # Combine the fingerprints by taking the 2-bytes (4 chars) domain fingerprint + # to create a domain-specific identifier. + # This optimization aids in efficient read/write operations in the Collection. + + return domain_fingerprint + request_fingerprint + + +def get_project_id(crawler: Crawler) -> Optional[str]: + """ + Retrieve the project ID required for IncrementalCrawlMiddleware. + + The function attempts to obtain the project ID in the following order: + 1. For Scrapy Cloud deployments, the project ID is automatically set as SCRAPY_PROJECT_ID + in the environment variables. + 2. Otherwise, it checks the ZYTE_PROJECT_ID environment variable. + 3. If still not found, it checks the spider setting named ZYTE_PROJECT_ID. + + """ + + if project_id := os.environ.get("SCRAPY_PROJECT_ID"): + logger.info( + f"Picked project id {project_id} from SCRAPY_PROJECT_ID env variable." + ) + return project_id + # Try to pick from manually set environmental variable + if project_id := os.environ.get("ZYTE_PROJECT_ID"): + logger.info( + f"Picked project id {project_id} from ZYTE_PROJECT_ID env variable." + ) + return project_id + # Try to pick from settings + if project_id := crawler.settings.get("ZYTE_PROJECT_ID"): + logger.info( + f"Picked project id {project_id} from the spider's ZYTE_PROJECT_ID setting." + ) + return project_id + raise ValueError( + "Zyte project id wasn't found in job data, env, or settings. " + "The env variable SCRAPY_PROJECT_ID or settings property ZYTE_PROJECT_ID was expected." + ) + + +def get_spider_name(crawler: Crawler) -> str: + if spider_name := os.environ.get("SHUB_VIRTUAL_SPIDER"): + logger.info( + f"Picked virtual spider name {spider_name} from the spider's SHUB_VIRTUAL_SPIDER setting." + ) + return spider_name + + logger.info(f"Picked spider name {crawler.spider.name} from the spider.") # type: ignore[union-attr] + return crawler.spider.name # type: ignore[union-attr] From d5b8adc945c9db21a56af46c3ae72a2c85aaa26b Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 22:18:52 +0300 Subject: [PATCH 03/22] update api.rst --- docs/reference/api.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/reference/api.rst b/docs/reference/api.rst index c20416a..bb0c49c 100644 --- a/docs/reference/api.rst +++ b/docs/reference/api.rst @@ -5,6 +5,8 @@ API Spiders ======= +.. autoclass:: zyte_spider_templates.ArticleSpider + .. autoclass:: zyte_spider_templates.BaseSpider .. autoclass:: zyte_spider_templates.EcommerceSpider @@ -15,6 +17,10 @@ Spiders Pages ===== +.. autoclass:: zyte_spider_templates.pages.DefaultSearchRequestTemplatePage + +.. autoclass:: zyte_spider_templates.pages.HeuristicsArticleNavigationPage + .. 
autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage From 98b4e9c9189d6cb38105bae7e85c32546be862e2 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Fri, 6 Dec 2024 23:03:06 +0300 Subject: [PATCH 04/22] add DefaultSearchRequestTemplatePage to pages api.rst --- docs/reference/index.rst | 0 zyte_spider_templates/pages/__init__.py | 1 + 2 files changed, 1 insertion(+) delete mode 100644 docs/reference/index.rst diff --git a/docs/reference/index.rst b/docs/reference/index.rst deleted file mode 100644 index e69de29..0000000 diff --git a/zyte_spider_templates/pages/__init__.py b/zyte_spider_templates/pages/__init__.py index 2867be8..24c5937 100644 --- a/zyte_spider_templates/pages/__init__.py +++ b/zyte_spider_templates/pages/__init__.py @@ -1,2 +1,3 @@ from .article_heuristics import HeuristicsArticleNavigationPage from .product_navigation_heuristics import HeuristicsProductNavigationPage +from .search_request_template import DefaultSearchRequestTemplatePage From e04c7b3d64ab034a152e0eca4e61a103b89bd525 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Sun, 8 Dec 2024 20:40:06 +0300 Subject: [PATCH 05/22] add tests for coverage utils.py --- tests/test_utils.py | 142 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 0faaf86..c765808 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,8 +1,17 @@ +import logging +import os +from unittest.mock import patch + import pytest +from scrapy import Request, Spider +from tests import get_crawler from zyte_spider_templates.utils import ( get_domain, get_domain_fingerprint, + get_project_id, + get_request_fingerprint, + get_spider_name, load_url_list, ) @@ -70,3 +79,136 @@ def test_load_url_list(input_urls, expected): ) def test_get_domain_fingerprint(url, expected_fingerprint): assert get_domain_fingerprint(url) == expected_fingerprint + + +@pytest.mark.parametrize( + "env_var_value, spider_name, expected_result, expected_log", + [ + ( + "virtual_spider_name", + "regular_spider_name", + "virtual_spider_name", + "Picked virtual spider name virtual_spider_name from the spider's SHUB_VIRTUAL_SPIDER setting.", + ), + ( + None, + "regular_spider_name", + "regular_spider_name", + "Picked spider name regular_spider_name from the spider.", + ), + ], +) +def test_get_spider_name( + env_var_value, spider_name, expected_result, expected_log, caplog +): + class TestSpider(Spider): + name = spider_name + + caplog.clear() + crawler = get_crawler() + crawler.spider = TestSpider() + + logger = logging.getLogger("zyte_spider_templates.utils") + logger.setLevel(logging.INFO) + + with patch.dict( + os.environ, + {"SHUB_VIRTUAL_SPIDER": env_var_value} if env_var_value else {}, + clear=True, + ): + result = get_spider_name(crawler) + assert result == expected_result + assert expected_log in caplog.text + + +@pytest.mark.parametrize( + "env_scrapy, env_zyte, settings_zyte, expected_result, expected_log, expect_exception", + [ + # SCRAPY_PROJECT_ID is set + ( + "123456", + None, + None, + "123456", + "Picked project id 123456 from SCRAPY_PROJECT_ID env variable.", + False, + ), + # ZYTE_PROJECT_ID is set in the environment + ( + None, + "654321", + None, + "654321", + "Picked project id 654321 from ZYTE_PROJECT_ID env variable.", + False, + ), + # ZYTE_PROJECT_ID is set in the settings + ( + None, + None, + "126534", + "126534", + "Picked project id 126534 from the spider's ZYTE_PROJECT_ID setting.", + False, + ), + # No project ID found, expect an exception + ( + 
None, + None, + None, + None, # No result expected + None, # No log expected + True, # Expect an exception + ), + ], +) +def test_get_project_id( + env_scrapy, + env_zyte, + settings_zyte, + expected_result, + expected_log, + expect_exception, + caplog, +): + caplog.clear() + + env_vars = {} + if env_scrapy: + env_vars["SCRAPY_PROJECT_ID"] = env_scrapy + if env_zyte: + env_vars["ZYTE_PROJECT_ID"] = env_zyte + + with patch.dict(os.environ, env_vars, clear=True): + crawler = get_crawler() + + if settings_zyte: + crawler.settings.set("ZYTE_PROJECT_ID", settings_zyte) + + with caplog.at_level(logging.INFO, logger="zyte_spider_templates.utils"): + if expect_exception: + with pytest.raises( + ValueError, + match="Zyte project id wasn't found in job data, env, or settings.", + ): + get_project_id(crawler) + else: + assert get_project_id(crawler) == expected_result + assert expected_log in caplog.text + + +def test_get_request_fingerprint(): + url = "https://example.com" + domain_fp = "ffeeddccbbaa" + request_fp = "aabbccddeeff" + + with patch( + "zyte_spider_templates.utils.get_domain_fingerprint", return_value=domain_fp + ): + crawler = get_crawler() + with patch.object(crawler, "request_fingerprinter") as mock_fingerprinter: + mock_fingerprinter.fingerprint.return_value = bytes.fromhex(request_fp) + request = Request(url) + result = get_request_fingerprint(crawler, request) + assert result == domain_fp + request_fp + mock_fingerprinter.fingerprint.assert_called_once_with(request) From 2623342542c2150ce897160c9604d296b5aa3a2b Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Sun, 8 Dec 2024 22:42:18 +0300 Subject: [PATCH 06/22] add tests for coverage PageParamsMiddlewareBase --- tests/test_middlewares.py | 88 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 9aa4a7b..7f32a27 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -20,6 +20,7 @@ CrawlingLogsMiddleware, MaxRequestsPerSeedDownloaderMiddleware, OffsiteRequestsPerSeedMiddleware, + PageParamsMiddlewareBase, TrackSeedsSpiderMiddleware, ) @@ -1358,3 +1359,90 @@ class TestSpider(Spider): OffsiteRequestsPerSeedMiddleware.from_crawler(crawler=crawler), OffsiteRequestsPerSeedMiddleware, ) + + +def test_page_params_middleware_base_update_page_params(): + request_url = "https://example.com/1" + request = Request(request_url) + crawler = get_crawler() + middleware = PageParamsMiddlewareBase(crawler) + assert middleware.update_page_params(request, {}) is None + + +def test_page_params_middleware_base__update_page_params(): + request_url = "https://example.com/1" + request = Request(request_url) + crawler = get_crawler() + middleware = PageParamsMiddlewareBase(crawler) + assert middleware._update_page_params(request) is None + assert "page_params" in request.meta + assert request.meta["page_params"] == {} + + request = Request(request_url, meta={"page_params": {"test": 1}}) + crawler = get_crawler() + middleware = PageParamsMiddlewareBase(crawler) + assert middleware._update_page_params(request) is None + assert "page_params" in request.meta + assert request.meta["page_params"] == {"test": 1} + + +def test_page_params_middleware_base(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler() + crawler.spider = TestSpider() + + request_url = "https://example.com/1" + request = Request(request_url) + item = Article(url="https://example.com/article") + response = Response(url=request_url, request=request) + middleware = 
PageParamsMiddlewareBase(crawler) + result = list( + middleware.process_spider_output(response, [request, item], crawler.spider) + ) + assert result[0].meta["page_params"] == {} + + request = Request(url=request_url) + result = list(middleware.process_start_requests([request], crawler.spider)) + assert result[0].meta["page_params"] == {} + + request = Request(request_url, meta={"page_params": {"test": 1}}) + response = Response(url=request_url, request=request) + middleware = PageParamsMiddlewareBase(crawler) + result = list( + middleware.process_spider_output(response, [request, item], crawler.spider) + ) + assert result[0].meta["page_params"] == {"test": 1} + + result = list(middleware.process_start_requests([request], crawler.spider)) + assert result[0].meta["page_params"] == {"test": 1} + + +@pytest.mark.asyncio +async def test_page_params_middleware_base_async(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler() + crawler.spider = TestSpider() + + request_url = "https://example.com/1" + request = Request(request_url) + item = Article(url="https://example.com/article") + response = Response(url=request_url, request=request) + middleware = PageParamsMiddlewareBase(crawler) + result = await result_as_async_gen( + middleware, response, [request, item], crawler.spider + ) + assert result[0].meta["page_params"] == {} + assert result[1] == item + + request = Request(request_url, meta={"page_params": {"test": 1}}) + response = Response(url=request_url, request=request) + middleware = PageParamsMiddlewareBase(crawler) + result = await result_as_async_gen( + middleware, response, [request, item], crawler.spider + ) + assert result[0].meta["page_params"] == {"test": 1} + assert result[1] == item From 1045617044093d82e51fa64740a991c21bbc603c Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 08:48:41 +0300 Subject: [PATCH 07/22] add tests for OnlyFeedsMiddleware and PageParamsMiddlewareBase --- tests/test_middlewares.py | 76 ++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 7f32a27..1f9dea7 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -20,10 +20,13 @@ CrawlingLogsMiddleware, MaxRequestsPerSeedDownloaderMiddleware, OffsiteRequestsPerSeedMiddleware, + OnlyFeedsMiddleware, PageParamsMiddlewareBase, TrackSeedsSpiderMiddleware, ) +from . 
import get_crawler as get_crawler_with_settings + def get_fingerprinter(crawler): return lambda request: crawler.request_fingerprinter.fingerprint(request).hex() @@ -1361,31 +1364,6 @@ class TestSpider(Spider): ) -def test_page_params_middleware_base_update_page_params(): - request_url = "https://example.com/1" - request = Request(request_url) - crawler = get_crawler() - middleware = PageParamsMiddlewareBase(crawler) - assert middleware.update_page_params(request, {}) is None - - -def test_page_params_middleware_base__update_page_params(): - request_url = "https://example.com/1" - request = Request(request_url) - crawler = get_crawler() - middleware = PageParamsMiddlewareBase(crawler) - assert middleware._update_page_params(request) is None - assert "page_params" in request.meta - assert request.meta["page_params"] == {} - - request = Request(request_url, meta={"page_params": {"test": 1}}) - crawler = get_crawler() - middleware = PageParamsMiddlewareBase(crawler) - assert middleware._update_page_params(request) is None - assert "page_params" in request.meta - assert request.meta["page_params"] == {"test": 1} - - def test_page_params_middleware_base(): class TestSpider(Spider): name = "test" @@ -1401,11 +1379,11 @@ class TestSpider(Spider): result = list( middleware.process_spider_output(response, [request, item], crawler.spider) ) - assert result[0].meta["page_params"] == {} + assert result[0].meta["page_params"] == {} # type: ignore[union-attr] request = Request(url=request_url) result = list(middleware.process_start_requests([request], crawler.spider)) - assert result[0].meta["page_params"] == {} + assert result[0].meta["page_params"] == {} # type: ignore[union-attr] request = Request(request_url, meta={"page_params": {"test": 1}}) response = Response(url=request_url, request=request) @@ -1413,10 +1391,10 @@ class TestSpider(Spider): result = list( middleware.process_spider_output(response, [request, item], crawler.spider) ) - assert result[0].meta["page_params"] == {"test": 1} + assert result[0].meta["page_params"] == {"test": 1} # type: ignore[union-attr] result = list(middleware.process_start_requests([request], crawler.spider)) - assert result[0].meta["page_params"] == {"test": 1} + assert result[0].meta["page_params"] == {"test": 1} # type: ignore[union-attr] @pytest.mark.asyncio @@ -1427,6 +1405,7 @@ class TestSpider(Spider): crawler = get_crawler() crawler.spider = TestSpider() + # Default page_params value request_url = "https://example.com/1" request = Request(request_url) item = Article(url="https://example.com/article") @@ -1438,6 +1417,7 @@ class TestSpider(Spider): assert result[0].meta["page_params"] == {} assert result[1] == item + # Explicit page_params in request meta request = Request(request_url, meta={"page_params": {"test": 1}}) response = Response(url=request_url, request=request) middleware = PageParamsMiddlewareBase(crawler) @@ -1446,3 +1426,41 @@ class TestSpider(Spider): ) assert result[0].meta["page_params"] == {"test": 1} assert result[1] == item + + +def test_only_feeds_middleware(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler_with_settings() + crawler.spider = TestSpider() + crawler.spider.settings = Settings({}) + + # ONLY_FEEDS_ENABLED = True + crawler.spider.settings.set("ONLY_FEEDS_ENABLED", True) + middleware = OnlyFeedsMiddleware(crawler) + assert middleware is not None + + # ONLY_FEEDS_ENABLED = False + crawler.spider.settings.set("ONLY_FEEDS_ENABLED", False) + with pytest.raises(NotConfigured): + OnlyFeedsMiddleware(crawler) 
+ + # Explicit only_feeds in request meta + crawler.spider.settings.set("ONLY_FEEDS_ENABLED", True) + middleware = OnlyFeedsMiddleware(crawler) + + request_url = "https://example.com/1" + request = Request(request_url, meta={"only_feeds": False}) + + page_params = {} + middleware.update_page_params(request, page_params) + + assert page_params["only_feeds"] is False + + # Default only_feeds value + request = Request(request_url) + page_params = {} + middleware.update_page_params(request, page_params) + + assert page_params["only_feeds"] is True From 124e12350f3878d12f29cd5cb5680a98a8bd2b04 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 08:54:21 +0300 Subject: [PATCH 08/22] add tests for DummyDupeFilter --- tests/test_middlewares.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 1f9dea7..fdbe3a8 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -18,13 +18,13 @@ from zyte_spider_templates.middlewares import ( AllowOffsiteMiddleware, CrawlingLogsMiddleware, + DummyDupeFilter, MaxRequestsPerSeedDownloaderMiddleware, OffsiteRequestsPerSeedMiddleware, OnlyFeedsMiddleware, PageParamsMiddlewareBase, TrackSeedsSpiderMiddleware, ) - from . import get_crawler as get_crawler_with_settings @@ -1464,3 +1464,11 @@ class TestSpider(Spider): middleware.update_page_params(request, page_params) assert page_params["only_feeds"] is True + + +def test_dummy_dupe_filter(): + request_url = "https://example.com/1" + request = Request(request_url) + middleware = DummyDupeFilter() + assert middleware.request_seen(request) is False + From 80c7022c175b4e364e87e7db88d161b5288fb1ee Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 09:06:09 +0300 Subject: [PATCH 09/22] add from_crawler for TrackSeedsSpiderMiddleware test --- tests/test_middlewares.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index fdbe3a8..79541fb 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -1218,6 +1218,8 @@ class TestSpider(Spider): crawler.spider = TestSpider() crawler.stats = StatsCollector(crawler) middleware = TrackSeedsSpiderMiddleware(crawler) + assert isinstance(middleware.from_crawler(crawler), TrackSeedsSpiderMiddleware) + start_request_url = "https://example.com/1" start_request = Request(url=start_request_url, meta=meta) result = list(middleware.process_start_requests([start_request], TestSpider())) From 492a7bebb1f64843ebc094cf308426ab9e8b4266 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 09:06:30 +0300 Subject: [PATCH 10/22] add from_crawler for PageParamsMiddlewareBase test --- tests/test_middlewares.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 79541fb..2aba146 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -1378,6 +1378,9 @@ class TestSpider(Spider): item = Article(url="https://example.com/article") response = Response(url=request_url, request=request) middleware = PageParamsMiddlewareBase(crawler) + assert middleware.crawler == crawler + assert isinstance(middleware.from_crawler(crawler), PageParamsMiddlewareBase) + result = list( middleware.process_spider_output(response, [request, item], crawler.spider) ) From 758ec6fde4f00c0fe50c15d4796d6e589890a3c1 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 10:02:20 +0300 Subject: [PATCH 11/22] add tests for 
DupeFilterSpiderMiddleware + tune others --- tests/test_middlewares.py | 138 +++++++++++++++++++++++++++++++++----- 1 file changed, 123 insertions(+), 15 deletions(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 2aba146..c7b6be8 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -19,6 +19,7 @@ AllowOffsiteMiddleware, CrawlingLogsMiddleware, DummyDupeFilter, + DupeFilterSpiderMiddleware, MaxRequestsPerSeedDownloaderMiddleware, OffsiteRequestsPerSeedMiddleware, OnlyFeedsMiddleware, @@ -1381,25 +1382,29 @@ class TestSpider(Spider): assert middleware.crawler == crawler assert isinstance(middleware.from_crawler(crawler), PageParamsMiddlewareBase) - result = list( + processed_output = list( middleware.process_spider_output(response, [request, item], crawler.spider) ) - assert result[0].meta["page_params"] == {} # type: ignore[union-attr] + assert processed_output[0].meta["page_params"] == {} # type: ignore[union-attr] request = Request(url=request_url) - result = list(middleware.process_start_requests([request], crawler.spider)) - assert result[0].meta["page_params"] == {} # type: ignore[union-attr] + processed_output = list( + middleware.process_start_requests([request], crawler.spider) + ) + assert processed_output[0].meta["page_params"] == {} # type: ignore[union-attr] request = Request(request_url, meta={"page_params": {"test": 1}}) response = Response(url=request_url, request=request) middleware = PageParamsMiddlewareBase(crawler) - result = list( + processed_output = list( middleware.process_spider_output(response, [request, item], crawler.spider) ) - assert result[0].meta["page_params"] == {"test": 1} # type: ignore[union-attr] + assert processed_output[0].meta["page_params"] == {"test": 1} # type: ignore[union-attr] - result = list(middleware.process_start_requests([request], crawler.spider)) - assert result[0].meta["page_params"] == {"test": 1} # type: ignore[union-attr] + processed_output = list( + middleware.process_start_requests([request], crawler.spider) + ) + assert processed_output[0].meta["page_params"] == {"test": 1} # type: ignore[union-attr] @pytest.mark.asyncio @@ -1416,21 +1421,21 @@ class TestSpider(Spider): item = Article(url="https://example.com/article") response = Response(url=request_url, request=request) middleware = PageParamsMiddlewareBase(crawler) - result = await result_as_async_gen( + processed_output = await result_as_async_gen( middleware, response, [request, item], crawler.spider ) - assert result[0].meta["page_params"] == {} - assert result[1] == item + assert processed_output[0].meta["page_params"] == {} + assert processed_output[1] == item # Explicit page_params in request meta request = Request(request_url, meta={"page_params": {"test": 1}}) response = Response(url=request_url, request=request) middleware = PageParamsMiddlewareBase(crawler) - result = await result_as_async_gen( + processed_output = await result_as_async_gen( middleware, response, [request, item], crawler.spider ) - assert result[0].meta["page_params"] == {"test": 1} - assert result[1] == item + assert processed_output[0].meta["page_params"] == {"test": 1} + assert processed_output[1] == item def test_only_feeds_middleware(): @@ -1458,7 +1463,7 @@ class TestSpider(Spider): request_url = "https://example.com/1" request = Request(request_url, meta={"only_feeds": False}) - page_params = {} + page_params: dict = {} middleware.update_page_params(request, page_params) assert page_params["only_feeds"] is False @@ -1477,3 +1482,106 @@ def 
test_dummy_dupe_filter(): middleware = DummyDupeFilter() assert middleware.request_seen(request) is False + +def test_dupe_filter_spider_middleware(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler_with_settings() + crawler.spider = TestSpider() + crawler.stats = StatsCollector(crawler) + item = Article(url="https://example.com/article") + + middleware = DupeFilterSpiderMiddleware(crawler) + assert middleware.crawler == crawler + assert isinstance(middleware.from_crawler(crawler), DupeFilterSpiderMiddleware) + + # Test process_start_requests + start_requests = [ + Request(url="https://example.com/1"), + Request(url="https://example.com/2"), + ] + processed_requests = list( + middleware.process_start_requests(start_requests, crawler.spider) + ) + assert len(processed_requests) == 2 + + # Simulate duplicate request + start_requests = [ + Request(url="https://example.com/1"), + Request(url="https://example.com/3"), + ] + processed_requests = list( + middleware.process_start_requests(start_requests, crawler.spider) + ) + assert len(processed_requests) == 1 + assert processed_requests[0].url == "https://example.com/3" + + # Test process_spider_output + response = Response(url="https://example.com/1") + result = [ + Request(url="https://example.com/4"), + item, + Request(url="https://example.com/1"), + ] + processed_output = list( + middleware.process_spider_output(response, result, crawler.spider) + ) + assert len(processed_output) == 2 + assert processed_output[0].url == "https://example.com/4" # type: ignore[union-attr] + assert processed_output[1] == item + + +@pytest.mark.asyncio +async def test_dupe_filter_spider_middleware_async(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler_with_settings() + crawler.spider = TestSpider() + crawler.stats = StatsCollector(crawler) + item = Article(url="https://example.com/article") + + middleware = DupeFilterSpiderMiddleware(crawler) + assert middleware.crawler == crawler + assert isinstance(middleware.from_crawler(crawler), DupeFilterSpiderMiddleware) + + # Test process_start_requests + start_requests = [ + Request(url="https://example.com/1"), + Request(url="https://example.com/2"), + ] + processed_requests = list( + middleware.process_start_requests(start_requests, crawler.spider) + ) + assert len(processed_requests) == 2 + + # Simulate duplicate request + start_requests = [ + Request(url="https://example.com/1"), + Request(url="https://example.com/3"), + ] + processed_requests = list( + middleware.process_start_requests(start_requests, crawler.spider) + ) + assert len(processed_requests) == 1 + assert processed_requests[0].url == "https://example.com/3" + + # Test process_spider_output_async + response = Response(url="https://example.com/1") + processed_output = await result_as_async_gen( + middleware, + response, + [ + Request(url="https://example.com/4"), + item, + Request(url="https://example.com/1"), + ], + crawler.spider, + ) + + assert len(processed_output) == 2 + if processed_output[0].url == "https://example.com/4": + assert processed_output[1] == item + else: + assert processed_output[1] == "https://example.com/4" From 61fab4e3e31110f0a88279ad6fba261567be9de5 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 10:36:18 +0300 Subject: [PATCH 12/22] fix async test for DupeFilterSpiderMiddleware --- tests/test_middlewares.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 
c7b6be8..72dcfd2 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -26,6 +26,7 @@ PageParamsMiddlewareBase, TrackSeedsSpiderMiddleware, ) + from . import get_crawler as get_crawler_with_settings @@ -1543,13 +1544,14 @@ class TestSpider(Spider): item = Article(url="https://example.com/article") middleware = DupeFilterSpiderMiddleware(crawler) + assert middleware.crawler == crawler assert isinstance(middleware.from_crawler(crawler), DupeFilterSpiderMiddleware) # Test process_start_requests start_requests = [ - Request(url="https://example.com/1"), - Request(url="https://example.com/2"), + Request(url="https://example.com/11"), + Request(url="https://example.com/21"), ] processed_requests = list( middleware.process_start_requests(start_requests, crawler.spider) @@ -1558,30 +1560,28 @@ class TestSpider(Spider): # Simulate duplicate request start_requests = [ - Request(url="https://example.com/1"), - Request(url="https://example.com/3"), + Request(url="https://example.com/11"), + Request(url="https://example.com/31"), ] processed_requests = list( middleware.process_start_requests(start_requests, crawler.spider) ) assert len(processed_requests) == 1 - assert processed_requests[0].url == "https://example.com/3" + assert processed_requests[0].url == "https://example.com/31" # Test process_spider_output_async - response = Response(url="https://example.com/1") + response = Response(url="https://example.com/11") processed_output = await result_as_async_gen( middleware, response, [ - Request(url="https://example.com/4"), + Request(url="https://example.com/41"), item, - Request(url="https://example.com/1"), + Request(url="https://example.com/11"), ], crawler.spider, ) assert len(processed_output) == 2 - if processed_output[0].url == "https://example.com/4": - assert processed_output[1] == item - else: - assert processed_output[1] == "https://example.com/4" + assert processed_output[0].url == "https://example.com/41" + assert processed_output[1] == item From b24de1e0915ddb5392ad71fdbe6e522d51242aa9 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 10:46:15 +0300 Subject: [PATCH 13/22] add test for from_crawler for TrackSeedsSpiderMiddleware, MaxRequestsPerSeedDownloaderMiddleware --- tests/test_middlewares.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 72dcfd2..8594718 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -455,6 +455,10 @@ def test_process_request(): crawler = _get_seed_crawler() spider_middleware = TrackSeedsSpiderMiddleware(crawler) downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler) + assert downloader_middleware.crawler == crawler + assert isinstance(downloader_middleware.from_crawler(crawler), MaxRequestsPerSeedDownloaderMiddleware) + assert isinstance(spider_middleware.from_crawler(crawler), TrackSeedsSpiderMiddleware) + request_gen: Iterable[Union[Request, Item]] request: Union[Request, Item] From 8ad76ce8748b5fba8207dc6ccc0b288f7e9081e2 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 11:49:51 +0300 Subject: [PATCH 14/22] add test for TrackNavigationDepthSpiderMiddleware --- tests/test_middlewares.py | 100 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 97 insertions(+), 3 deletions(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 8594718..0ae9aad 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -24,9 +24,9 @@ OffsiteRequestsPerSeedMiddleware, 
OnlyFeedsMiddleware, PageParamsMiddlewareBase, + TrackNavigationDepthSpiderMiddleware, TrackSeedsSpiderMiddleware, ) - from . import get_crawler as get_crawler_with_settings @@ -456,8 +456,13 @@ def test_process_request(): spider_middleware = TrackSeedsSpiderMiddleware(crawler) downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler) assert downloader_middleware.crawler == crawler - assert isinstance(downloader_middleware.from_crawler(crawler), MaxRequestsPerSeedDownloaderMiddleware) - assert isinstance(spider_middleware.from_crawler(crawler), TrackSeedsSpiderMiddleware) + assert isinstance( + downloader_middleware.from_crawler(crawler), + MaxRequestsPerSeedDownloaderMiddleware, + ) + assert isinstance( + spider_middleware.from_crawler(crawler), TrackSeedsSpiderMiddleware + ) request_gen: Iterable[Union[Request, Item]] request: Union[Request, Item] @@ -1589,3 +1594,92 @@ class TestSpider(Spider): assert len(processed_output) == 2 assert processed_output[0].url == "https://example.com/41" assert processed_output[1] == item + + +def test_track_navigation_depth_spider_middleware(): + class TestSpider(Spider): + name = "test" + + crawler = get_crawler_with_settings() + crawler.spider = TestSpider() + crawler.stats = StatsCollector(crawler) + crawler.spider.settings = Settings({}) + request_url_1 = "https://example.com/1" + request_url_2 = "https://example.com/2" + item = Article(url="https://example.com/article") + + # NAVIGATION_DEPTH_LIMIT = 1 + crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 1) + middleware = TrackNavigationDepthSpiderMiddleware(crawler) + assert middleware is not None + assert middleware.max_navigation_depth == 1 + + assert isinstance( + middleware.from_crawler(crawler), TrackNavigationDepthSpiderMiddleware + ) + + # NAVIGATION_DEPTH_LIMIT = 0 + crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 0) + with pytest.raises(NotConfigured): + TrackNavigationDepthSpiderMiddleware(crawler) + + # Explicit final_navigation_page in request meta + crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 1) + middleware = TrackNavigationDepthSpiderMiddleware(crawler) + + request = Request(request_url_1, meta={"final_navigation_page": True}) + page_params: dict = {} + middleware.update_page_params(request, page_params) + assert page_params["skip_subcategories"] is True + + # Default final_navigation_page value + request = Request(request_url_1) + page_params = {} + middleware.update_page_params(request, page_params) + assert page_params["skip_subcategories"] is None + + # Test process_start_requests with NAVIGATION_DEPTH_LIMIT = 1 + crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 1) + middleware = TrackNavigationDepthSpiderMiddleware(crawler) + processed_requests = list( + middleware.process_start_requests( + [Request(url=request_url_1), Request(url=request_url_2)], crawler.spider + ) + ) + assert len(processed_requests) == 2 + for i in (0, 1): + assert processed_requests[i].meta["final_navigation_page"] is True + assert processed_requests[i].meta["navigation_depth"] == 1 + assert processed_requests[i].meta["page_params"] == {"skip_subcategories": None} + + # Test process_start_requests with NAVIGATION_DEPTH_LIMIT = 2 + crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 2) + middleware = TrackNavigationDepthSpiderMiddleware(crawler) + processed_requests = list( + middleware.process_start_requests( + [Request(url=request_url_1), Request(url=request_url_2)], crawler.spider + ) + ) + assert len(processed_requests) == 2 + for i in (0, 1): + assert 
processed_requests[i].meta["final_navigation_page"] is False + assert processed_requests[i].meta["navigation_depth"] == 1 + assert processed_requests[i].meta["page_params"] == {"skip_subcategories": None} + + # Test process_spider_output + crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 1) + middleware = TrackNavigationDepthSpiderMiddleware(crawler) + + response = Response(url=request_url_1, request=Request(url=request_url_1, meta={})) + result = [ + Request(url=request_url_1, meta={}), + item, + Request(url=request_url_2, meta={}), + ] + processed_output = list( + middleware.process_spider_output(response, result, crawler.spider) + ) + assert len(processed_output) == 3 + assert processed_output[0].url == request_url_1 # type: ignore[union-attr] + assert processed_output[1] == item + assert processed_output[2].url == request_url_2 # type: ignore[union-attr] From 728f84e5a04dc908862c6f3c65f120745f41ebf5 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Mon, 9 Dec 2024 11:51:33 +0300 Subject: [PATCH 15/22] formatting --- tests/test_middlewares.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 0ae9aad..8eaccdb 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -27,6 +27,7 @@ TrackNavigationDepthSpiderMiddleware, TrackSeedsSpiderMiddleware, ) + from . import get_crawler as get_crawler_with_settings From 970743bb06c4464a483cbb39528dc3c355434b47 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Thu, 12 Dec 2024 18:59:43 +0300 Subject: [PATCH 16/22] fix auth issue on staging --- zyte_spider_templates/_incremental/manager.py | 4 ++-- zyte_spider_templates/utils.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/zyte_spider_templates/_incremental/manager.py b/zyte_spider_templates/_incremental/manager.py index 275c0b3..6e4ff28 100644 --- a/zyte_spider_templates/_incremental/manager.py +++ b/zyte_spider_templates/_incremental/manager.py @@ -13,6 +13,7 @@ from zyte_common_items import Item from zyte_spider_templates.utils import ( + get_client, get_project_id, get_request_fingerprint, get_spider_name, @@ -57,8 +58,7 @@ def get_collection_name(self, crawler): ) def init_collection(self, project_id, collection_name) -> None: - # auth is taken from SH_APIKEY or SHUB_JOBAUTH - client = scrapinghub.ScrapinghubClient() + client = get_client() collection = client.get_project(project_id).collections.get_store( collection_name ) diff --git a/zyte_spider_templates/utils.py b/zyte_spider_templates/utils.py index c17e4ed..5c5f709 100644 --- a/zyte_spider_templates/utils.py +++ b/zyte_spider_templates/utils.py @@ -4,6 +4,7 @@ import re from typing import List, Optional +import scrapinghub import tldextract from scrapy.crawler import Crawler from scrapy.http import Request @@ -114,3 +115,11 @@ def get_spider_name(crawler: Crawler) -> str: logger.info(f"Picked spider name {crawler.spider.name} from the spider.") # type: ignore[union-attr] return crawler.spider.name # type: ignore[union-attr] + + +def get_client() -> scrapinghub.ScrapinghubClient: + # auth is taken from SH_APIKEY or SHUB_JOBAUTH + return scrapinghub.ScrapinghubClient( + dash_endpoint=os.getenv("SHUB_APIURL"), + endpoint=os.getenv("SHUB_STORAGE"), + ) From 951bcd749fd52d86978473656d39195cc0998f88 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Thu, 12 Dec 2024 19:12:54 +0300 Subject: [PATCH 17/22] test --- zyte_spider_templates/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zyte_spider_templates/utils.py 
b/zyte_spider_templates/utils.py index 5c5f709..435b4f4 100644 --- a/zyte_spider_templates/utils.py +++ b/zyte_spider_templates/utils.py @@ -119,6 +119,8 @@ def get_spider_name(crawler: Crawler) -> str: def get_client() -> scrapinghub.ScrapinghubClient: # auth is taken from SH_APIKEY or SHUB_JOBAUTH + # TODO TESTING + logger.info(f"SHUB_APIURL: {os.getenv('SHUB_APIURL')}, SHUB_STORAGE: {os.getenv('SHUB_STORAGE')}") # type: ignore[union-attr] return scrapinghub.ScrapinghubClient( dash_endpoint=os.getenv("SHUB_APIURL"), endpoint=os.getenv("SHUB_STORAGE"), From 68371f2196c84c5e6bb1b5dd1922b521d52c696e Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Thu, 12 Dec 2024 19:56:10 +0300 Subject: [PATCH 18/22] clean todo --- zyte_spider_templates/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/zyte_spider_templates/utils.py b/zyte_spider_templates/utils.py index 435b4f4..5c5f709 100644 --- a/zyte_spider_templates/utils.py +++ b/zyte_spider_templates/utils.py @@ -119,8 +119,6 @@ def get_spider_name(crawler: Crawler) -> str: def get_client() -> scrapinghub.ScrapinghubClient: # auth is taken from SH_APIKEY or SHUB_JOBAUTH - # TODO TESTING - logger.info(f"SHUB_APIURL: {os.getenv('SHUB_APIURL')}, SHUB_STORAGE: {os.getenv('SHUB_STORAGE')}") # type: ignore[union-attr] return scrapinghub.ScrapinghubClient( dash_endpoint=os.getenv("SHUB_APIURL"), endpoint=os.getenv("SHUB_STORAGE"), From 769bcd0e4f676839cbbc8027bf649f693e5dd689 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 13 Dec 2024 17:45:56 +0500 Subject: [PATCH 19/22] Only enable DropLowProbabilityItemPipeline for the articles spider. --- tests/test_addon.py | 8 -------- zyte_spider_templates/_addon.py | 2 -- zyte_spider_templates/spiders/article.py | 7 +++++++ 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/test_addon.py b/tests/test_addon.py index 0f9b04c..6ca9205 100644 --- a/tests/test_addon.py +++ b/tests/test_addon.py @@ -3,7 +3,6 @@ from duplicate_url_discarder_rules import RULE_PATHS from packaging import version from scrapy.utils.test import get_crawler -from zyte_common_items.pipelines import DropLowProbabilityItemPipeline from zyte_spider_templates import ( AllowOffsiteMiddleware, @@ -44,7 +43,6 @@ def _test_setting_changes(initial_settings, expected_settings): "DOWNLOADER_MIDDLEWARES", "SCRAPY_POET_PROVIDERS", "SPIDER_MIDDLEWARES", - "ITEM_PIPELINES", ): if setting not in crawler.settings: assert setting not in expected_settings @@ -91,9 +89,6 @@ def _test_setting_changes(initial_settings, expected_settings): TrackSeedsSpiderMiddleware: 550, CrawlingLogsMiddleware: 1000, }, - "ITEM_PIPELINES": { - DropLowProbabilityItemPipeline: 0, - }, "SPIDER_MODULES": [ "zyte_spider_templates.spiders", ], @@ -138,9 +133,6 @@ def test_poet_setting_changes_since_scrapy_2_11_2(initial_settings, expected_set TrackSeedsSpiderMiddleware: 550, CrawlingLogsMiddleware: 1000, }, - "ITEM_PIPELINES": { - DropLowProbabilityItemPipeline: 0, - }, "SPIDER_MODULES": [ "zyte_spider_templates.spiders", ], diff --git a/zyte_spider_templates/_addon.py b/zyte_spider_templates/_addon.py index 3f63d36..a4ed0a7 100644 --- a/zyte_spider_templates/_addon.py +++ b/zyte_spider_templates/_addon.py @@ -4,7 +4,6 @@ from duplicate_url_discarder_rules import RULE_PATHS from scrapy.settings import BaseSettings from scrapy.utils.misc import load_object -from zyte_common_items.pipelines import DropLowProbabilityItemPipeline from zyte_spider_templates import ( AllowOffsiteMiddleware, @@ -144,7 +143,6 @@ def update_settings(self, settings: 
From 769bcd0e4f676839cbbc8027bf649f693e5dd689 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Fri, 13 Dec 2024 17:45:56 +0500
Subject: [PATCH 19/22] Only enable DropLowProbabilityItemPipeline for the articles spider.

---
 tests/test_addon.py                      | 8 --------
 zyte_spider_templates/_addon.py          | 2 --
 zyte_spider_templates/spiders/article.py | 7 +++++++
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/tests/test_addon.py b/tests/test_addon.py
index 0f9b04c..6ca9205 100644
--- a/tests/test_addon.py
+++ b/tests/test_addon.py
@@ -3,7 +3,6 @@
 from duplicate_url_discarder_rules import RULE_PATHS
 from packaging import version
 from scrapy.utils.test import get_crawler
-from zyte_common_items.pipelines import DropLowProbabilityItemPipeline
 
 from zyte_spider_templates import (
     AllowOffsiteMiddleware,
@@ -44,7 +43,6 @@ def _test_setting_changes(initial_settings, expected_settings):
         "DOWNLOADER_MIDDLEWARES",
         "SCRAPY_POET_PROVIDERS",
         "SPIDER_MIDDLEWARES",
-        "ITEM_PIPELINES",
     ):
         if setting not in crawler.settings:
             assert setting not in expected_settings
@@ -91,9 +89,6 @@ def _test_setting_changes(initial_settings, expected_settings):
             TrackSeedsSpiderMiddleware: 550,
             CrawlingLogsMiddleware: 1000,
         },
-        "ITEM_PIPELINES": {
-            DropLowProbabilityItemPipeline: 0,
-        },
        "SPIDER_MODULES": [
            "zyte_spider_templates.spiders",
        ],
@@ -138,9 +133,6 @@ def test_poet_setting_changes_since_scrapy_2_11_2(initial_settings, expected_set
             TrackSeedsSpiderMiddleware: 550,
             CrawlingLogsMiddleware: 1000,
         },
-        "ITEM_PIPELINES": {
-            DropLowProbabilityItemPipeline: 0,
-        },
        "SPIDER_MODULES": [
            "zyte_spider_templates.spiders",
        ],
diff --git a/zyte_spider_templates/_addon.py b/zyte_spider_templates/_addon.py
index 3f63d36..a4ed0a7 100644
--- a/zyte_spider_templates/_addon.py
+++ b/zyte_spider_templates/_addon.py
@@ -4,7 +4,6 @@
 from duplicate_url_discarder_rules import RULE_PATHS
 from scrapy.settings import BaseSettings
 from scrapy.utils.misc import load_object
-from zyte_common_items.pipelines import DropLowProbabilityItemPipeline
 
 from zyte_spider_templates import (
     AllowOffsiteMiddleware,
@@ -144,7 +143,6 @@ def update_settings(self, settings: BaseSettings) -> None:
             settings, "SPIDER_MIDDLEWARES", TrackNavigationDepthSpiderMiddleware, 110
         )
         _setdefault(settings, "SPIDER_MIDDLEWARES", CrawlingLogsMiddleware, 1000)
-        _setdefault(settings, "ITEM_PIPELINES", DropLowProbabilityItemPipeline, 0)
 
         try:
             from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware
diff --git a/zyte_spider_templates/spiders/article.py b/zyte_spider_templates/spiders/article.py
index 6b829d3..9833c05 100644
--- a/zyte_spider_templates/spiders/article.py
+++ b/zyte_spider_templates/spiders/article.py
@@ -9,6 +9,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 from scrapy.crawler import Crawler
 from scrapy.exceptions import CloseSpider
+from scrapy.settings import BaseSettings
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
 from web_poet import BrowserResponse, HttpResponse
@@ -18,6 +19,7 @@
     ProbabilityMetadata,
     ProbabilityRequest,
 )
+from zyte_common_items.pipelines import DropLowProbabilityItemPipeline
 
 from zyte_spider_templates.documentation import document_enum
 from zyte_spider_templates.pages.article_heuristics import is_feed_request
@@ -190,6 +192,11 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self:
 
         return spider
 
+    @classmethod
+    def update_settings(cls, settings: BaseSettings) -> None:
+        super().update_settings(settings)
+        settings["ITEM_PIPELINES"][DropLowProbabilityItemPipeline] = 0
+
     def _init_input(self):
         urls_file = self.args.urls_file
         if urls_file:
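
Scoping DropLowProbabilityItemPipeline to the article spider via update_settings() uses a standard Scrapy mechanism, and it keeps ITEM_PIPELINES untouched for the other templates. A small illustrative sketch of the same pattern, with a hypothetical pipeline and spider rather than the template's real classes:

    from scrapy import Spider
    from scrapy.settings import BaseSettings


    class DropDraftItemsPipeline:
        """Hypothetical pipeline that only this spider should use."""

        def process_item(self, item, spider):
            return item


    class ExampleArticleSpider(Spider):
        name = "example_article"

        @classmethod
        def update_settings(cls, settings: BaseSettings) -> None:
            # update_settings() runs per spider class before the crawl starts,
            # so the pipeline is enabled only for this spider, not project-wide.
            super().update_settings(settings)
            settings["ITEM_PIPELINES"][DropDraftItemsPipeline] = 0
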
" + "Only ASCII alphanumeric characters and underscores are allowed in the collection name.", "title": "Incremental Collection Name", }, "crawl_strategy": { diff --git a/zyte_spider_templates/spiders/article.py b/zyte_spider_templates/spiders/article.py index 9833c05..f7dad2a 100644 --- a/zyte_spider_templates/spiders/article.py +++ b/zyte_spider_templates/spiders/article.py @@ -95,9 +95,11 @@ class IncrementalParam(BaseModel): "provided those previous runs had `incremental` argument set to `true`." "Using a different collection name makes sense, for example, in the following cases:" "- different spiders share a collection." - "- the same spider uses different collections (e.g., for development runs vs. production runs)." + "- the same spider uses different collections (e.g., for development runs vs. production runs). " + "Only ASCII alphanumeric characters and underscores are allowed in the collection name." ), default=None, + pattern="^[a-zA-Z0-9_]+$", ) From 0a6cad372a3302b18d888f1291b2c3d903730344 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 13 Dec 2024 21:16:41 +0500 Subject: [PATCH 21/22] fix formatting --- tests/test_article.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_article.py b/tests/test_article.py index fc47d9c..919812d 100644 --- a/tests/test_article.py +++ b/tests/test_article.py @@ -313,7 +313,10 @@ def test_metadata(): "type": "boolean", }, "incremental_collection_name": { - "anyOf": [{"type": "string", "pattern": "^[a-zA-Z0-9_]+$"}, {"type": "null"}], + "anyOf": [ + {"type": "string", "pattern": "^[a-zA-Z0-9_]+$"}, + {"type": "null"}, + ], "default": None, "description": "Name of the Zyte Scrapy Cloud Collection used during an incremental crawl." "By default, a Collection named after the spider (or virtual spider) is used, " From 9f2ef4b66596044570cec0a28655a5f7d143f8be Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 13 Dec 2024 22:36:27 +0500 Subject: [PATCH 22/22] Mark Articles spider as experimental It should work fine, but we have less experience running it in production, as compared to e-commerce template. --- tests/test_article.py | 2 +- zyte_spider_templates/spiders/article.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_article.py b/tests/test_article.py index 919812d..9a492c3 100644 --- a/tests/test_article.py +++ b/tests/test_article.py @@ -245,7 +245,7 @@ def test_metadata(): expected_metadata = { "template": True, "title": "Article", - "description": "Template for spiders that extract article data from news or blog websites.", + "description": "[Experimental] Template for spiders that extract article data from news or blog websites.", "param_schema": { "groups": [ { diff --git a/zyte_spider_templates/spiders/article.py b/zyte_spider_templates/spiders/article.py index f7dad2a..61a6be6 100644 --- a/zyte_spider_templates/spiders/article.py +++ b/zyte_spider_templates/spiders/article.py @@ -175,7 +175,7 @@ class ArticleSpider(Args[ArticleSpiderParams], BaseSpider): metadata: Dict[str, Any] = { **BaseSpider.metadata, "title": "Article", - "description": "Template for spiders that extract article data from news or blog websites.", + "description": "[Experimental] Template for spiders that extract article data from news or blog websites.", } @classmethod