
Merge remote-tracking branch 'origin/articles_to_main' into job-posting-product-list
wRAR committed Dec 13, 2024
2 parents ea3ee4b + 8927379 commit 78c495c
Showing 6 changed files with 117 additions and 12 deletions.
8 changes: 0 additions & 8 deletions tests/test_addon.py
@@ -3,7 +3,6 @@
from duplicate_url_discarder_rules import RULE_PATHS
from packaging import version
from scrapy.utils.test import get_crawler
from zyte_common_items.pipelines import DropLowProbabilityItemPipeline

from zyte_spider_templates import (
AllowOffsiteMiddleware,
@@ -44,7 +43,6 @@ def _test_setting_changes(initial_settings, expected_settings):
"DOWNLOADER_MIDDLEWARES",
"SCRAPY_POET_PROVIDERS",
"SPIDER_MIDDLEWARES",
"ITEM_PIPELINES",
):
if setting not in crawler.settings:
assert setting not in expected_settings
@@ -91,9 +89,6 @@ def _test_setting_changes(initial_settings, expected_settings):
TrackSeedsSpiderMiddleware: 550,
CrawlingLogsMiddleware: 1000,
},
"ITEM_PIPELINES": {
DropLowProbabilityItemPipeline: 0,
},
"SPIDER_MODULES": [
"zyte_spider_templates.spiders",
],
@@ -138,9 +133,6 @@ def test_poet_setting_changes_since_scrapy_2_11_2(initial_settings, expected_set
TrackSeedsSpiderMiddleware: 550,
CrawlingLogsMiddleware: 1000,
},
"ITEM_PIPELINES": {
DropLowProbabilityItemPipeline: 0,
},
"SPIDER_MODULES": [
"zyte_spider_templates.spiders",
],
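With the pipeline moved out of the add-on (see the article.py change below), here is a rough sketch of the behaviour these removed assertions imply; the add-on entry point path and its ADDONS priority are assumptions, not taken from this diff:

```python
# Hypothetical check: after this commit, enabling only the add-on should not
# register DropLowProbabilityItemPipeline project-wide.
from scrapy.utils.test import get_crawler
from zyte_common_items.pipelines import DropLowProbabilityItemPipeline


def test_addon_leaves_item_pipelines_alone():
    crawler = get_crawler(
        # Assumed add-on path and priority; the real tests build the crawler
        # through their own helper.
        settings_dict={"ADDONS": {"zyte_spider_templates.Addon": 500}}
    )
    pipelines = crawler.settings.getdict("ITEM_PIPELINES")
    assert DropLowProbabilityItemPipeline not in pipelines
```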
99 changes: 99 additions & 0 deletions tests/test_middlewares.py
@@ -25,6 +25,7 @@
OffsiteRequestsPerSeedMiddleware,
OnlyFeedsMiddleware,
PageParamsMiddlewareBase,
TrackNavigationDepthSpiderMiddleware,
TrackSeedsSpiderMiddleware,
)

@@ -506,6 +507,15 @@ def test_process_request():
crawler = _get_seed_crawler()
spider_middleware = TrackSeedsSpiderMiddleware(crawler)
downloader_middleware = MaxRequestsPerSeedDownloaderMiddleware(crawler)
assert downloader_middleware.crawler == crawler
assert isinstance(
downloader_middleware.from_crawler(crawler),
MaxRequestsPerSeedDownloaderMiddleware,
)
assert isinstance(
spider_middleware.from_crawler(crawler), TrackSeedsSpiderMiddleware
)

request_gen: Iterable[Union[Request, Item]]
request: Union[Request, Item]

@@ -1636,3 +1646,92 @@ class TestSpider(Spider):
assert len(processed_output) == 2
assert processed_output[0].url == "https://example.com/41"
assert processed_output[1] == item


def test_track_navigation_depth_spider_middleware():
class TestSpider(Spider):
name = "test"

crawler = get_crawler_with_settings()
crawler.spider = TestSpider()
crawler.stats = StatsCollector(crawler)
crawler.spider.settings = Settings({})
request_url_1 = "https://example.com/1"
request_url_2 = "https://example.com/2"
item = Article(url="https://example.com/article")

# NAVIGATION_DEPTH_LIMIT = 1
crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 1)
middleware = TrackNavigationDepthSpiderMiddleware(crawler)
assert middleware is not None
assert middleware.max_navigation_depth == 1

assert isinstance(
middleware.from_crawler(crawler), TrackNavigationDepthSpiderMiddleware
)

# NAVIGATION_DEPTH_LIMIT = 0
crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 0)
with pytest.raises(NotConfigured):
TrackNavigationDepthSpiderMiddleware(crawler)

# Explicit final_navigation_page in request meta
crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 1)
middleware = TrackNavigationDepthSpiderMiddleware(crawler)

request = Request(request_url_1, meta={"final_navigation_page": True})
page_params: dict = {}
middleware.update_page_params(request, page_params)
assert page_params["skip_subcategories"] is True

# Default final_navigation_page value
request = Request(request_url_1)
page_params = {}
middleware.update_page_params(request, page_params)
assert page_params["skip_subcategories"] is None

# Test process_start_requests with NAVIGATION_DEPTH_LIMIT = 1
crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 1)
middleware = TrackNavigationDepthSpiderMiddleware(crawler)
processed_requests = list(
middleware.process_start_requests(
[Request(url=request_url_1), Request(url=request_url_2)], crawler.spider
)
)
assert len(processed_requests) == 2
for i in (0, 1):
assert processed_requests[i].meta["final_navigation_page"] is True
assert processed_requests[i].meta["navigation_depth"] == 1
assert processed_requests[i].meta["page_params"] == {"skip_subcategories": None}

# Test process_start_requests with NAVIGATION_DEPTH_LIMIT = 2
crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 2)
middleware = TrackNavigationDepthSpiderMiddleware(crawler)
processed_requests = list(
middleware.process_start_requests(
[Request(url=request_url_1), Request(url=request_url_2)], crawler.spider
)
)
assert len(processed_requests) == 2
for i in (0, 1):
assert processed_requests[i].meta["final_navigation_page"] is False
assert processed_requests[i].meta["navigation_depth"] == 1
assert processed_requests[i].meta["page_params"] == {"skip_subcategories": None}

# Test process_spider_output
crawler.spider.settings.set("NAVIGATION_DEPTH_LIMIT", 1)
middleware = TrackNavigationDepthSpiderMiddleware(crawler)

response = Response(url=request_url_1, request=Request(url=request_url_1, meta={}))
result = [
Request(url=request_url_1, meta={}),
item,
Request(url=request_url_2, meta={}),
]
processed_output = list(
middleware.process_spider_output(response, result, crawler.spider)
)
assert len(processed_output) == 3
assert processed_output[0].url == request_url_1 # type: ignore[union-attr]
assert processed_output[1] == item
assert processed_output[2].url == request_url_2 # type: ignore[union-attr]
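For orientation, a minimal sketch of enabling the middleware exercised above in a project's settings; the priority mirrors the _setdefault call in _addon.py further down, but the snippet itself is illustrative and not part of the diff:

```python
# settings.py sketch: navigation-depth tracking is driven by
# NAVIGATION_DEPTH_LIMIT; a value of 0 makes the middleware raise
# NotConfigured (i.e. stay disabled), as the test above shows.
from zyte_spider_templates import TrackNavigationDepthSpiderMiddleware

SPIDER_MIDDLEWARES = {
    TrackNavigationDepthSpiderMiddleware: 110,
}
NAVIGATION_DEPTH_LIMIT = 2  # start requests get navigation_depth == 1
```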
2 changes: 0 additions & 2 deletions zyte_spider_templates/_addon.py
@@ -4,7 +4,6 @@
from duplicate_url_discarder_rules import RULE_PATHS
from scrapy.settings import BaseSettings
from scrapy.utils.misc import load_object
from zyte_common_items.pipelines import DropLowProbabilityItemPipeline

from zyte_spider_templates import (
AllowOffsiteMiddleware,
@@ -144,7 +143,6 @@ def update_settings(self, settings: BaseSettings) -> None:
settings, "SPIDER_MIDDLEWARES", TrackNavigationDepthSpiderMiddleware, 110
)
_setdefault(settings, "SPIDER_MIDDLEWARES", CrawlingLogsMiddleware, 1000)
_setdefault(settings, "ITEM_PIPELINES", DropLowProbabilityItemPipeline, 0)

try:
from scrapy.downloadermiddlewares.offsite import OffsiteMiddleware
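For readers unfamiliar with the helper, a rough sketch of what _setdefault presumably does here; the real implementation is not shown in this diff and also has to cope with components referenced by import path:

```python
# Assumed behaviour: register a component at the given priority only when the
# user has not configured that component already.
def _setdefault(settings, setting_name, component, priority):
    component_map = settings[setting_name]
    if component not in component_map:
        component_map[component] = priority
```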
4 changes: 2 additions & 2 deletions zyte_spider_templates/_incremental/manager.py
@@ -13,6 +13,7 @@
from zyte_common_items import Item

from zyte_spider_templates.utils import (
get_client,
get_project_id,
get_request_fingerprint,
get_spider_name,
@@ -57,8 +58,7 @@ def get_collection_name(self, crawler):
)

def init_collection(self, project_id, collection_name) -> None:
# auth is taken from SH_APIKEY or SHUB_JOBAUTH
client = scrapinghub.ScrapinghubClient()
client = get_client()
collection = client.get_project(project_id).collections.get_store(
collection_name
)
7 changes: 7 additions & 0 deletions zyte_spider_templates/spiders/article.py
@@ -9,6 +9,7 @@
from pydantic import BaseModel, ConfigDict, Field
from scrapy.crawler import Crawler
from scrapy.exceptions import CloseSpider
from scrapy.settings import BaseSettings
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from web_poet import BrowserResponse, HttpResponse
@@ -18,6 +19,7 @@
ProbabilityMetadata,
ProbabilityRequest,
)
from zyte_common_items.pipelines import DropLowProbabilityItemPipeline

from zyte_spider_templates.documentation import document_enum
from zyte_spider_templates.pages.article_heuristics import is_feed_request
@@ -190,6 +192,11 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self:

return spider

@classmethod
def update_settings(cls, settings: BaseSettings) -> None:
super().update_settings(settings)
settings["ITEM_PIPELINES"][DropLowProbabilityItemPipeline] = 0

def _init_input(self):
urls_file = self.args.urls_file
if urls_file:
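Together with the add-on change above, this makes DropLowProbabilityItemPipeline a per-spider concern rather than a project-wide one. A short sketch of Scrapy's update_settings hook being used this way; the spider below is a simplified stand-in, not the template itself:

```python
from scrapy import Spider
from scrapy.settings import BaseSettings, Settings
from zyte_common_items.pipelines import DropLowProbabilityItemPipeline


class ExampleArticleSpider(Spider):
    name = "example_article"

    @classmethod
    def update_settings(cls, settings: BaseSettings) -> None:
        # Same idea as the override added in article.py: the pipeline is only
        # registered when this spider class is the one being crawled.
        super().update_settings(settings)
        settings["ITEM_PIPELINES"][DropLowProbabilityItemPipeline] = 0


settings = Settings()
ExampleArticleSpider.update_settings(settings)
assert DropLowProbabilityItemPipeline in settings["ITEM_PIPELINES"]
```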
9 changes: 9 additions & 0 deletions zyte_spider_templates/utils.py
@@ -4,6 +4,7 @@
import re
from typing import List, Optional

import scrapinghub
import tldextract
from scrapy.crawler import Crawler
from scrapy.http import Request
@@ -114,3 +115,11 @@ def get_spider_name(crawler: Crawler) -> str:

logger.info(f"Picked spider name {crawler.spider.name} from the spider.") # type: ignore[union-attr]
return crawler.spider.name # type: ignore[union-attr]


def get_client() -> scrapinghub.ScrapinghubClient:
# auth is taken from SH_APIKEY or SHUB_JOBAUTH
return scrapinghub.ScrapinghubClient(
dash_endpoint=os.getenv("SHUB_APIURL"),
endpoint=os.getenv("SHUB_STORAGE"),
)
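A brief usage sketch of the new helper, mirroring the init_collection call site in manager.py above; the project id and collection name are placeholders:

```python
# Assumes SH_APIKEY or SHUB_JOBAUTH is set in the environment, as the comment
# above notes; SHUB_APIURL / SHUB_STORAGE are optional endpoint overrides.
from zyte_spider_templates.utils import get_client

client = get_client()
store = client.get_project(12345).collections.get_store("incremental_cache")
store.set({"_key": "https://example.com/article", "value": "seen"})
```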
