@@ -3,6 +3,7 @@
 from functools import partial
 from typing import Any, Callable, List, Optional, Sequence, Set
 from unittest import mock
+from urllib.parse import urlparse
 
 import attr
 import pytest
@@ -347,7 +348,7 @@ async def parse(self, response, page: ItemPage):
 
 
 @inlineCallbacks
-def test_additional_requests_dont_filter() -> None:
+def test_additional_requests_dont_filter_duplicate() -> None:
     """Verify that while duplicate regular requests are filtered out,
     additional requests are not (neither relative to the main requests nor
     relative to each other).
@@ -392,6 +393,47 @@ async def parse(self, response, page: ItemPage):
     assert items == [{"a": "a"}]
 
 
+@inlineCallbacks
+def test_additional_requests_dont_filter_offsite() -> None:
+    pytest.importorskip("scrapy.downloadermiddlewares.offsite")
+
+    items = []
+
+    with MockServer(EchoResource) as server:
+
+        @attr.define
+        class ItemPage(WebPage):
+            http: HttpClient
+
+            async def to_item(self):
+                response1 = await self.http.request(
+                    server.root_url,
+                    body=b"a",
+                )
+                # Not filtered out by the offsite middleware because it is an
+                # additional request.
+                response2 = await self.http.request("data:,b")
+                return {response1.body.decode(): response2.body.decode()}
+
+        class TestSpider(Spider):
+            name = "test_spider"
+            allowed_domains = [urlparse(server.root_url).hostname]
+
+            def start_requests(self):
+                yield Request(server.root_url, callback=self.parse)
+                # Filtered out by the offsite middleware:
+                yield Request("data:,", callback=self.parse)
+
+            async def parse(self, response, page: ItemPage):
+                item = await page.to_item()
+                items.append(item)
+
+        crawler = make_crawler(TestSpider)
+        yield crawler.crawl()
+
+    assert items == [{"a": "b"}]
+
+
 @inlineCallbacks
 def test_additional_requests_no_cb_deps() -> None:
     # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/135
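
The new test exercises two behaviors at once: Scrapy's offsite downloader middleware (hence the pytest.importorskip guard, which skips the test on Scrapy versions that do not ship scrapy.downloadermiddlewares.offsite) drops start requests whose URL has no hostname in allowed_domains, while additional requests issued through HttpClient get through, presumably because they are flagged with dont_filter, the same exemption that lets them bypass the duplicate-request filter covered by the renamed test above. A minimal sketch of that decision logic under those assumptions; should_process, its signature, and the host values are illustrative, not Scrapy's actual API:

from urllib.parse import urlparse


def should_process(url, allowed_domains, dont_filter=False):
    # Assumption: filter-exempt requests skip the domain check entirely.
    if dont_filter:
        return True
    hostname = urlparse(url).hostname
    # "data:," URLs have no hostname, so they can never be on-site.
    return hostname is not None and any(
        hostname == domain or hostname.endswith("." + domain)
        for domain in allowed_domains
    )


assert should_process("http://127.0.0.1:8998/", ["127.0.0.1"])
assert not should_process("data:,", ["127.0.0.1"])  # dropped start request
assert should_process("data:,b", ["127.0.0.1"], dont_filter=True)  # additional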
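
As for the expected item, {"a": "b"}: parse runs only once, for the start request that survives the offsite check, and the item maps the body echoed back by the mock server (b"a", assuming EchoResource echoes the request body it receives) to the inline payload of the "data:,b" URL. Data URLs carry their response body in the URL itself, as the standard library shows:

from urllib.request import urlopen

# Everything after "data:," is the response body.
assert urlopen("data:,b").read() == b"b"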