
Commit 7c079f0

Set dont_filter=True on additional requests (#199)
1 parent 045f3bf

2 files changed (+44, -2)


scrapy_poet/downloader.py (+1, -1)
@@ -21,7 +21,7 @@ async def scrapy_downloader(request: HttpRequest):
             f"one of type: {type(request)!r}."
         )
 
-    scrapy_request = http_request_to_scrapy_request(request)
+    scrapy_request = http_request_to_scrapy_request(request, dont_filter=True)
 
     if scrapy_request.method == "HEAD":
         scrapy_request.meta["dont_redirect"] = True
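
For context (this sketch is not part of the commit): dont_filter=True is a standard scrapy.Request flag that exempts a request from both the scheduler's duplicate filter and offsite filtering. A minimal sketch of the Scrapy behavior the change relies on:

    from scrapy import Request

    # Scheduled normally: can be dropped as a duplicate of an earlier request
    # to the same URL, or as off-site when the domain is not in the spider's
    # allowed_domains.
    filtered = Request("https://example.com")

    # Exempt from both filters. After this commit, this is what
    # http_request_to_scrapy_request() produces for every additional request
    # issued through HttpClient.
    unfiltered = Request("https://example.com", dont_filter=True)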

tests/test_downloader.py (+43, -1)
@@ -3,6 +3,7 @@
 from functools import partial
 from typing import Any, Callable, List, Optional, Sequence, Set
 from unittest import mock
+from urllib.parse import urlparse
 
 import attr
 import pytest
@@ -347,7 +348,7 @@ async def parse(self, response, page: ItemPage):
 
 
 @inlineCallbacks
-def test_additional_requests_dont_filter() -> None:
+def test_additional_requests_dont_filter_duplicate() -> None:
     """Verify that while duplicate regular requests are filtered out,
     additional requests are not (neither relative to the main requests nor
     relative to each other).
@@ -392,6 +393,47 @@ async def parse(self, response, page: ItemPage):
     assert items == [{"a": "a"}]
 
 
+@inlineCallbacks
+def test_additional_requests_dont_filter_offsite() -> None:
+    pytest.importorskip("scrapy.downloadermiddlewares.offsite")
+
+    items = []
+
+    with MockServer(EchoResource) as server:
+
+        @attr.define
+        class ItemPage(WebPage):
+            http: HttpClient
+
+            async def to_item(self):
+                response1 = await self.http.request(
+                    server.root_url,
+                    body=b"a",
+                )
+                # Not filtered out by the offsite middleware because it is an
+                # additional request.
+                response2 = await self.http.request("data:,b")
+                return {response1.body.decode(): response2.body.decode()}
+
+        class TestSpider(Spider):
+            name = "test_spider"
+            allowed_domains = [urlparse(server.root_url).hostname]
+
+            def start_requests(self):
+                yield Request(server.root_url, callback=self.parse)
+                # Filtered out by the offsite middleware:
+                yield Request("data:,", callback=self.parse)
+
+            async def parse(self, response, page: ItemPage):
+                item = await page.to_item()
+                items.append(item)
+
+        crawler = make_crawler(TestSpider)
+        yield crawler.crawl()
+
+    assert items == [{"a": "b"}]
+
+
 @inlineCallbacks
 def test_additional_requests_no_cb_deps() -> None:
     # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/135
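
A note on the test's data: URLs (context, not part of the commit): Scrapy resolves RFC 2397 data URIs in-process, so self.http.request("data:,b") returns a response whose body is b"b" without any network traffic. That is why, after the offsite middleware drops the spider's second start request, the single remaining item is {"a": "b"}. The same decoding can be reproduced with the standard library alone:

    from urllib.request import urlopen

    # "data:,b" is a text/plain data URI whose payload is the literal text
    # "b"; urllib decodes it locally, with no network access involved.
    with urlopen("data:,b") as response:
        assert response.read() == b"b"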
