Merge pull request #13 from Gallaecio/fix-deprecation
Add CI and handle deprecation warnings
kalessin authored Nov 29, 2024
2 parents efd3c7a + f8b4f4c commit 5b260db
Showing 6 changed files with 151 additions and 47 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,33 @@
name: Tests
on: [push, pull_request]

jobs:
  tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - python-version: "3.8"
            env:
              TOXENV: pinned
          - python-version: "3.8"
            env:
              TOXENV: py
          - python-version: "3.9"
            env:
              TOXENV: py

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Run tests
        env: ${{ matrix.env }}
        run: |
          pip install -U tox
          tox
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -1,2 +1,7 @@
[tool.black]
line-length = 120

[tool.pytest.ini_options]
filterwarnings = [
    "ignore::DeprecationWarning:.*\\bfrontera\\b",
]
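
Note: the pytest entry above uses Python's standard warning-filter syntax (action:message:category:module), so the final field is a regex matched against the name of the module that raises the warning. A rough stdlib equivalent, for illustration only:

import warnings

# Ignore DeprecationWarning raised from any module whose dotted path
# contains the word "frontera" (e.g. frontera.core.manager).
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
    module=r".*\bfrontera\b",
)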
9 changes: 7 additions & 2 deletions scrapy_frontera/converters.py
@@ -4,7 +4,6 @@

from scrapy.http.request import Request as ScrapyRequest
from scrapy.http.response import Response as ScrapyResponse
from scrapy.utils.request import request_fingerprint

from w3lib.util import to_bytes, to_native_str

@@ -23,6 +22,12 @@ class RequestConverter(BaseRequestConverter):

    def __init__(self, spider):
        self.spider = spider
        crawler = spider.crawler
        if hasattr(crawler, "request_fingerprinter"):
            self.request_fingerprint = crawler.request_fingerprinter.fingerprint
        else:
            from scrapy.utils.request import request_fingerprint
            self.request_fingerprint = request_fingerprint

    def to_frontier(self, scrapy_request):
        """request: Scrapy > Frontier"""
@@ -56,7 +61,7 @@ def to_frontier(self, scrapy_request):
        fake_url = fingerprint_scrapy_request.url + str(uuid.uuid4())
        fingerprint_scrapy_request = fingerprint_scrapy_request.replace(url=fake_url)
        meta[b"frontier_fingerprint"] = scrapy_request.meta.get(
            "frontier_fingerprint", request_fingerprint(fingerprint_scrapy_request)
            "frontier_fingerprint", self.request_fingerprint(fingerprint_scrapy_request)
        )
        callback_slot_prefix_map = self.spider.crawler.settings.getdict("FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP")
        frontier_slot_prefix_num_slots = callback_slot_prefix_map.get(get_callback_name(scrapy_request))
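
Note: the __init__ hunk above picks the fingerprint implementation once per converter. A minimal standalone sketch of the same compatibility pattern, assuming (as this diff does) that Scrapy 2.7+ exposes crawler.request_fingerprinter while older releases only provide the since-deprecated scrapy.utils.request.request_fingerprint; the helper name below is hypothetical:

def pick_request_fingerprint(crawler):
    # Scrapy >= 2.7: fingerprints come from a per-crawler component
    # configured via REQUEST_FINGERPRINTER_IMPLEMENTATION.
    if hasattr(crawler, "request_fingerprinter"):
        return crawler.request_fingerprinter.fingerprint
    # Older Scrapy: fall back to the module-level helper.
    from scrapy.utils.request import request_fingerprint
    return request_fingerprint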
2 changes: 1 addition & 1 deletion setup.py
@@ -11,7 +11,7 @@
    packages = find_packages(),
    install_requires=(
        'frontera==0.7.1',
        'scrapy',
        'scrapy>=1.7.0',
    ),
    classifiers = [
        'Development Status :: 5 - Production/Stable',
131 changes: 87 additions & 44 deletions tests/test_scheduler.py
@@ -4,6 +4,7 @@
from twisted.internet import defer

from scrapy import Request, Spider
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
from scrapy.http import Response
from scrapy.settings import Settings
from scrapy.utils.test import get_crawler
@@ -16,13 +17,14 @@
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_frontera.middlewares.SchedulerDownloaderMiddleware': 0,
    },
    'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
    'SPIDER_MIDDLEWARES': {
        'scrapy_frontera.middlewares.SchedulerSpiderMiddleware': 0,
    },
}


class TestSpider(Spider):
class _TestSpider(Spider):
    name = 'test'
    success = False
    success2 = False
@@ -55,7 +57,7 @@ def parse3(self, response):
        self.success3 = True


class TestSpider2(Spider):
class _TestSpider2(Spider):
    name = 'test'
    success = False
    success2 = False
@@ -71,7 +73,7 @@ def parse2(self, response):
        self.success2 = True


class TestSpider3(Spider):
class _TestSpider3(Spider):
    name = 'test'
    success = 0

@@ -108,18 +110,33 @@ def tearDown(self):
        while TestDownloadHandler.results:
            TestDownloadHandler.results.pop()

    @staticmethod
    def setup_mocked_handler(mocked_handler, results=None):
        handler = TestDownloadHandler()
        if results:
            handler.set_results(results)
        if hasattr(HTTP11DownloadHandler, "from_crawler"):
            mocked_handler.from_crawler.return_value = handler
        else:
            mocked_handler.return_value = handler


    @defer.inlineCallbacks
    def test_start_requests(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([Response(url='http://example.com'),
                                                     Response(url='http://example2.com')])
            self.setup_mocked_handler(
                mocked_handler,
                [
                    Response(url='http://example.com'),
                    Response(url='http://example2.com'),
                ],
            )

            with patch('frontera.contrib.backends.memory.MemoryBaseBackend.links_extracted') as mocked_links_extracted:
                mocked_links_extracted.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                crawler = get_crawler(TestSpider, settings)
                crawler = get_crawler(_TestSpider, settings)

                yield self.runner.crawl(crawler)
                self.assertTrue(crawler.spider.success)
@@ -129,14 +146,18 @@ def test_start_requests(self):
    @defer.inlineCallbacks
    def test_cf_store(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([Response(url='http://example.com', body=b'cf_store')])
            self.setup_mocked_handler(
                mocked_handler,
                [
                    Response(url='http://example.com', body=b'cf_store'),
                ],
            )

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                crawler = get_crawler(TestSpider, settings)
                crawler = get_crawler(_TestSpider, settings)

                yield self.runner.crawl(crawler)
                self.assertTrue(crawler.spider.success)
@@ -145,8 +166,12 @@
    @defer.inlineCallbacks
    def test_callback_requests_to_frontier(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([Response(url='http://example.com')])
            self.setup_mocked_handler(
                mocked_handler,
                [
                    Response(url='http://example.com'),
                ],
            )

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
@@ -155,7 +180,7 @@ def test_callback_requests_to_frontier(self):
                settings.setdict({
                    'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse2'],
                })
                crawler = get_crawler(TestSpider2, settings)
                crawler = get_crawler(_TestSpider2, settings)

                yield self.runner.crawl(crawler)
                self.assertTrue(crawler.spider.success)
@@ -165,9 +190,13 @@
    @defer.inlineCallbacks
    def test_callback_requests_to_frontier_with_implicit_callback(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([Response(url='http://example.com'),
                                                     Response(url='http://example2.com')])
            self.setup_mocked_handler(
                mocked_handler,
                [
                    Response(url='http://example.com'),
                    Response(url='http://example2.com'),
                ],
            )

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
@@ -176,7 +205,7 @@ def test_callback_requests_to_frontier_with_implicit_callback(self):
                settings.setdict({
                    'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                })
                crawler = get_crawler(TestSpider3, settings)
                crawler = get_crawler(_TestSpider3, settings)

                yield self.runner.crawl(crawler)
                self.assertEqual(crawler.spider.success, 1)
@@ -185,10 +214,9 @@
    @defer.inlineCallbacks
    def test_callback_requests_slot_map(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            resp1 = Response(url='http://example.com')
            resp2 = Response(url='http://example2.com')
            mocked_handler.return_value.set_results([resp1, resp2])
            self.setup_mocked_handler(mocked_handler, [resp1, resp2])

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
@@ -198,7 +226,7 @@ def test_callback_requests_slot_map(self):
                    'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                    'FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP': {'parse': 'myslot'},
                })
                crawler = get_crawler(TestSpider3, settings)
                crawler = get_crawler(_TestSpider3, settings)

                yield self.runner.crawl(crawler)
                self.assertEqual(crawler.spider.success, 1)
@@ -210,10 +238,9 @@
    @defer.inlineCallbacks
    def test_callback_requests_slot_map_with_num_slots(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            resp1 = Response(url='http://example.com')
            resp2 = Response(url='http://example2.com')
            mocked_handler.return_value.set_results([resp1, resp2])
            self.setup_mocked_handler(mocked_handler, [resp1, resp2])

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
@@ -223,7 +250,7 @@ def test_callback_requests_slot_map_with_num_slots(self):
                    'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                    'FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP': {'parse': 'myslot/5'},
                })
                crawler = get_crawler(TestSpider3, settings)
                crawler = get_crawler(_TestSpider3, settings)

                yield self.runner.crawl(crawler)
                self.assertEqual(crawler.spider.success, 1)
@@ -236,16 +263,20 @@
    @defer.inlineCallbacks
    def test_start_requests_to_frontier(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([Response(url='http://example.com'),
                                                     Response(url='http://example2.com')])
            self.setup_mocked_handler(
                mocked_handler,
                [
                    Response(url='http://example.com'),
                    Response(url='http://example2.com'),
                ],
            )

            settings = Settings()
            settings.setdict(TEST_SETTINGS, priority='cmdline')
            settings.setdict({
                'FRONTERA_SCHEDULER_START_REQUESTS_TO_FRONTIER': True,
            })
            crawler = get_crawler(TestSpider, settings)
            crawler = get_crawler(_TestSpider, settings)

            yield self.runner.crawl(crawler)
            self.assertTrue(crawler.spider.success)
@@ -254,7 +285,7 @@ def test_start_requests_to_frontier(self):
    @defer.inlineCallbacks
    def test_start_requests_to_frontier_ii(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            self.setup_mocked_handler(mocked_handler)

            with patch('frontera.contrib.backends.memory.MemoryBaseBackend.add_seeds') as mocked_add_seeds:
                mocked_add_seeds.return_value = None
@@ -264,22 +295,26 @@ def test_start_requests_to_frontier_ii(self):
                    'FRONTERA_SCHEDULER_START_REQUESTS_TO_FRONTIER': True,
                })

                crawler = get_crawler(TestSpider, settings)
                crawler = get_crawler(_TestSpider, settings)

                yield self.runner.crawl(crawler)
                self.assertEqual(mocked_add_seeds.call_count, 1)

    @defer.inlineCallbacks
    def test_start_handle_errback(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([Response(url='http://example.com'),
                                                     Response(url='http://example2.com', status=501),
                                                     Response(url='http://example3.com')])
            self.setup_mocked_handler(
                mocked_handler,
                [
                    Response(url='http://example.com'),
                    Response(url='http://example2.com', status=501),
                    Response(url='http://example3.com'),
                ],
            )

            settings = Settings()
            settings.setdict(TEST_SETTINGS, priority='cmdline')
            crawler = get_crawler(TestSpider, settings)
            crawler = get_crawler(_TestSpider, settings)

            yield self.runner.crawl(crawler)
            self.assertTrue(crawler.spider.success)
@@ -293,14 +328,18 @@ def test_start_handle_errback_with_cf_store(self):
        Test that we get the expected result with errback cf_store
        """
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([Response(url='http://example.com'),
                                                     Response(url='http://example2.com', status=501, body=b'cf_store'),
                                                     Response(url='http://example3.com')])
            self.setup_mocked_handler(
                mocked_handler,
                [
                    Response(url='http://example.com'),
                    Response(url='http://example2.com', status=501, body=b'cf_store'),
                    Response(url='http://example3.com'),
                ],
            )

            settings = Settings()
            settings.setdict(TEST_SETTINGS, priority='cmdline')
            crawler = get_crawler(TestSpider, settings)
            crawler = get_crawler(_TestSpider, settings)

            yield self.runner.crawl(crawler)
            self.assertTrue(crawler.spider.success)
@@ -314,16 +353,20 @@ def test_start_handle_errback_with_cf_store_ii(self):
        Test that we scheduled cf_store request on backend queue
        """
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([Response(url='http://example.com'),
                                                     Response(url='http://example2.com', status=501, body=b'cf_store'),
                                                     Response(url='http://example3.com')])
            self.setup_mocked_handler(
                mocked_handler,
                [
                    Response(url='http://example.com'),
                    Response(url='http://example2.com', status=501, body=b'cf_store'),
                    Response(url='http://example3.com'),
                ],
            )

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                crawler = get_crawler(TestSpider, settings)
                crawler = get_crawler(_TestSpider, settings)

                yield self.runner.crawl(crawler)
                self.assertTrue(crawler.spider.success)
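
Note: the setup_mocked_handler helper introduced in this diff exists because newer Scrapy versions build download handlers through a from_crawler() classmethod, while older versions call the handler class directly. A condensed sketch of that dual wiring, under the same assumption, with TestDownloadHandler standing in for the test double defined in this suite:

from unittest.mock import patch

from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler

with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked:
    fake = TestDownloadHandler()  # test double defined in this test module
    if hasattr(HTTP11DownloadHandler, "from_crawler"):
        # Newer Scrapy: instances are requested via from_crawler(crawler).
        mocked.from_crawler.return_value = fake
    else:
        # Older Scrapy: the patched class itself is called to build one.
        mocked.return_value = fake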
1 remaining changed file not shown.