diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..256b758
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,33 @@
+name: Tests
+on: [push, pull_request]
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - python-version: "3.8"
+            env:
+              TOXENV: pinned
+          - python-version: "3.8"
+            env:
+              TOXENV: py
+          - python-version: "3.9"
+            env:
+              TOXENV: py
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Run tests
+        env: ${{ matrix.env }}
+        run: |
+          pip install -U tox
+          tox
diff --git a/pyproject.toml b/pyproject.toml
index 55ec8d7..9392304 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,7 @@
 [tool.black]
 line-length = 120
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "ignore::DeprecationWarning:.*\\bfrontera\\b",
+]
diff --git a/scrapy_frontera/converters.py b/scrapy_frontera/converters.py
index 6bc0e87..a493550 100644
--- a/scrapy_frontera/converters.py
+++ b/scrapy_frontera/converters.py
@@ -4,7 +4,6 @@
 
 from scrapy.http.request import Request as ScrapyRequest
 from scrapy.http.response import Response as ScrapyResponse
-from scrapy.utils.request import request_fingerprint
 
 from w3lib.util import to_bytes, to_native_str
 
@@ -23,6 +22,12 @@ class RequestConverter(BaseRequestConverter):
 
     def __init__(self, spider):
        self.spider = spider
+        crawler = spider.crawler
+        if hasattr(crawler, "request_fingerprinter"):
+            self.request_fingerprint = crawler.request_fingerprinter.fingerprint
+        else:
+            from scrapy.utils.request import request_fingerprint
+            self.request_fingerprint = request_fingerprint
 
     def to_frontier(self, scrapy_request):
         """request: Scrapy > Frontier"""
@@ -56,7 +61,7 @@ def to_frontier(self, scrapy_request):
             fake_url = fingerprint_scrapy_request.url + str(uuid.uuid4())
             fingerprint_scrapy_request = fingerprint_scrapy_request.replace(url=fake_url)
         meta[b"frontier_fingerprint"] = scrapy_request.meta.get(
-            "frontier_fingerprint", request_fingerprint(fingerprint_scrapy_request)
+            "frontier_fingerprint", self.request_fingerprint(fingerprint_scrapy_request)
         )
         callback_slot_prefix_map = self.spider.crawler.settings.getdict("FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP")
         frontier_slot_prefix_num_slots = callback_slot_prefix_map.get(get_callback_name(scrapy_request))
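A note on the converters.py hunks above: Scrapy 2.7 introduced a pluggable request fingerprinter, exposed as crawler.request_fingerprinter, and deprecated the module-level scrapy.utils.request.request_fingerprint() helper. The __init__ change resolves whichever API is available once, at converter construction time. Here is a minimal standalone sketch of the same fallback, assuming a crawler object and a request to fingerprint; note the two APIs differ in return type (the new component's fingerprint() returns bytes, the legacy helper a hex string):

    def resolve_fingerprint_func(crawler):
        # Scrapy >= 2.7 exposes a pluggable fingerprinter on the crawler
        if hasattr(crawler, "request_fingerprinter"):
            return crawler.request_fingerprinter.fingerprint  # returns bytes
        # older Scrapy only ships the module-level helper (returns a hex str)
        from scrapy.utils.request import request_fingerprint
        return request_fingerprint

    fingerprint = resolve_fingerprint_func(crawler)(request)

The REQUEST_FINGERPRINTER_IMPLEMENTATION: '2.7' entry added to the test settings further down is related: it presumably silences the warning Scrapy 2.7+ emits about the deprecated default ('2.6') implementation, while older Scrapy versions simply ignore the unknown setting.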
diff --git a/setup.py b/setup.py
index d344cb9..9115f52 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
     packages = find_packages(),
     install_requires=(
         'frontera==0.7.1',
-        'scrapy',
+        'scrapy>=1.7.0',
     ),
     classifiers = [
         'Development Status :: 5 - Production/Stable',
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index a7092e7..603f2e3 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -4,6 +4,7 @@
 from twisted.internet import defer
 
 from scrapy import Request, Spider
+from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
 from scrapy.http import Response
 from scrapy.settings import Settings
 from scrapy.utils.test import get_crawler
@@ -16,13 +17,14 @@
     'DOWNLOADER_MIDDLEWARES': {
         'scrapy_frontera.middlewares.SchedulerDownloaderMiddleware': 0,
     },
+    'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
     'SPIDER_MIDDLEWARES': {
         'scrapy_frontera.middlewares.SchedulerSpiderMiddleware': 0,
     },
 }
 
 
-class TestSpider(Spider):
+class _TestSpider(Spider):
     name = 'test'
     success = False
     success2 = False
@@ -55,7 +57,7 @@ def parse3(self, response):
         self.success3 = True
 
 
-class TestSpider2(Spider):
+class _TestSpider2(Spider):
     name = 'test'
     success = False
     success2 = False
@@ -71,7 +73,7 @@ def parse2(self, response):
         self.success2 = True
 
 
-class TestSpider3(Spider):
+class _TestSpider3(Spider):
     name = 'test'
     success = 0
 
@@ -108,18 +110,33 @@ def tearDown(self):
         while TestDownloadHandler.results:
             TestDownloadHandler.results.pop()
 
+    @staticmethod
+    def setup_mocked_handler(mocked_handler, results=None):
+        handler = TestDownloadHandler()
+        if results:
+            handler.set_results(results)
+        if hasattr(HTTP11DownloadHandler, "from_crawler"):
+            mocked_handler.from_crawler.return_value = handler
+        else:
+            mocked_handler.return_value = handler
+
     @defer.inlineCallbacks
     def test_start_requests(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
-            mocked_handler.return_value.set_results([Response(url='http://example.com'),
-                                                     Response(url='http://example2.com')])
+            self.setup_mocked_handler(
+                mocked_handler,
+                [
+                    Response(url='http://example.com'),
+                    Response(url='http://example2.com'),
+                ],
+            )
 
             with patch('frontera.contrib.backends.memory.MemoryBaseBackend.links_extracted') as mocked_links_extracted:
                 mocked_links_extracted.return_value = None
                 settings = Settings()
                 settings.setdict(TEST_SETTINGS, priority='cmdline')
-                crawler = get_crawler(TestSpider, settings)
+                crawler = get_crawler(_TestSpider, settings)
                 yield self.runner.crawl(crawler)
 
         self.assertTrue(crawler.spider.success)
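Two things are worth noting in the test hunks above. The spider classes gain a leading underscore, presumably so pytest stops trying to collect classes whose names start with Test (the new [tool.pytest.ini_options] table in pyproject.toml points the same way). And the setup_mocked_handler() helper exists because newer Scrapy versions build download handlers through a from_crawler() classmethod when the class defines one, while older versions instantiate the class directly, so the mock has to be primed on whichever path the running Scrapy takes. A rough, simplified sketch of that resolution order, in the spirit of scrapy.utils.misc.create_instance (not the actual Scrapy source):

    def build_component(objcls, settings, crawler=None):
        if crawler is not None and hasattr(objcls, "from_crawler"):
            return objcls.from_crawler(crawler)    # path taken by newer Scrapy
        if hasattr(objcls, "from_settings"):
            return objcls.from_settings(settings)  # legacy alternative
        return objcls(settings)                    # direct instantiation

Because the test module imports the real HTTP11DownloadHandler before patching, hasattr(HTTP11DownloadHandler, "from_crawler") inspects the genuine class and tells the helper whether to stub mocked_handler.from_crawler.return_value or mocked_handler.return_value.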
@@ -129,14 +146,18 @@ def test_start_requests(self):
     @defer.inlineCallbacks
     def test_cf_store(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
-            mocked_handler.return_value.set_results([Response(url='http://example.com', body=b'cf_store')])
+            self.setup_mocked_handler(
+                mocked_handler,
+                [
+                    Response(url='http://example.com', body=b'cf_store'),
+                ],
+            )
 
             with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                 mocked_schedule.return_value = None
                 settings = Settings()
                 settings.setdict(TEST_SETTINGS, priority='cmdline')
-                crawler = get_crawler(TestSpider, settings)
+                crawler = get_crawler(_TestSpider, settings)
                 yield self.runner.crawl(crawler)
 
         self.assertTrue(crawler.spider.success)
@@ -145,8 +166,12 @@ def test_cf_store(self):
     @defer.inlineCallbacks
     def test_callback_requests_to_frontier(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
-            mocked_handler.return_value.set_results([Response(url='http://example.com')])
+            self.setup_mocked_handler(
+                mocked_handler,
+                [
+                    Response(url='http://example.com'),
+                ],
+            )
 
             with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                 mocked_schedule.return_value = None
@@ -155,7 +180,7 @@ def test_callback_requests_to_frontier(self):
                 settings.setdict({
                     'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse2'],
                 })
-                crawler = get_crawler(TestSpider2, settings)
+                crawler = get_crawler(_TestSpider2, settings)
                 yield self.runner.crawl(crawler)
 
         self.assertTrue(crawler.spider.success)
@@ -165,9 +190,13 @@ def test_callback_requests_to_frontier(self):
     @defer.inlineCallbacks
     def test_callback_requests_to_frontier_with_implicit_callback(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
-            mocked_handler.return_value.set_results([Response(url='http://example.com'),
-                                                     Response(url='http://example2.com')])
+            self.setup_mocked_handler(
+                mocked_handler,
+                [
+                    Response(url='http://example.com'),
+                    Response(url='http://example2.com'),
+                ],
+            )
 
             with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                 mocked_schedule.return_value = None
@@ -176,7 +205,7 @@ def test_callback_requests_to_frontier_with_implicit_callback(self):
                 settings.setdict({
                     'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                 })
-                crawler = get_crawler(TestSpider3, settings)
+                crawler = get_crawler(_TestSpider3, settings)
                 yield self.runner.crawl(crawler)
 
         self.assertEqual(crawler.spider.success, 1)
@@ -185,10 +214,9 @@ def test_callback_requests_to_frontier_with_implicit_callback(self):
     @defer.inlineCallbacks
     def test_callback_requests_slot_map(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
             resp1 = Response(url='http://example.com')
             resp2 = Response(url='http://example2.com')
-            mocked_handler.return_value.set_results([resp1, resp2])
+            self.setup_mocked_handler(mocked_handler, [resp1, resp2])
 
             with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                 mocked_schedule.return_value = None
@@ -198,7 +226,7 @@ def test_callback_requests_slot_map(self):
                     'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                     'FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP': {'parse': 'myslot'},
                 })
-                crawler = get_crawler(TestSpider3, settings)
+                crawler = get_crawler(_TestSpider3, settings)
                 yield self.runner.crawl(crawler)
 
         self.assertEqual(crawler.spider.success, 1)
@@ -210,10 +238,9 @@ def test_callback_requests_slot_map(self):
     @defer.inlineCallbacks
     def test_callback_requests_slot_map_with_num_slots(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
             resp1 = Response(url='http://example.com')
             resp2 = Response(url='http://example2.com')
-            mocked_handler.return_value.set_results([resp1, resp2])
+            self.setup_mocked_handler(mocked_handler, [resp1, resp2])
 
             with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                 mocked_schedule.return_value = None
@@ -223,7 +250,7 @@ def test_callback_requests_slot_map_with_num_slots(self):
                     'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                     'FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP': {'parse': 'myslot/5'},
                 })
-                crawler = get_crawler(TestSpider3, settings)
+                crawler = get_crawler(_TestSpider3, settings)
                 yield self.runner.crawl(crawler)
 
         self.assertEqual(crawler.spider.success, 1)
@@ -236,16 +263,20 @@ def test_callback_requests_slot_map_with_num_slots(self):
     @defer.inlineCallbacks
     def test_start_requests_to_frontier(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
-            mocked_handler.return_value.set_results([Response(url='http://example.com'),
-                                                     Response(url='http://example2.com')])
+            self.setup_mocked_handler(
+                mocked_handler,
+                [
+                    Response(url='http://example.com'),
+                    Response(url='http://example2.com'),
+                ],
+            )
 
             settings = Settings()
             settings.setdict(TEST_SETTINGS, priority='cmdline')
             settings.setdict({
                 'FRONTERA_SCHEDULER_START_REQUESTS_TO_FRONTIER': True,
             })
-            crawler = get_crawler(TestSpider, settings)
+            crawler = get_crawler(_TestSpider, settings)
             yield self.runner.crawl(crawler)
 
         self.assertTrue(crawler.spider.success)
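About the 'myslot/5' value exercised above: a FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP entry is either a plain slot prefix or a prefix/N form that also fixes a number of slots. Purely as an illustration of the idea, not the package's actual code, such a value can be expanded into one of N concrete slots per request, for instance by hashing the request fingerprint (names and slot format below are hypothetical):

    import hashlib

    def assign_slot(map_value, fingerprint):
        # split 'myslot/5' into prefix 'myslot' and slot count '5'
        prefix, _, num = map_value.partition("/")
        if not num:
            return prefix  # plain prefix: a single fixed slot
        # hash the fingerprint into one of N buckets (illustrative scheme)
        bucket = int(hashlib.sha1(fingerprint).hexdigest(), 16) % int(num)
        return "%s%d" % (prefix, bucket)

    assign_slot("myslot/5", b"some-request-fingerprint")  # e.g. 'myslot3'

Fanning one callback's requests out over several downloader slots raises the effective concurrency for that callback, since Scrapy throttles per slot.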
@@ -254,7 +285,7 @@ def test_start_requests_to_frontier(self):
     @defer.inlineCallbacks
     def test_start_requests_to_frontier_ii(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
+            self.setup_mocked_handler(mocked_handler)
 
             with patch('frontera.contrib.backends.memory.MemoryBaseBackend.add_seeds') as mocked_add_seeds:
                 mocked_add_seeds.return_value = None
@@ -264,7 +295,7 @@ def test_start_requests_to_frontier_ii(self):
                     'FRONTERA_SCHEDULER_START_REQUESTS_TO_FRONTIER': True,
                 })
 
-                crawler = get_crawler(TestSpider, settings)
+                crawler = get_crawler(_TestSpider, settings)
                 yield self.runner.crawl(crawler)
 
         self.assertEqual(mocked_add_seeds.call_count, 1)
@@ -272,14 +303,18 @@ def test_start_requests_to_frontier_ii(self):
     @defer.inlineCallbacks
     def test_start_handle_errback(self):
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
-            mocked_handler.return_value.set_results([Response(url='http://example.com'),
-                                                     Response(url='http://example2.com', status=501),
-                                                     Response(url='http://example3.com')])
+            self.setup_mocked_handler(
+                mocked_handler,
+                [
+                    Response(url='http://example.com'),
+                    Response(url='http://example2.com', status=501),
+                    Response(url='http://example3.com'),
+                ],
+            )
 
             settings = Settings()
             settings.setdict(TEST_SETTINGS, priority='cmdline')
-            crawler = get_crawler(TestSpider, settings)
+            crawler = get_crawler(_TestSpider, settings)
             yield self.runner.crawl(crawler)
 
         self.assertTrue(crawler.spider.success)
@@ -293,14 +328,18 @@ def test_start_handle_errback_with_cf_store(self):
         """
         Test that we get the expected result with errback cf_store
         """
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
-            mocked_handler.return_value.set_results([Response(url='http://example.com'),
-                                                     Response(url='http://example2.com', status=501, body=b'cf_store'),
-                                                     Response(url='http://example3.com')])
+            self.setup_mocked_handler(
+                mocked_handler,
+                [
+                    Response(url='http://example.com'),
+                    Response(url='http://example2.com', status=501, body=b'cf_store'),
+                    Response(url='http://example3.com'),
+                ],
+            )
 
             settings = Settings()
             settings.setdict(TEST_SETTINGS, priority='cmdline')
-            crawler = get_crawler(TestSpider, settings)
+            crawler = get_crawler(_TestSpider, settings)
             yield self.runner.crawl(crawler)
 
         self.assertTrue(crawler.spider.success)
@@ -314,16 +353,20 @@ def test_start_handle_errback_with_cf_store_ii(self):
         """
         Test that we scheduled cf_store request on backend queue
         """
         with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
-            mocked_handler.return_value = TestDownloadHandler()
-            mocked_handler.return_value.set_results([Response(url='http://example.com'),
-                                                     Response(url='http://example2.com', status=501, body=b'cf_store'),
-                                                     Response(url='http://example3.com')])
+            self.setup_mocked_handler(
+                mocked_handler,
+                [
+                    Response(url='http://example.com'),
+                    Response(url='http://example2.com', status=501, body=b'cf_store'),
+                    Response(url='http://example3.com'),
+                ],
+            )
 
             with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                 mocked_schedule.return_value = None
                 settings = Settings()
                 settings.setdict(TEST_SETTINGS, priority='cmdline')
-                crawler = get_crawler(TestSpider, settings)
+                crawler = get_crawler(_TestSpider, settings)
                 yield self.runner.crawl(crawler)
 
         self.assertTrue(crawler.spider.success)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..471b7dd
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,18 @@
+[tox]
+envlist = pinned,py38,py39
+
+[testenv]
+deps =
+    pytest
+commands =
+    pytest {posargs:tests}
+
+[testenv:pinned]
+basepython = python3.8
+deps =
+    {[testenv]deps}
+    cryptography==35.0.0
+    pyOpenSSL==21.0.0
+    scrapy==1.7.0
+    twisted==19.2.0
+    zope.interface==4.7.0
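With this configuration, tox -e pinned exercises the oldest supported stack on Python 3.8 (Scrapy 1.7.0 plus contemporaneous pins of its TLS-related dependencies), matching the TOXENV: pinned job in the workflow above, while TOXENV: py runs the suite against the current interpreter with the newest available dependencies. Running plain tox locally walks the whole envlist.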