Make the errback handling method configurable #156

Merged (5 commits) on Feb 14, 2024

Changes from all commits
11 changes: 11 additions & 0 deletions docs/source/api.rst
@@ -517,6 +517,17 @@

Encoding that's used to encode log messages.

Default: ``utf-8``.

+DEFAULT_ERRBACK_NAME
+~~~~~~~~~~~~~~~~~~~~
+
+Default: ``"parse"``
+
+The name of the default errback_.
+
+Use an empty string or ``None`` to unset the errback altogether.
+
+.. _errback: https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing
+
Spider settings
---------------
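For illustration, a settings module along these lines could override the new default. The module name, spider method name, and project layout here are hypothetical; the `-S`/`--settings` flag is scrapyrt's existing way to point at a custom settings module:

```python
# myproject/scrapyrt_settings.py (hypothetical module), used via:
#   scrapyrt -S myproject.scrapyrt_settings

# Route failed requests to the spider's `handle_error` method by default,
# instead of reusing `parse` as the errback.
DEFAULT_ERRBACK_NAME = 'handle_error'

# Or attach no errback at all unless the API request names one explicitly:
# DEFAULT_ERRBACK_NAME = None
```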
4 changes: 3 additions & 1 deletion scrapyrt/conf/default_settings.py
@@ -31,4 +31,6 @@

# disable in production
DEBUG = True

-TWISTED_REACTOR = None
+TWISTED_REACTOR = None
+
+DEFAULT_ERRBACK_NAME = 'parse'
41 changes: 26 additions & 15 deletions scrapyrt/core.py
@@ -3,6 +3,7 @@
from copy import deepcopy
import datetime
import os
+import traceback

from scrapy import signals
from scrapy.crawler import CrawlerRunner, Crawler
@@ -109,6 +110,7 @@ def __init__(self, spider_name, request_kwargs,
        self.items = []
        self.items_dropped = []
        self.errors = []
+        self.user_error = None
        self.max_requests = int(max_requests) if max_requests else None
        self.timeout_limit = int(app_settings.TIMEOUT_LIMIT)
        self.request_count = 0
@@ -120,7 +122,7 @@ def __init__(self, spider_name, request_kwargs,
        # because we need to know if spider has method available
        self.callback_name = request_kwargs.pop('callback', None) or 'parse'
        # do the same for errback
-        self.errback_name = request_kwargs.pop('errback', None) or 'parse'
+        self.errback_name = request_kwargs.pop('errback', None) or app_settings.DEFAULT_ERRBACK_NAME

        if request_kwargs.get("url"):
            self.request = self.create_spider_request(deepcopy(request_kwargs))
@@ -171,20 +173,26 @@ def spider_idle(self, spider):

        """
        if spider is self.crawler.spider and self.request and not self._request_scheduled:
-            callback = getattr(self.crawler.spider, self.callback_name)
-            assert callable(callback), 'Invalid callback'
-            self.request = self.request.replace(callback=callback)
-
-            errback = getattr(self.crawler.spider, self.errback_name)
-            assert callable(errback), 'Invalid errback'
-            self.request = self.request.replace(errback=errback)
-            modify_request = getattr(
-                self.crawler.spider, "modify_realtime_request", None)
-            if callable(modify_request):
-                self.request = modify_request(self.request)
-            spider.crawler.engine.crawl(self.request)
-            self._request_scheduled = True
-            raise DontCloseSpider
+            try:
+                callback = getattr(self.crawler.spider, self.callback_name)
+                assert callable(callback), 'Invalid callback'
+                self.request = self.request.replace(callback=callback)
+
+                if self.errback_name:
+                    errback = getattr(self.crawler.spider, self.errback_name)
+                    assert callable(errback), 'Invalid errback'
+                    self.request = self.request.replace(errback=errback)
+                modify_request = getattr(
+                    self.crawler.spider, "modify_realtime_request", None)
+                if callable(modify_request):
+                    self.request = modify_request(self.request)
+                spider.crawler.engine.crawl(self.request)
+                self._request_scheduled = True
+            except Exception as e:
+                self.user_error = Error(400, message=traceback.format_exc())
+            else:
+                raise DontCloseSpider

    def handle_scheduling(self, request, spider):
        """Handler of request_scheduled signal.
@@ -238,6 +246,9 @@ def return_items(self, result):
            "stats": stats,
            "spider_name": self.spider_name,
        }
+
+        results["user_error"] = self.user_error
+
        if self.debug:
            results["errors"] = self.errors
        return results
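To make the new lookup concrete, here is a minimal sketch (spider and method names are hypothetical) of a spider whose errback `spider_idle` can now resolve by name, either from the `errback` request argument or from `DEFAULT_ERRBACK_NAME`:

```python
import scrapy


class MySpider(scrapy.Spider):
    name = 'my_spider'

    def parse(self, response):
        # Regular callback; resolved from callback_name, which defaults to 'parse'.
        yield {'url': response.url, 'title': response.css('title::text').get()}

    def handle_error(self, failure):
        # Errback resolved by name when the API request passes
        # errback=handle_error, or when DEFAULT_ERRBACK_NAME is 'handle_error'.
        # `failure` is a twisted.python.failure.Failure instance.
        self.logger.error('Request failed: %s', failure.value)
```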
3 changes: 3 additions & 0 deletions scrapyrt/resources.py
@@ -261,6 +261,9 @@ def run_crawl(self, spider_name, scrapy_request_args,

    def prepare_response(self, result, *args, **kwargs):
        items = result.get("items")
+        user_error = result.get("user_error", None)
+        if user_error:
+            raise user_error
        response = {
            "status": "ok",
            "items": items,
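The net effect is that a misnamed callback or errback now surfaces as an HTTP 400 carrying the traceback, instead of failing silently in the idle handler. A sketch of what a client might see, assuming a locally running scrapyrt on its default port 9080 and a spider without a `no_such_method` attribute (the exact error payload shape is illustrative):

```python
import requests

resp = requests.get(
    'http://localhost:9080/crawl.json',
    params={
        'spider_name': 'my_spider',   # hypothetical spider
        'url': 'http://example.com',
        'errback': 'no_such_method',  # not defined on the spider
    },
)
print(resp.status_code)  # 400, raised from prepare_response() above
print(resp.json())       # e.g. {"status": "error", "code": 400, "message": "Traceback ..."}
```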
15 changes: 10 additions & 5 deletions tests/test_crawl_manager.py
@@ -111,8 +111,10 @@ def test_spider_opened(self):

    def test_raise_error_if_not_callable(self):
        self.spider.parse_something = None
-        self.assertRaises(
-            AssertionError, self.crawl_manager.spider_idle, self.spider)
+        self._call_spider_idle()
+        self.assertIsNotNone(self.crawl_manager.user_error)
+        msg = "Invalid callback"
+        assert re.search(msg, self.crawl_manager.user_error.message)
        self.assertFalse(self.crawler.engine.crawl.called)

def test_modify_realtime_request(self):
@@ -142,15 +144,17 @@ def test_pass_wrong_spider_errback(self):
        mng = self.create_crawl_manager(
            {'url': 'http://localhost', 'errback': 'handle_error'}
        )

        try:
-            with pytest.raises(AttributeError) as err:
-                mng.spider_idle(self.spider)
+            mng.spider_idle(self.spider)
        except DontCloseSpider:
            pass

        assert mng.request.errback is None

+        self.assertIsNotNone(mng.user_error)
        msg = "has no attribute 'handle_error'"
-        assert re.search(msg, str(err))
+        assert re.search(msg, mng.user_error.message)

def test_pass_good_spider_errback(self):
mng = self.create_crawl_manager(
@@ -330,6 +334,7 @@ def setUp(self):
            'items_dropped': self.crawl_manager.items_dropped,
            'stats': self.stats.copy(),
            'spider_name': self.spider.name,
+            'user_error': None,
        }

def test_return_items(self):
11 changes: 11 additions & 0 deletions tests/test_resource_crawl.py
@@ -142,6 +142,17 @@ def test_prepare_response(self, resource):
        for key, value in expected:
            assert prepared_res[key] == value

+    def test_prepare_response_user_error_raised(self, resource):
+        result = {
+            'items': [1, 2],
+            'stats': [99],
+            'spider_name': 'test'
+        }
+        result['user_error'] = Exception("my exception")
+        with pytest.raises(Exception) as e_info:
+            resource.prepare_response(result)
+        assert str(e_info.value) == "my exception"


class TestCrawlResourceGetRequiredArgument(unittest.TestCase):
