diff --git a/ftest/crawlers/echojs.py b/ftest/crawlers/echojs.py
index 3575f23fac..ce8a88f2ab 100644
--- a/ftest/crawlers/echojs.py
+++ b/ftest/crawlers/echojs.py
@@ -33,6 +33,13 @@ def process(self, job, response):
         return
 
 
+class CrashingEchoJSSpider(Spider):
+    START_URL = "https://echojs.com/latest"
+
+    def process(self, job, response):
+        raise RuntimeError("crashed!")
+
+
 class EchoJSStartSpider(Spider):
     START_URL = "https://echojs.com/latest"
diff --git a/minet/cli/crawl/crawl.py b/minet/cli/crawl/crawl.py
index 3012311a01..0d47fd0527 100644
--- a/minet/cli/crawl/crawl.py
+++ b/minet/cli/crawl/crawl.py
@@ -28,7 +28,10 @@
     Spider,
     BasicSpider,
 )
-from minet.crawl.exceptions import CrawlerAlreadyFinishedError
+from minet.crawl.exceptions import (
+    CrawlerAlreadyFinishedError,
+    CrawlerSpiderProcessError,
+)
 from minet.cli.console import console
 from minet.cli.loading_bar import LoadingBar
 from minet.cli.utils import (
@@ -353,33 +356,52 @@ def callback(self: Crawler, result: SuccessfulCrawlResult) -> Optional[str]:
     track_crawler_state_with_loading_bar(loading_bar, crawler.state)
 
     # Running crawler
-    for result, result_path in crawler.crawl(callback=callback):
-        with loading_bar.step():
-            if cli_args.verbose:
-                console.print(result, highlight=True)
+    try:
+        for result, result_path in crawler.crawl(callback=callback):
+            with loading_bar.step():
+                if cli_args.verbose:
+                    console.print(result, highlight=True)
+
+                if result_callback is not None:
+                    result_callback(cli_args, loading_bar, result)
+
+                job_row = result.as_csv_row()
+
+                if cli_args.write_files:
+                    job_row += [result_path]
 
-            if result_callback is not None:
-                result_callback(cli_args, loading_bar, result)
+                if format_job_row_addendum is not None:
+                    job_row += format_job_row_addendum(result)
 
-            job_row = result.as_csv_row()
+                jobs_writer.writerow(job_row)
 
-            if cli_args.write_files:
-                job_row += [result_path]
+                # Flushing to avoid sync issues as well as possible
+                jobs_output.flush()
 
-            if format_job_row_addendum is not None:
-                job_row += format_job_row_addendum(result)
+                if result.error is not None:
+                    loading_bar.inc_stat(result.error_code, style="error")
+                    continue
 
-            jobs_writer.writerow(job_row)
+                if data_writer is not None:
+                    data_writer.write(result)
+
+    except CrawlerSpiderProcessError as error:
+        loading_bar.fail()
+        job = error.job
+
+        base_msg = (
+            "Default spider process errored"
+            if job.spider is None
+            else ('Spider "%s" process errored' % job.spider)
+        )
 
-            # Flushing to avoid sync issues as well as possible
-            jobs_output.flush()
+        base_msg += " with %s!" % error.reason.__class__.__name__
 
-            if result.error is not None:
-                loading_bar.inc_stat(result.error_code, style="error")
-                continue
+        console.print(base_msg, style="error")
+        console.print(job, highlight=True)
+        console.print(error.response, highlight=True)
 
-            if data_writer is not None:
-                data_writer.write(result)
+        raise error.reason
 
     action = crawl_action
diff --git a/minet/cli/loading_bar.py b/minet/cli/loading_bar.py
index 9902f3e46e..07153cc2f4 100644
--- a/minet/cli/loading_bar.py
+++ b/minet/cli/loading_bar.py
@@ -352,6 +352,15 @@ def cursor_up(self) -> None:
         # NOTE: cursor 1up
         console.file.write("\x1b[1A")
 
+    def set_style(self, style: str) -> None:
+        if self.bar_column is None:
+            return
+
+        self.bar_column.complete_style = style
+        self.bar_column.finished_style = style
+        self.bar_column.pulse_style = style
+        self.bar_column.style = style
+
     def start(self) -> None:
         self.live.start()
         self.live.refresh()
@@ -366,6 +375,10 @@ def stop(self, erase=False) -> None:
         self.live.stop()
         self.already_stopped = True
 
+    def fail(self) -> None:
+        self.set_style("error")
+        self.stop()
+
     def erase(self) -> None:
         self.stop(erase=True)
 
diff --git a/minet/crawl/crawler.py b/minet/crawl/crawler.py
index ced39699b4..ededcc13c5 100644
--- a/minet/crawl/crawler.py
+++ b/minet/crawl/crawler.py
@@ -49,7 +49,10 @@
     FunctionSpider,
     FunctionSpiderCallable,
 )
-from minet.crawl.exceptions import CrawlerAlreadyFinishedError
+from minet.crawl.exceptions import (
+    CrawlerAlreadyFinishedError,
+    CrawlerSpiderProcessError,
+)
 from minet.crawl.queue import CrawlerQueue, AnyParallelism, AnyThrottle
 from minet.crawl.state import CrawlerState
 from minet.crawl.url_cache import URLCache
@@ -215,7 +218,12 @@ def __call__(
         if cancel_event.is_set():
             return
 
-        spider_result = spider.process(job, response)
+        try:
+            spider_result = spider.process(job, response)
+        except Exception as reason:
+            raise CrawlerSpiderProcessError(
+                reason=reason, job=job, response=response
+            )
 
         if spider_result is not None:
             try:
diff --git a/minet/crawl/exceptions.py b/minet/crawl/exceptions.py
index cf70e9c6f6..ab8a7d6548 100644
--- a/minet/crawl/exceptions.py
+++ b/minet/crawl/exceptions.py
@@ -1,5 +1,11 @@
+from typing import TYPE_CHECKING
+
 from minet.exceptions import MinetError
 
+if TYPE_CHECKING:
+    from minet.crawl.types import CrawlJob
+    from minet.web import Response
+
 
 class CrawlerError(MinetError):
     pass
@@ -7,3 +13,11 @@ class CrawlerError(MinetError):
 
 class CrawlerAlreadyFinishedError(CrawlerError):
     pass
+
+
+class CrawlerSpiderProcessError(CrawlerError):
+    def __init__(self, reason: Exception, job: "CrawlJob", response: "Response"):
+        self.reason = reason
+        self.job = job
+        self.response = response
+        super().__init__(str(reason))
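
A minimal usage sketch (not part of the patch) of how the new CrawlerSpiderProcessError surfaces when driving a Crawler programmatically, mirroring what the CLI now does in crawl.py. The import paths, the Crawler constructor arguments and the shape of crawl()'s yielded values below are assumptions inferred from this diff, not verified against the library:

from minet.crawl import Crawler, Spider
from minet.crawl.exceptions import CrawlerSpiderProcessError


class CrashingSpider(Spider):
    START_URL = "https://echojs.com/latest"

    def process(self, job, response):
        # Any exception raised here is now wrapped by the crawler worker
        # into a CrawlerSpiderProcessError carrying the job and response.
        raise RuntimeError("crashed!")


try:
    # Assumed constructor signature; crawl() is assumed to yield bare
    # results when no callback is given.
    crawler = Crawler(CrashingSpider())
    for result in crawler.crawl():
        ...
except CrawlerSpiderProcessError as error:
    # The wrapper exposes the offending job and response for debugging,
    # while `reason` holds the original exception raised by process().
    print(error.job, error.response)
    raise error.reason

This follows the same pattern as the CLI: report the crash context (job and response) first, then re-raise the underlying reason so the user still sees the spider's real traceback.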