Providing more context on spider process errors
Fix #979
Yomguithereal committed Sep 5, 2024
1 parent 39c98a8 commit 5c102a4
Showing 5 changed files with 86 additions and 22 deletions.
7 changes: 7 additions & 0 deletions ftest/crawlers/echojs.py
@@ -33,6 +33,13 @@ def process(self, job, response):
        return


class CrashingEchoJSSpider(Spider):
    START_URL = "https://echojs.com/latest"

    def process(self, job, response):
        raise RuntimeError("crashed!")


class EchoJSStartSpider(Spider):
    START_URL = "https://echojs.com/latest"

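CrashingEchoJSSpider gives the functional suite a spider whose process method always raises, exercising the new error path end to end. Below is a hedged sketch of how the wrapped error could be consumed programmatically; the Crawler construction and iteration details are assumptions, not part of this diff — only the exception attributes are guaranteed by the commit.

    from minet.crawl import Crawler
    from minet.crawl.exceptions import CrawlerSpiderProcessError

    from ftest.crawlers.echojs import CrashingEchoJSSpider  # path as in this repo's ftest

    try:
        crawler = Crawler(CrashingEchoJSSpider())  # constructor signature assumed
        for result in crawler.crawl():
            ...
    except CrawlerSpiderProcessError as error:
        print(error.job)       # the CrawlJob whose processing crashed
        print(error.response)  # the response handed to spider.process
        raise error.reason     # the original RuntimeError("crashed!")
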
62 changes: 42 additions & 20 deletions minet/cli/crawl/crawl.py
@@ -28,7 +28,10 @@
     Spider,
     BasicSpider,
 )
-from minet.crawl.exceptions import CrawlerAlreadyFinishedError
+from minet.crawl.exceptions import (
+    CrawlerAlreadyFinishedError,
+    CrawlerSpiderProcessError,
+)
 from minet.cli.console import console
 from minet.cli.loading_bar import LoadingBar
 from minet.cli.utils import (
@@ -353,33 +356,52 @@ def callback(self: Crawler, result: SuccessfulCrawlResult) -> Optional[str]:
     track_crawler_state_with_loading_bar(loading_bar, crawler.state)

     # Running crawler
-    for result, result_path in crawler.crawl(callback=callback):
-        with loading_bar.step():
-            if cli_args.verbose:
-                console.print(result, highlight=True)
+    try:
+        for result, result_path in crawler.crawl(callback=callback):
+            with loading_bar.step():
+                if cli_args.verbose:
+                    console.print(result, highlight=True)

-            if result_callback is not None:
-                result_callback(cli_args, loading_bar, result)
+                if result_callback is not None:
+                    result_callback(cli_args, loading_bar, result)

-            job_row = result.as_csv_row()
+                job_row = result.as_csv_row()

-            if cli_args.write_files:
-                job_row += [result_path]
+                if cli_args.write_files:
+                    job_row += [result_path]

-            if format_job_row_addendum is not None:
-                job_row += format_job_row_addendum(result)
+                if format_job_row_addendum is not None:
+                    job_row += format_job_row_addendum(result)

-            jobs_writer.writerow(job_row)
+                jobs_writer.writerow(job_row)

-            # Flushing to avoid sync issues as well as possible
-            jobs_output.flush()
+                # Flushing to avoid sync issues as well as possible
+                jobs_output.flush()

-            if result.error is not None:
-                loading_bar.inc_stat(result.error_code, style="error")
-                continue
+                if result.error is not None:
+                    loading_bar.inc_stat(result.error_code, style="error")
+                    continue

-            if data_writer is not None:
-                data_writer.write(result)
+                if data_writer is not None:
+                    data_writer.write(result)
+
+    except CrawlerSpiderProcessError as error:
+        loading_bar.fail()
+        job = error.job
+
+        base_msg = (
+            "Default spider process errored"
+            if job.spider is None
+            else ('Spider "%s" process errored' % job.spider)
+        )
+
+        base_msg += " with %s!" % error.reason.__class__.__name__
+
+        console.print(base_msg, style="error")
+        console.print(job, highlight=True)
+        console.print(error.response, highlight=True)
+
+        raise error.reason


action = crawl_action
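On the CLI side, the new except branch marks the loading bar as failed, prints the offending job and response for context, then re-raises the original exception so the real traceback reaches the user. For illustration, here is the message logic extracted into a standalone helper; the helper name and the example spider name are hypothetical.

    from minet.crawl.exceptions import CrawlerSpiderProcessError

    def spider_error_message(error: CrawlerSpiderProcessError) -> str:
        # Mirrors the base_msg construction in the diff above.
        base_msg = (
            "Default spider process errored"
            if error.job.spider is None
            else 'Spider "%s" process errored' % error.job.spider
        )
        return base_msg + " with %s!" % error.reason.__class__.__name__

    # e.g. spider_error_message(error)
    # -> 'Spider "echojs" process errored with RuntimeError!'
    # (the spider name here is hypothetical)
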
13 changes: 13 additions & 0 deletions minet/cli/loading_bar.py
@@ -352,6 +352,15 @@ def cursor_up(self) -> None:
        # NOTE: cursor 1up
        console.file.write("\x1b[1A")

    def set_style(self, style: str) -> None:
        if self.bar_column is None:
            return

        self.bar_column.complete_style = style
        self.bar_column.finished_style = style
        self.bar_column.pulse_style = style
        self.bar_column.style = style

    def start(self) -> None:
        self.live.start()
        self.live.refresh()
@@ -366,6 +375,10 @@ def stop(self, erase=False) -> None:
        self.live.stop()
        self.already_stopped = True

    def fail(self) -> None:
        self.set_style("error")
        self.stop()

    def erase(self) -> None:
        self.stop(erase=True)

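The new fail() composes the two existing lifecycle primitives: repaint the progress bar with the error style, then stop the live display, so an aborted crawl looks distinct from a completed one. A minimal usage sketch follows; the constructor arguments and the unit of work are assumptions.

    from minet.cli.loading_bar import LoadingBar

    loading_bar = LoadingBar(title="Crawling")  # constructor args assumed

    loading_bar.start()
    try:
        run_crawl()  # hypothetical unit of work
    except Exception:
        loading_bar.fail()  # equivalent to set_style("error") + stop()
        raise
    else:
        loading_bar.stop()
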
12 changes: 10 additions & 2 deletions minet/crawl/crawler.py
@@ -49,7 +49,10 @@
     FunctionSpider,
     FunctionSpiderCallable,
 )
-from minet.crawl.exceptions import CrawlerAlreadyFinishedError
+from minet.crawl.exceptions import (
+    CrawlerAlreadyFinishedError,
+    CrawlerSpiderProcessError,
+)
 from minet.crawl.queue import CrawlerQueue, AnyParallelism, AnyThrottle
 from minet.crawl.state import CrawlerState
 from minet.crawl.url_cache import URLCache
@@ -215,7 +218,12 @@ def __call__(
         if cancel_event.is_set():
             return

-        spider_result = spider.process(job, response)
+        try:
+            spider_result = spider.process(job, response)
+        except Exception as reason:
+            raise CrawlerSpiderProcessError(
+                reason=reason, job=job, response=response
+            )

         if spider_result is not None:
             try:
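Because the wrap happens at the single point where every spider's process is invoked, custom spiders need no changes to benefit. A hypothetical spider is sketched below (import path assumed, not part of the diff) whose bug would now surface with full context:

    from minet.crawl import Spider

    class BuggySpider(Spider):
        START_URL = "https://example.com"  # hypothetical target

        def process(self, job, response):
            # Any exception raised here is caught by the crawler worker and
            # re-raised as CrawlerSpiderProcessError with job and response
            # attached; the ZeroDivisionError stays available as .reason.
            return 1 / 0
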
14 changes: 14 additions & 0 deletions minet/crawl/exceptions.py
@@ -1,9 +1,23 @@
from typing import TYPE_CHECKING

from minet.exceptions import MinetError

if TYPE_CHECKING:
    from minet.crawl.types import CrawlJob
    from minet.web import Response


class CrawlerError(MinetError):
    pass


class CrawlerAlreadyFinishedError(CrawlerError):
    pass


class CrawlerSpiderProcessError(CrawlerError):
    def __init__(self, reason: Exception, job: "CrawlJob", response: "Response"):
        self.reason = reason
        self.job = job
        self.response = response
        super().__init__(str(reason))
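
Since CrawlerSpiderProcessError extends CrawlerError (and thus MinetError), broad handlers already guarding crawls keep matching; the wrapper only adds context. A small demonstration, where the job and response values are hypothetical placeholders:

    err = CrawlerSpiderProcessError(
        reason=RuntimeError("crashed!"),
        job=job,            # hypothetical CrawlJob
        response=response,  # hypothetical Response
    )

    print(isinstance(err, CrawlerError))  # True: existing handlers still match
    print(str(err))                       # "crashed!", delegated to the reason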
