Providing more context on spider process errors
Fix #979
Yomguithereal committed Sep 5, 2024
1 parent 39c98a8 commit 5c102a4
Showing 5 changed files with 86 additions and 22 deletions.
7 changes: 7 additions & 0 deletions ftest/crawlers/echojs.py
@@ -33,6 +33,13 @@ def process(self, job, response):
        return


class CrashingEchoJSSpider(Spider):
    START_URL = "https://echojs.com/latest"

    def process(self, job, response):
        raise RuntimeError("crashed!")


class EchoJSStartSpider(Spider):
    START_URL = "https://echojs.com/latest"

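CrashingEchoJSSpider gives the functional suite a spider whose process method always raises, exercising the new error path end to end. Below is a hedged sketch of how the wrapped error could be consumed programmatically; the Crawler construction and iteration details are assumptions, not part of this diff — only the exception attributes are guaranteed by the commit.

    from minet.crawl import Crawler
    from minet.crawl.exceptions import CrawlerSpiderProcessError

    from ftest.crawlers.echojs import CrashingEchoJSSpider  # path as in this repo's ftest

    try:
        crawler = Crawler(CrashingEchoJSSpider())  # constructor signature assumed
        for result in crawler.crawl():
            ...
    except CrawlerSpiderProcessError as error:
        print(error.job)       # the CrawlJob whose processing crashed
        print(error.response)  # the response handed to spider.process
        raise error.reason     # the original RuntimeError("crashed!")
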
62 changes: 42 additions & 20 deletions minet/cli/crawl/crawl.py
@@ -28,7 +28,10 @@
     Spider,
     BasicSpider,
 )
-from minet.crawl.exceptions import CrawlerAlreadyFinishedError
+from minet.crawl.exceptions import (
+    CrawlerAlreadyFinishedError,
+    CrawlerSpiderProcessError,
+)
 from minet.cli.console import console
 from minet.cli.loading_bar import LoadingBar
 from minet.cli.utils import (
@@ -353,33 +356,52 @@ def callback(self: Crawler, result: SuccessfulCrawlResult) -> Optional[str]:
     track_crawler_state_with_loading_bar(loading_bar, crawler.state)

     # Running crawler
-    for result, result_path in crawler.crawl(callback=callback):
-        with loading_bar.step():
-            if cli_args.verbose:
-                console.print(result, highlight=True)
+    try:
+        for result, result_path in crawler.crawl(callback=callback):
+            with loading_bar.step():
+                if cli_args.verbose:
+                    console.print(result, highlight=True)

-            if result_callback is not None:
-                result_callback(cli_args, loading_bar, result)
+                if result_callback is not None:
+                    result_callback(cli_args, loading_bar, result)

-            job_row = result.as_csv_row()
+                job_row = result.as_csv_row()

-            if cli_args.write_files:
-                job_row += [result_path]
+                if cli_args.write_files:
+                    job_row += [result_path]

-            if format_job_row_addendum is not None:
-                job_row += format_job_row_addendum(result)
+                if format_job_row_addendum is not None:
+                    job_row += format_job_row_addendum(result)

-            jobs_writer.writerow(job_row)
+                jobs_writer.writerow(job_row)

-            # Flushing to avoid sync issues as well as possible
-            jobs_output.flush()
+                # Flushing to avoid sync issues as well as possible
+                jobs_output.flush()

-            if result.error is not None:
-                loading_bar.inc_stat(result.error_code, style="error")
-                continue
+                if result.error is not None:
+                    loading_bar.inc_stat(result.error_code, style="error")
+                    continue

-            if data_writer is not None:
-                data_writer.write(result)
+                if data_writer is not None:
+                    data_writer.write(result)
+
+    except CrawlerSpiderProcessError as error:
+        loading_bar.fail()
+        job = error.job
+
+        base_msg = (
+            "Default spider process errored"
+            if job.spider is None
+            else ('Spider "%s" process errored' % job.spider)
+        )
+
+        base_msg += " with %s!" % error.reason.__class__.__name__
+
+        console.print(base_msg, style="error")
+        console.print(job, highlight=True)
+        console.print(error.response, highlight=True)
+
+        raise error.reason


action = crawl_action
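On the CLI side, the new except branch marks the loading bar as failed, prints the offending job and response for context, then re-raises the original exception so the real traceback reaches the user. For illustration, here is the message logic extracted into a standalone helper; the helper name and the example spider name are hypothetical.

    from minet.crawl.exceptions import CrawlerSpiderProcessError

    def spider_error_message(error: CrawlerSpiderProcessError) -> str:
        # Mirrors the base_msg construction in the diff above.
        base_msg = (
            "Default spider process errored"
            if error.job.spider is None
            else 'Spider "%s" process errored' % error.job.spider
        )
        return base_msg + " with %s!" % error.reason.__class__.__name__

    # e.g. spider_error_message(error)
    # -> 'Spider "echojs" process errored with RuntimeError!'
    # (the spider name here is hypothetical)
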
13 changes: 13 additions & 0 deletions minet/cli/loading_bar.py
@@ -352,6 +352,15 @@ def cursor_up(self) -> None:
        # NOTE: cursor 1up
        console.file.write("\x1b[1A")

    def set_style(self, style: str) -> None:
        if self.bar_column is None:
            return

        self.bar_column.complete_style = style
        self.bar_column.finished_style = style
        self.bar_column.pulse_style = style
        self.bar_column.style = style

    def start(self) -> None:
        self.live.start()
        self.live.refresh()
@@ -366,6 +375,10 @@ def stop(self, erase=False) -> None:
        self.live.stop()
        self.already_stopped = True

    def fail(self) -> None:
        self.set_style("error")
        self.stop()

    def erase(self) -> None:
        self.stop(erase=True)

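The new fail() composes the two existing lifecycle primitives: repaint the progress bar with the error style, then stop the live display, so an aborted crawl looks distinct from a completed one. A minimal usage sketch follows; the constructor arguments and the unit of work are assumptions.

    from minet.cli.loading_bar import LoadingBar

    loading_bar = LoadingBar(title="Crawling")  # constructor args assumed

    loading_bar.start()
    try:
        run_crawl()  # hypothetical unit of work
    except Exception:
        loading_bar.fail()  # equivalent to set_style("error") + stop()
        raise
    else:
        loading_bar.stop()
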
12 changes: 10 additions & 2 deletions minet/crawl/crawler.py
@@ -49,7 +49,10 @@
     FunctionSpider,
     FunctionSpiderCallable,
 )
-from minet.crawl.exceptions import CrawlerAlreadyFinishedError
+from minet.crawl.exceptions import (
+    CrawlerAlreadyFinishedError,
+    CrawlerSpiderProcessError,
+)
 from minet.crawl.queue import CrawlerQueue, AnyParallelism, AnyThrottle
 from minet.crawl.state import CrawlerState
 from minet.crawl.url_cache import URLCache
@@ -215,7 +218,12 @@ def __call__(
         if cancel_event.is_set():
             return

-        spider_result = spider.process(job, response)
+        try:
+            spider_result = spider.process(job, response)
+        except Exception as reason:
+            raise CrawlerSpiderProcessError(
+                reason=reason, job=job, response=response
+            )

         if spider_result is not None:
             try:
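Because the wrap happens at the single point where every spider's process is invoked, custom spiders need no changes to benefit. A hypothetical spider is sketched below (import path assumed, not part of the diff) whose bug would now surface with full context:

    from minet.crawl import Spider

    class BuggySpider(Spider):
        START_URL = "https://example.com"  # hypothetical target

        def process(self, job, response):
            # Any exception raised here is caught by the crawler worker and
            # re-raised as CrawlerSpiderProcessError with job and response
            # attached; the ZeroDivisionError stays available as .reason.
            return 1 / 0
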
14 changes: 14 additions & 0 deletions minet/crawl/exceptions.py
@@ -1,9 +1,23 @@
from typing import TYPE_CHECKING

from minet.exceptions import MinetError

if TYPE_CHECKING:
    from minet.crawl.types import CrawlJob
    from minet.web import Response


class CrawlerError(MinetError):
    pass


class CrawlerAlreadyFinishedError(CrawlerError):
    pass


class CrawlerSpiderProcessError(CrawlerError):
    def __init__(self, reason: Exception, job: "CrawlJob", response: "Response"):
        self.reason = reason
        self.job = job
        self.response = response
        super().__init__(str(reason))
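
Since CrawlerSpiderProcessError extends CrawlerError (and thus MinetError), broad handlers already guarding crawls keep matching; the wrapper only adds context. A small demonstration, where the job and response values are hypothetical placeholders:

    err = CrawlerSpiderProcessError(
        reason=RuntimeError("crashed!"),
        job=job,            # hypothetical CrawlJob
        response=response,  # hypothetical Response
    )

    print(isinstance(err, CrawlerError))  # True: existing handlers still match
    print(str(err))                       # "crashed!", delegated to the reason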
