diff --git a/README.md b/README.md
index 954c2f8..feb4d09 100644
--- a/README.md
+++ b/README.md
@@ -26,13 +26,13 @@ For connecting to and using the FOLIO APIs. All properties are **required**.
 | tenant_id | FOLIO Tenant ID | Y |
 | username | FOLIO username | Y |
 | password | FOLIO password | Y |
-| strategy | Name of the strategy to use. See [Folio Stratgy](#folio-strategy) below. | Y |
+| strategy | Name of the strategy to use. See [Folio Strategy](#folio-strategy) below. | Y |
 | query_limit | Number of records requested in each FOLIO API call. Note: for SrsInstanceIdsStrategy, this must be approximately 30 or lower so that the maximum query string length is not exceeded. | Y |
 | batch_limit | Number of records tested for each output file. Must be equal to or a multiple of query_limit. The actual file will contain only those records which had bad URLs. | Y |
 
 ### WebTester Section
 
-For testing each URL.
+For loading and examining the response to each URL.
 
 | Property | Description | Required |
 |----------|-------------|---------|
@@ -42,6 +42,17 @@ For testing each URL.
 | allow_list | Comma-separated list of strings. If present, only URLs including one of these strings will be tested. | N |
 | block_list | Comma-separated list of strings. If present, URLs that include one of these strings will be skipped. `block_list` is ignored if `allow_list` is present. | N |
 
+### UrlParser Section
+
+Configuration for string pattern tests that examine the URL itself.
+
+| Property | Description | Required |
+|----------|-------------|---------|
+| proxy_prefix | A proxy prefix string, used by the two `proxy_report*` parameters. | N |
+| proxy_prefix_common_part | A brief substring at the end of `proxy_prefix` that is also expected at the end of any incorrect prefix. Defaults to `?url=`. | N |
+| proxy_report_no_prefix | If True, the CSV will include a column flagging any URL that does not start with `proxy_prefix`. | N |
+| proxy_report_wrong_prefix | If True, the CSV will include a column flagging any URL that includes `proxy_prefix_common_part` but does not start with the full `proxy_prefix`. | N |
+
 ### Logging Section
 
 | Property | Description | Required |
diff --git a/example.properties b/example.properties
index c782e31..48a5b2b 100644
--- a/example.properties
+++ b/example.properties
@@ -15,5 +15,11 @@ request_timeout = 10
 # allow_list = abc.com, def.com
 # block_list = ghi.com, jkl.com
 
+[UrlParser]
+# proxy_prefix = https://proxy.lib.leafygreenuniversity.edu/login?url=
+# proxy_prefix_common_part = ?url=
+# proxy_report_no_prefix = True
+# proxy_report_wrong_prefix = True
+
 [Logging]
 log_file = folio_bad_urls.log
diff --git a/folio_bad_urls/data.py b/folio_bad_urls/data.py
index df5815c..82479ec 100644
--- a/folio_bad_urls/data.py
+++ b/folio_bad_urls/data.py
@@ -10,18 +10,21 @@ def __repr__(self):
 
 class TestResult:
     """ The result of testing a URL from a record.
""" - def __init__(self, instance_hrid, url, status_code, permanent_redirect=None): + def __init__(self, instance_hrid, url, status_code, permanent_redirect=None, + parser_result=None): + self.instance_hrid = instance_hrid self.url = url self.status_code = status_code self.permanent_redirect = permanent_redirect - - def is_insecure_url(self): - return not self.url.startswith("https:") + self._parser_result = parser_result def is_bad_url(self): return self.status_code != 200 + def parser_result(self): + return self._parser_result + def __repr__(self): return str(self.__dict__) @@ -29,3 +32,20 @@ class LocalStatusCode: CONNECTION_FAILED = 0 ROBOTS_TXT_BLOCKS_URL = -10 ROBOTS_TXT_TIMEOUT_EXCESSIVE = -11 + +class ParserResult: + """ The result of string scanning of a URL from a record. """ + + def __init__(self, *, insecure_url, no_proxy_prefix, wrong_proxy_prefix): + self._insecure_url = insecure_url + self._no_proxy_prefix = no_proxy_prefix + self._wrong_proxy_prefix = wrong_proxy_prefix + + def is_insecure_url(self): + return self._insecure_url + + def has_no_proxy_prefix(self): + return self._no_proxy_prefix + + def has_wrong_proxy_prefix(self): + return self._wrong_proxy_prefix diff --git a/folio_bad_urls/reporter.py b/folio_bad_urls/reporter.py index 071c776..4319f97 100644 --- a/folio_bad_urls/reporter.py +++ b/folio_bad_urls/reporter.py @@ -3,20 +3,29 @@ log = logging.getLogger(__name__) log.setLevel(logging.INFO) -HEADER = "instance_hrid, url, status_code, insecure_url, permanent_redirect\n" - class Reporter: """ Save bad URLs to a file. """ + STANDARD_HEADER_FIELDS = [ + "instance_hrid", + "url", + "status_code", + "permanent_redirect", + "insecure_url", + ] + def __init__(self, config): self._config = config log.addHandler(self._config.log_file_handler) + self._PROXY_REPORT_NO_PREFIX = bool(config.get('UrlParser', 'proxy_report_no_prefix', fallback=False)) + self._PROXY_REPORT_WRONG_PREFIX = bool(config.get('UrlParser', 'proxy_report_wrong_prefix', fallback=False)) + def write_results(self, offset, results): filename = f'result_{str(offset)}.csv' bad_urls = 0 with open(filename, 'w') as file: - file.write(HEADER) + file.write(self._format_header()) for result in results: if result.is_bad_url(): bad_urls += 1 @@ -25,10 +34,27 @@ def write_results(self, offset, results): log.info(f"Wrote file with {bad_urls} bad URLs.") return bad_urls + def _format_header(self): + header_fields = Reporter.STANDARD_HEADER_FIELDS + if self._PROXY_REPORT_NO_PREFIX: + header_fields.append("proxy_no_prefix") + if self._PROXY_REPORT_WRONG_PREFIX: + header_fields.append("proxy_wrong_prefix") + return ", ".join(header_fields) + "\n" + def _format_result(self, result): - return f"{result.instance_hrid}, \ + result_string = f"{result.instance_hrid}, \ {result.url}, \ {result.status_code}, \ - {result.is_insecure_url() if result.is_insecure_url() else ''}, \ - {result.permanent_redirect if result.permanent_redirect else ''} \ - \n" + {result.permanent_redirect if result.permanent_redirect else ''}, \ + {self._format_bool(result.parser_result().is_insecure_url())} \ + " + if self._PROXY_REPORT_NO_PREFIX: + result_string += self._format_bool(result.parser_result().has_no_proxy_prefix()) + if self._PROXY_REPORT_WRONG_PREFIX: + result_string += self._format_bool(result.parser_result().has_wrong_proxy_prefix()) + result_string += "\n" + return result_string + + def _format_bool(self, value): + return 'Y,' if value else ',' diff --git a/folio_bad_urls/url_parser.py b/folio_bad_urls/url_parser.py new file mode 100644 
index 0000000..6833725
--- /dev/null
+++ b/folio_bad_urls/url_parser.py
@@ -0,0 +1,44 @@
+import logging
+
+from folio_bad_urls.data import ParserResult
+
+log = logging.getLogger(__name__)
+log.setLevel(logging.INFO)
+# log.setLevel(logging.DEBUG)
+
+class UrlParser:
+    """ Parse URLs for lexical result information. """
+
+    def __init__(self, config):
+        self._PROXY_PREFIX = config.get('UrlParser', 'proxy_prefix', fallback=None)
+        self._PROXY_PREFIX_COMMON_PART = config.get('UrlParser', 'proxy_prefix_common_part', fallback="?url=")
+        self._PROXY_REPORT_NO_PREFIX = config.getboolean('UrlParser', 'proxy_report_no_prefix', fallback=False)
+        self._PROXY_REPORT_WRONG_PREFIX = config.getboolean('UrlParser', 'proxy_report_wrong_prefix', fallback=False)
+
+        # validate config
+        if self._PROXY_REPORT_NO_PREFIX and not self._PROXY_PREFIX:
+            raise Exception("Parameter proxy_prefix required for proxy_report_no_prefix")
+        if self._PROXY_REPORT_WRONG_PREFIX and not self._PROXY_PREFIX:
+            raise Exception("Parameter proxy_prefix required for proxy_report_wrong_prefix")
+
+    def parse(self, url):
+        parser_result = ParserResult(
+            insecure_url=self._test_insecure_url(url),
+            no_proxy_prefix=self._test_no_proxy_prefix(url),
+            wrong_proxy_prefix=self._test_wrong_proxy_prefix(url)
+        )
+        return parser_result
+
+    def _test_insecure_url(self, url):
+        return not url.startswith("https:")
+
+    def _test_no_proxy_prefix(self, url):
+        if not self._PROXY_REPORT_NO_PREFIX:
+            return None
+        return not url.startswith(self._PROXY_PREFIX)
+
+    def _test_wrong_proxy_prefix(self, url):
+        if not self._PROXY_REPORT_WRONG_PREFIX:
+            return None
+        return self._PROXY_PREFIX_COMMON_PART in url \
+            and not url.startswith(self._PROXY_PREFIX)
diff --git a/folio_bad_urls/web.py b/folio_bad_urls/web.py
index 4fb47fc..17a2e39 100644
--- a/folio_bad_urls/web.py
+++ b/folio_bad_urls/web.py
@@ -5,6 +5,7 @@
 import urllib.robotparser
 
 from folio_bad_urls.data import ElectronicRecord, TestResult, LocalStatusCode
+from folio_bad_urls.url_parser import UrlParser
 
 log = logging.getLogger(__name__)
 log.setLevel(logging.INFO)
@@ -25,6 +26,7 @@ def __init__(self, config):
         self._MAX_CRAWL_DELAY = float(self._config.get('WebTester', 'max_crawl_delay'))
         self._REQUEST_TIMEOUT = float(self._config.get('WebTester', 'request_timeout'))
 
+        self._parser = UrlParser(config)
         self._crawl_rules = dict()
         self._last_query_time = dict()
         self._init_filters()
@@ -46,6 +48,9 @@ def _init_filters(self):
     def test_record(self, record: ElectronicRecord):
         url = record.url
 
+        # check static URL parsing results
+        parser_result = self._parser.parse(url)
+
         # check local filters
         if not self._check_filters(url):
             log.debug(f"Skipping URL due to filters: {url}")
@@ -55,10 +60,12 @@ def test_record(self, record: ElectronicRecord):
         rules = self._check_crawl_rules(url)
         if not rules.can_fetch(url):
             log.warn(f"Robots.txt blocks URL: {url}")
-            return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_BLOCKS_URL)
+            return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_BLOCKS_URL,
+                    parser_result=parser_result)
         pause_ok = self._pause_if_needed(url, rules)
         if not pause_ok:
-            return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_TIMEOUT_EXCESSIVE)
+            return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_TIMEOUT_EXCESSIVE,
+                    parser_result=parser_result)
 
         # load URL and check response
         try:
@@ -66,13 +73,16 @@ def test_record(self, record: ElectronicRecord):
             status_code = int(response.status_code)
             last_permanent_redirect = self._get_last_permanent_redirect(response, url)
             log.debug(f"Got status code {status_code} for url {url}")
-            return TestResult(record.instance_hrid, url, status_code, permanent_redirect=last_permanent_redirect)
+            return TestResult(record.instance_hrid, url, status_code, permanent_redirect=last_permanent_redirect,
+                    parser_result=parser_result)
         except requests.exceptions.Timeout:
             log.debug(f"Request timed out for url {url}")
-            return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED)
+            return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED,
+                    parser_result=parser_result)
         except requests.exceptions.RequestException as e:
             log.warn(f"Caught unexpected RequestException with url {url}: {e}")
-            return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED)
+            return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED,
+                    parser_result=parser_result)
 
     def _check_filters(self, url):
         # check allow list
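
Reviewer note, not part of the patch: below is a minimal sketch of how the new `UrlParser` behaves under the example configuration. It assumes the `folio_bad_urls` package is importable, reuses the placeholder proxy prefix from `example.properties`, and the sample URLs (including the `oldproxy` host) are hypothetical.

```python
import configparser

from folio_bad_urls.url_parser import UrlParser

# In-memory config mirroring the [UrlParser] block in example.properties.
config = configparser.ConfigParser()
config.read_string("""
[UrlParser]
proxy_prefix = https://proxy.lib.leafygreenuniversity.edu/login?url=
proxy_prefix_common_part = ?url=
proxy_report_no_prefix = True
proxy_report_wrong_prefix = True
""")
parser = UrlParser(config)

# Correctly proxied URL: no flags raised.
ok = parser.parse("https://proxy.lib.leafygreenuniversity.edu/login?url=https://example.com")
assert not ok.has_no_proxy_prefix() and not ok.has_wrong_proxy_prefix()

# Unproxied URL: missing the prefix, but it lacks the "?url=" common part,
# so it is not flagged as having the *wrong* prefix.
bare = parser.parse("https://example.com/article")
assert bare.has_no_proxy_prefix() and not bare.has_wrong_proxy_prefix()

# Stale proxy host ending in the same "?url=" common part: flagged both ways,
# and the "http:" scheme also marks it insecure.
stale = parser.parse("http://oldproxy.example.edu/login?url=https://example.com")
assert stale.has_no_proxy_prefix() and stale.has_wrong_proxy_prefix()
assert stale.is_insecure_url()
```

Each `proxy_report_*` option gates its test: when a flag is off, the corresponding `_test_*` method returns `None` and `Reporter` omits that column from the CSV.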