Test proxy prefixes (absent or wrong) and refactor static URL parsing. #9

Open · wants to merge 1 commit into main
15 changes: 13 additions & 2 deletions README.md
@@ -26,13 +26,13 @@ For connecting to and using the FOLIO APIs. All properties are **required**.
| tenant_id | FOLIO Tenant ID | Y |
| username | FOLIO username | Y |
| password | FOLIO password | Y |
| strategy | Name of the strategy to use. See [Folio Stratgy](#folio-strategy) below. | Y |
| strategy | Name of the strategy to use. See [Folio Strategy](#folio-strategy) below. | Y |
| query_limit | Number of records requested in each FOLIO API call. Note: for SrsInstanceIdsStrategy, this must be approximately 30 or lower so that the maximum query string length is not exceeded. | Y |
| batch_limit | Number of records tested for each output file. Must be equal to or a multiple of query_limit. The actual file will contain only those records which had bad URLs. | Y |

### WebTester Section

For testing each URL.
For loading and examining the response to each URL.

| Property | Description | Required |
|----------|-------------|---------|
@@ -42,6 +42,17 @@ For testing each URL.
| allow_list | Comma-separated list of strings. If present, only URLs including one of these strings will be tested. | N |
| block_list | Comma-separated list of strings. If present, URLs that include one of these strings will be skipped. `block_list` is ignored if `allow_list` is present. | N |

### UrlParser Section

String-pattern tests that examine the URL itself, without making any web request; a worked example follows the table below.

| Property | Description | Required |
|----------|-------------|---------|
| proxy_prefix | The expected proxy prefix string, used by the two `proxy_report*` parameters below. | N |
| proxy_prefix_common_part | A short substring at the end of `proxy_prefix` that is also expected to be at the end of any incorrect prefix. Default is `?url=`. | N |
| proxy_report_no_prefix | If True, the CSV will include a column flagging any URL that does not start with `proxy_prefix`. | N |
| proxy_report_wrong_prefix | If True, the CSV will include a column flagging any URL that includes `proxy_prefix_common_part` but does not start with the full `proxy_prefix`. | N |
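
For example, assuming the `proxy_prefix` shown in `example.properties` (`https://proxy.lib.leafygreenuniversity.edu/login?url=`) and the default `proxy_prefix_common_part` of `?url=`, with both report options turned on (the URLs below are hypothetical):

- `http://journal.example.com/article` is flagged by `proxy_report_no_prefix`, because it does not start with the configured prefix at all.
- `https://old-proxy.example.edu/login?url=http://journal.example.com/article` is flagged by `proxy_report_wrong_prefix`, because it contains `?url=` but does not start with the full `proxy_prefix`.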

### Logging Section

| Property | Description | Required |
6 changes: 6 additions & 0 deletions example.properties
@@ -15,5 +15,11 @@ request_timeout = 10
# allow_list = abc.com, def.com
# block_list = ghi.com, jkl.com

[UrlParser]
# proxy_prefix = https://proxy.lib.leafygreenuniversity.edu/login?url=
# proxy_prefix_common_part = ?url=
# proxy_report_no_prefix = True
# proxy_report_wrong_prefix = True

[Logging]
log_file = folio_bad_urls.log
28 changes: 24 additions & 4 deletions folio_bad_urls/data.py
@@ -10,22 +10,42 @@ def __repr__(self):
class TestResult:
""" The result of testing a URL from a record. """

def __init__(self, instance_hrid, url, status_code, permanent_redirect=None):
def __init__(self, instance_hrid, url, status_code, permanent_redirect=None,
parser_result=None):

self.instance_hrid = instance_hrid
self.url = url
self.status_code = status_code
self.permanent_redirect = permanent_redirect

def is_insecure_url(self):
return not self.url.startswith("https:")
self._parser_result = parser_result

def is_bad_url(self):
return self.status_code != 200

def parser_result(self):
return self._parser_result

def __repr__(self):
return str(self.__dict__)

class LocalStatusCode:
CONNECTION_FAILED = 0
ROBOTS_TXT_BLOCKS_URL = -10
ROBOTS_TXT_TIMEOUT_EXCESSIVE = -11

class ParserResult:
""" The result of string scanning of a URL from a record. """

def __init__(self, *, insecure_url, no_proxy_prefix, wrong_proxy_prefix):
self._insecure_url = insecure_url
self._no_proxy_prefix = no_proxy_prefix
self._wrong_proxy_prefix = wrong_proxy_prefix

def is_insecure_url(self):
return self._insecure_url

def has_no_proxy_prefix(self):
return self._no_proxy_prefix

def has_wrong_proxy_prefix(self):
return self._wrong_proxy_prefix
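
A minimal sketch of how these classes fit together (the instance HRID, URL, and flag values below are hypothetical, chosen only for illustration):

```python
from folio_bad_urls.data import ParserResult, TestResult

# Static scan of a URL that uses http and carries no proxy prefix at all.
parser_result = ParserResult(
    insecure_url=True,
    no_proxy_prefix=True,
    wrong_proxy_prefix=False,
)

# The web test itself succeeded (HTTP 200), so only the parser flags are of interest.
result = TestResult("in00000000001", "http://journal.example.com/article", 200,
                    parser_result=parser_result)

assert not result.is_bad_url()                        # status code is 200
assert result.parser_result().is_insecure_url()       # http, not https
assert result.parser_result().has_no_proxy_prefix()   # missing the configured prefix
```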
40 changes: 33 additions & 7 deletions folio_bad_urls/reporter.py
@@ -3,20 +3,29 @@
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

HEADER = "instance_hrid, url, status_code, insecure_url, permanent_redirect\n"

class Reporter:
""" Save bad URLs to a file. """

STANDARD_HEADER_FIELDS = [
"instance_hrid",
"url",
"status_code",
"permanent_redirect",
"insecure_url",
]

def __init__(self, config):
self._config = config
log.addHandler(self._config.log_file_handler)

self._PROXY_REPORT_NO_PREFIX = bool(config.get('UrlParser', 'proxy_report_no_prefix', fallback=False))
self._PROXY_REPORT_WRONG_PREFIX = bool(config.get('UrlParser', 'proxy_report_wrong_prefix', fallback=False))

def write_results(self, offset, results):
filename = f'result_{str(offset)}.csv'
bad_urls = 0
with open(filename, 'w') as file:
file.write(HEADER)
file.write(self._format_header())
for result in results:
if result.is_bad_url():
bad_urls += 1
@@ -25,10 +25,34 @@ def write_results(self, offset, results):
log.info(f"Wrote file with {bad_urls} bad URLs.")
return bad_urls

def _format_header(self):
header_fields = Reporter.STANDARD_HEADER_FIELDS
if self._PROXY_REPORT_NO_PREFIX:
header_fields.append("proxy_no_prefix")
if self._PROXY_REPORT_WRONG_PREFIX:
header_fields.append("proxy_wrong_prefix")
return ", ".join(header_fields) + "\n"

def _format_result(self, result):
return f"{result.instance_hrid}, \
result_string = f"{result.instance_hrid}, \
{result.url}, \
{result.status_code}, \
{result.is_insecure_url() if result.is_insecure_url() else ''}, \
{result.permanent_redirect if result.permanent_redirect else ''} \
\n"
{result.permanent_redirect if result.permanent_redirect else ''}, \
{self._format_bool(result.parser_result().is_insecure_url())} \
"
if self._PROXY_REPORT_NO_PREFIX:
result_string += self._format_bool(result.parser_result().has_no_proxy_prefix())
if self._PROXY_REPORT_WRONG_PREFIX:
result_string += self._format_bool(result.parser_result().has_wrong_proxy_prefix())
result_string += "\n"
return result_string

def _format_bool(self, value):
return 'Y,' if value else ','
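
As a sketch of the resulting CSV layout: with both `proxy_report_no_prefix` and `proxy_report_wrong_prefix` enabled, `_format_header()` joins the standard fields with the two optional columns, so each output file starts with the line below; every data row then carries a `Y` in the matching column when the corresponding flag applies (see `_format_bool` above).

```
instance_hrid, url, status_code, permanent_redirect, insecure_url, proxy_no_prefix, proxy_wrong_prefix
```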
44 changes: 44 additions & 0 deletions folio_bad_urls/url_parser.py
@@ -0,0 +1,44 @@
import logging

from folio_bad_urls.data import ParserResult

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# log.setLevel(logging.DEBUG)

class UrlParser:
""" Parse URLs for lexical result information. """

def __init__(self, config):
self._PROXY_PREFIX = config.get('UrlParser', 'proxy_prefix', fallback=None)
self._PROXY_PREFIX_COMMON_PART = config.get('UrlParser', 'proxy_prefix_common_part', fallback="?url=")
self._PROXY_REPORT_NO_PREFIX = bool(config.get('UrlParser', 'proxy_report_no_prefix', fallback=False))
self._PROXY_REPORT_WRONG_PREFIX = bool(config.get('UrlParser', 'proxy_report_wrong_prefix', fallback=False))

# validate config
if self._PROXY_REPORT_NO_PREFIX and not self._PROXY_PREFIX:
raise Exception("Parameter proxy_prefix required for proxy_report_no_prefix")
if self._PROXY_REPORT_WRONG_PREFIX and not self._PROXY_PREFIX:
raise Exception("Parameter proxy_prefix required for proxy_report_wrong_prefix")

def parse(self, url):
parser_result = ParserResult(
insecure_url=self._test_insecure_url(url),
no_proxy_prefix=self._test_no_proxy_prefix(url),
wrong_proxy_prefix=self._test_wrong_proxy_prefix(url)
)
return parser_result

def _test_insecure_url(self, url):
return not url.startswith("https:")

def _test_no_proxy_prefix(self, url):
if not self._PROXY_REPORT_NO_PREFIX:
return None
return not url.startswith(self._PROXY_PREFIX)

def _test_wrong_proxy_prefix(self, url):
if not self._PROXY_REPORT_WRONG_PREFIX:
return None
return self._PROXY_PREFIX_COMMON_PART in url \
and not url.startswith(self._PROXY_PREFIX)
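
A minimal usage sketch, assuming a `configparser`-style config in which the `[UrlParser]` settings from `example.properties` have been uncommented (the URL below is hypothetical):

```python
import configparser

from folio_bad_urls.url_parser import UrlParser

config = configparser.ConfigParser()
config.read("example.properties")  # assumes proxy_prefix and both proxy_report_* flags are uncommented

parser = UrlParser(config)
result = parser.parse("https://old-proxy.example.edu/login?url=http://journal.example.com/article")

print(result.is_insecure_url())         # False: the outer URL itself is https
print(result.has_no_proxy_prefix())     # True: it does not start with the configured proxy_prefix
print(result.has_wrong_proxy_prefix())  # True: it contains ?url= but with the wrong prefix
```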
20 changes: 15 additions & 5 deletions folio_bad_urls/web.py
@@ -5,6 +5,7 @@
import urllib.robotparser

from folio_bad_urls.data import ElectronicRecord, TestResult, LocalStatusCode
from folio_bad_urls.url_parser import UrlParser

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
@@ -25,6 +26,7 @@ def __init__(self, config):
self._MAX_CRAWL_DELAY = float(self._config.get('WebTester', 'max_crawl_delay'))
self._REQUEST_TIMEOUT = float(self._config.get('WebTester', 'request_timeout'))

self._parser = UrlParser(config)
self._crawl_rules = dict()
self._last_query_time = dict()
self._init_filters()
@@ -46,6 +48,9 @@ def _init_filters(self):
def test_record(self, record: ElectronicRecord):
url = record.url

# check static URL parsing results
parser_result = self._parser.parse(url)

# check local filters
if not self._check_filters(url):
log.debug(f"Skipping URL due to filters: {url}")
@@ -55,24 +60,29 @@ def test_record(self, record: ElectronicRecord):
rules = self._check_crawl_rules(url)
if not rules.can_fetch(url):
log.warn(f"Robots.txt blocks URL: {url}")
return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_BLOCKS_URL)
return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_BLOCKS_URL,
parser_result=parser_result)
pause_ok = self._pause_if_needed(url, rules)
if not pause_ok:
return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_TIMEOUT_EXCESSIVE)
return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_TIMEOUT_EXCESSIVE,
parser_result=parser_result)

# load URL and check response
try:
response = requests.get(url, timeout = self._REQUEST_TIMEOUT, headers = WebTester.HEADERS)
status_code = int(response.status_code)
last_permanent_redirect = self._get_last_permanent_redirect(response, url)
log.debug(f"Got status code {status_code} for url {url}")
return TestResult(record.instance_hrid, url, status_code, permanent_redirect=last_permanent_redirect)
return TestResult(record.instance_hrid, url, status_code, permanent_redirect=last_permanent_redirect,
parser_result=parser_result)
except requests.exceptions.Timeout:
log.debug(f"Request timed out for url {url}")
return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED)
return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED,
parser_result=parser_result)
except requests.exceptions.RequestException as e:
log.warn(f"Caught unexpected RequestException with url {url}: {e}")
return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED)
return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED,
parser_result=parser_result)

def _check_filters(self, url):
# check allow list