Test proxy prefixes (absent or wrong) and refactor static URL parsing. #9

Open · wants to merge 1 commit into main
15 changes: 13 additions & 2 deletions README.md
@@ -26,13 +26,13 @@ For connecting to and using the FOLIO APIs. All properties are **required**.
| tenant_id | FOLIO Tenant ID | Y |
| username | FOLIO username | Y |
| password | FOLIO password | Y |
| strategy | Name of the strategy to use. See [Folio Stratgy](#folio-strategy) below. | Y |
| strategy | Name of the strategy to use. See [Folio Strategy](#folio-strategy) below. | Y |
| query_limit | Number of records requested in each FOLIO API call. Note: for SrsInstanceIdsStrategy, this must be approximately 30 or lower so that the maximum query string length is not exceeded. | Y |
| batch_limit | Number of records tested for each output file. Must be equal to or a multiple of query_limit. The actual file will contain only those records which had bad URLs. | Y |

### WebTester Section

For testing each URL.
For loading and examining the response to each URL.

| Property | Description | Required |
|----------|-------------|---------|
@@ -42,6 +42,17 @@ For testing each URL.
| allow_list | Comma-separated list of strings. If present, only URLs including one of these strings will be tested. | N |
| block_list | Comma-separated list of strings. If present, URLs that include one of these strings will be skipped. `block_list` is ignored if `allow_list` is present. | N |

### UrlParser Section

String-pattern tests that examine the URL itself, without making any web request; a worked example follows the table below.

| Property | Description | Required |
|----------|-------------|---------|
| proxy_prefix | The expected proxy prefix string, used by the two `proxy_report*` parameters below. | N |
| proxy_prefix_common_part | A short substring at the end of `proxy_prefix` that is also expected to be at the end of any incorrect prefix. Default is `?url=`. | N |
| proxy_report_no_prefix | If True, the CSV will include a column flagging any URL that does not start with `proxy_prefix`. | N |
| proxy_report_wrong_prefix | If True, the CSV will include a column flagging any URL that includes `proxy_prefix_common_part` but does not start with the full `proxy_prefix`. | N |
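
For example, assuming the `proxy_prefix` shown in `example.properties` (`https://proxy.lib.leafygreenuniversity.edu/login?url=`) and the default `proxy_prefix_common_part` of `?url=`, with both report options turned on (the URLs below are hypothetical):

- `http://journal.example.com/article` is flagged by `proxy_report_no_prefix`, because it does not start with the configured prefix at all.
- `https://old-proxy.example.edu/login?url=http://journal.example.com/article` is flagged by `proxy_report_wrong_prefix`, because it contains `?url=` but does not start with the full `proxy_prefix`.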

### Logging Section

| Property | Description | Required |
6 changes: 6 additions & 0 deletions example.properties
@@ -15,5 +15,11 @@ request_timeout = 10
# allow_list = abc.com, def.com
# block_list = ghi.com, jkl.com

[UrlParser]
# proxy_prefix = https://proxy.lib.leafygreenuniversity.edu/login?url=
# proxy_prefix_common_part = ?url=
# proxy_report_no_prefix = True
# proxy_report_wrong_prefix = True

[Logging]
log_file = folio_bad_urls.log
28 changes: 24 additions & 4 deletions folio_bad_urls/data.py
@@ -10,22 +10,42 @@ def __repr__(self):
class TestResult:
""" The result of testing a URL from a record. """

def __init__(self, instance_hrid, url, status_code, permanent_redirect=None):
def __init__(self, instance_hrid, url, status_code, permanent_redirect=None,
parser_result=None):

self.instance_hrid = instance_hrid
self.url = url
self.status_code = status_code
self.permanent_redirect = permanent_redirect

def is_insecure_url(self):
return not self.url.startswith("https:")
self._parser_result = parser_result

def is_bad_url(self):
return self.status_code != 200

def parser_result(self):
return self._parser_result

def __repr__(self):
return str(self.__dict__)

class LocalStatusCode:
CONNECTION_FAILED = 0
ROBOTS_TXT_BLOCKS_URL = -10
ROBOTS_TXT_TIMEOUT_EXCESSIVE = -11

class ParserResult:
""" The result of string scanning of a URL from a record. """

def __init__(self, *, insecure_url, no_proxy_prefix, wrong_proxy_prefix):
self._insecure_url = insecure_url
self._no_proxy_prefix = no_proxy_prefix
self._wrong_proxy_prefix = wrong_proxy_prefix

def is_insecure_url(self):
return self._insecure_url

def has_no_proxy_prefix(self):
return self._no_proxy_prefix

def has_wrong_proxy_prefix(self):
return self._wrong_proxy_prefix
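
A minimal sketch of how these classes fit together (the instance HRID, URL, and flag values below are hypothetical, chosen only for illustration):

```python
from folio_bad_urls.data import ParserResult, TestResult

# Static scan of a URL that uses http and carries no proxy prefix at all.
parser_result = ParserResult(
    insecure_url=True,
    no_proxy_prefix=True,
    wrong_proxy_prefix=False,
)

# The web test itself succeeded (HTTP 200), so only the parser flags are of interest.
result = TestResult("in00000000001", "http://journal.example.com/article", 200,
                    parser_result=parser_result)

assert not result.is_bad_url()                        # status code is 200
assert result.parser_result().is_insecure_url()       # http, not https
assert result.parser_result().has_no_proxy_prefix()   # missing the configured prefix
```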
40 changes: 33 additions & 7 deletions folio_bad_urls/reporter.py
@@ -3,20 +3,29 @@
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

HEADER = "instance_hrid, url, status_code, insecure_url, permanent_redirect\n"

class Reporter:
""" Save bad URLs to a file. """

STANDARD_HEADER_FIELDS = [
"instance_hrid",
"url",
"status_code",
"permanent_redirect",
"insecure_url",
]

def __init__(self, config):
self._config = config
log.addHandler(self._config.log_file_handler)

self._PROXY_REPORT_NO_PREFIX = bool(config.get('UrlParser', 'proxy_report_no_prefix', fallback=False))
self._PROXY_REPORT_WRONG_PREFIX = bool(config.get('UrlParser', 'proxy_report_wrong_prefix', fallback=False))

def write_results(self, offset, results):
filename = f'result_{str(offset)}.csv'
bad_urls = 0
with open(filename, 'w') as file:
file.write(HEADER)
file.write(self._format_header())
for result in results:
if result.is_bad_url():
bad_urls += 1
@@ -25,10 +25,34 @@ def write_results(self, offset, results):
log.info(f"Wrote file with {bad_urls} bad URLs.")
return bad_urls

def _format_header(self):
header_fields = Reporter.STANDARD_HEADER_FIELDS
if self._PROXY_REPORT_NO_PREFIX:
header_fields.append("proxy_no_prefix")
if self._PROXY_REPORT_WRONG_PREFIX:
header_fields.append("proxy_wrong_prefix")
return ", ".join(header_fields) + "\n"

def _format_result(self, result):
return f"{result.instance_hrid}, \
result_string = f"{result.instance_hrid}, \
{result.url}, \
{result.status_code}, \
{result.is_insecure_url() if result.is_insecure_url() else ''}, \
{result.permanent_redirect if result.permanent_redirect else ''} \
\n"
{result.permanent_redirect if result.permanent_redirect else ''}, \
{self._format_bool(result.parser_result().is_insecure_url())} \
"
if self._PROXY_REPORT_NO_PREFIX:
result_string += self._format_bool(result.parser_result().has_no_proxy_prefix())
if self._PROXY_REPORT_WRONG_PREFIX:
result_string += self._format_bool(result.parser_result().has_wrong_proxy_prefix())
result_string += "\n"
return result_string

def _format_bool(self, value):
return 'Y,' if value else ','
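
As a sketch of the resulting CSV layout: with both `proxy_report_no_prefix` and `proxy_report_wrong_prefix` enabled, `_format_header()` joins the standard fields with the two optional columns, so each output file starts with the line below; every data row then carries a `Y` in the matching column when the corresponding flag applies (see `_format_bool` above).

```
instance_hrid, url, status_code, permanent_redirect, insecure_url, proxy_no_prefix, proxy_wrong_prefix
```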
44 changes: 44 additions & 0 deletions folio_bad_urls/url_parser.py
@@ -0,0 +1,44 @@
import logging

from folio_bad_urls.data import ParserResult

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# log.setLevel(logging.DEBUG)

class UrlParser:
""" Parse URLs for lexical result information. """

def __init__(self, config):
self._PROXY_PREFIX = config.get('UrlParser', 'proxy_prefix', fallback=None)
self._PROXY_PREFIX_COMMON_PART = config.get('UrlParser', 'proxy_prefix_common_part', fallback="?url=")
self._PROXY_REPORT_NO_PREFIX = bool(config.get('UrlParser', 'proxy_report_no_prefix', fallback=False))
self._PROXY_REPORT_WRONG_PREFIX = bool(config.get('UrlParser', 'proxy_report_wrong_prefix', fallback=False))

# validate config
if self._PROXY_REPORT_NO_PREFIX and not self._PROXY_PREFIX:
raise Exception("Parameter proxy_prefix required for proxy_report_no_prefix")
if self._PROXY_REPORT_WRONG_PREFIX and not self._PROXY_PREFIX:
raise Exception("Parameter proxy_prefix required for proxy_report_wrong_prefix")

def parse(self, url):
parser_result = ParserResult(
insecure_url=self._test_insecure_url(url),
no_proxy_prefix=self._test_no_proxy_prefix(url),
wrong_proxy_prefix=self._test_wrong_proxy_prefix(url)
)
return parser_result

def _test_insecure_url(self, url):
return not url.startswith("https:")

def _test_no_proxy_prefix(self, url):
if not self._PROXY_REPORT_NO_PREFIX:
return None
return not url.startswith(self._PROXY_PREFIX)

def _test_wrong_proxy_prefix(self, url):
if not self._PROXY_REPORT_WRONG_PREFIX:
return None
return self._PROXY_PREFIX_COMMON_PART in url \
and not url.startswith(self._PROXY_PREFIX)
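
A minimal usage sketch, assuming a `configparser`-style config in which the `[UrlParser]` settings from `example.properties` have been uncommented (the URL below is hypothetical):

```python
import configparser

from folio_bad_urls.url_parser import UrlParser

config = configparser.ConfigParser()
config.read("example.properties")  # assumes proxy_prefix and both proxy_report_* flags are uncommented

parser = UrlParser(config)
result = parser.parse("https://old-proxy.example.edu/login?url=http://journal.example.com/article")

print(result.is_insecure_url())         # False: the outer URL itself is https
print(result.has_no_proxy_prefix())     # True: it does not start with the configured proxy_prefix
print(result.has_wrong_proxy_prefix())  # True: it contains ?url= but with the wrong prefix
```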
20 changes: 15 additions & 5 deletions folio_bad_urls/web.py
@@ -5,6 +5,7 @@
import urllib.robotparser

from folio_bad_urls.data import ElectronicRecord, TestResult, LocalStatusCode
from folio_bad_urls.url_parser import UrlParser

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
@@ -25,6 +26,7 @@ def __init__(self, config):
self._MAX_CRAWL_DELAY = float(self._config.get('WebTester', 'max_crawl_delay'))
self._REQUEST_TIMEOUT = float(self._config.get('WebTester', 'request_timeout'))

self._parser = UrlParser(config)
self._crawl_rules = dict()
self._last_query_time = dict()
self._init_filters()
@@ -46,6 +48,9 @@ def _init_filters(self):
def test_record(self, record: ElectronicRecord):
url = record.url

# check static URL parsing results
parser_result = self._parser.parse(url)

# check local filters
if not self._check_filters(url):
log.debug(f"Skipping URL due to filters: {url}")
@@ -55,24 +60,29 @@ def test_record(self, record: ElectronicRecord):
rules = self._check_crawl_rules(url)
if not rules.can_fetch(url):
log.warn(f"Robots.txt blocks URL: {url}")
return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_BLOCKS_URL)
return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_BLOCKS_URL,
parser_result=parser_result)
pause_ok = self._pause_if_needed(url, rules)
if not pause_ok:
return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_TIMEOUT_EXCESSIVE)
return TestResult(record.instance_hrid, url, LocalStatusCode.ROBOTS_TXT_TIMEOUT_EXCESSIVE,
parser_result=parser_result)

# load URL and check response
try:
response = requests.get(url, timeout = self._REQUEST_TIMEOUT, headers = WebTester.HEADERS)
status_code = int(response.status_code)
last_permanent_redirect = self._get_last_permanent_redirect(response, url)
log.debug(f"Got status code {status_code} for url {url}")
return TestResult(record.instance_hrid, url, status_code, permanent_redirect=last_permanent_redirect)
return TestResult(record.instance_hrid, url, status_code, permanent_redirect=last_permanent_redirect,
parser_result=parser_result)
except requests.exceptions.Timeout:
log.debug(f"Request timed out for url {url}")
return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED)
return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED,
parser_result=parser_result)
except requests.exceptions.RequestException as e:
log.warn(f"Caught unexpected RequestException with url {url}: {e}")
return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED)
return TestResult(record.instance_hrid, url, LocalStatusCode.CONNECTION_FAILED,
parser_result=parser_result)

def _check_filters(self, url):
# check allow list