From ee87f714a6bbec5e39ad7e00d871bedbb9d8cb45 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Thu, 23 Jul 2020 14:48:59 +1000 Subject: [PATCH 01/28] [DQL2-6] download archive to a temporary file so we can do type sniffing - This uses the external URL without an API key, so it will not work on private datasets, but it is otherwise fairly reliable. --- ckanext/qa/tasks.py | 179 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 152 insertions(+), 27 deletions(-) diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index 26fe8289..38873591 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -4,15 +4,23 @@ ''' import datetime import json +import math import os +import tempfile +import time import traceback import urlparse import routes -from ckan.common import _ +import requests +from ckan.common import _ from ckan.lib import i18n from ckan.plugins import toolkit +try: + from ckan.plugins.toolkit import config +except ImportError: + from pylons import config import ckan.lib.helpers as ckan_helpers from sniff_format import sniff_file_format import lib @@ -22,6 +30,11 @@ log = logging.getLogger(__name__) +SSL_VERIFY = True +MAX_CONTENT_LENGTH = int(config.get('ckanext.qa.max_content_length', 1e7)) +CHUNK_SIZE = 16 * 1024 # 16kb +DOWNLOAD_TIMEOUT = 30 + if toolkit.check_ckan_version(max_version='2.6.99'): from ckan.lib import celery_app @@ -352,35 +365,147 @@ def score_by_sniffing_data(archival, resource, score_reasons): return (None, None) # Analyse the cached file filepath = archival.cache_filepath + delete_file = False if not os.path.exists(filepath): - score_reasons.append(_('Cache filepath does not exist: "%s".') % filepath) - return (None, None) - else: - if filepath: - sniffed_format = sniff_file_format(filepath) - score = lib.resource_format_scores().get(sniffed_format['format']) \ - if sniffed_format else None - if sniffed_format: - score_reasons.append(_('Content of file appeared to be format "%s" which receives openness score: %s.') - % 
(sniffed_format['format'], score)) - return score, sniffed_format['format'] - else: - score_reasons.append(_('The format of the file was not recognized from its contents.')) + log.debug("File not found on disk for resource %s", resource) + if resource.url_type == 'upload': + try: + resource_dict = toolkit.get_action('resource_show')(None, {'id': resource.id}) + filepath = _download_url(resource_dict['url']).name + delete_file = True + except Exception as e: + score_reasons.append(_('A system error occurred during downloading this file') + '. %s' % e) return (None, None) else: - # No cache_url - if archival.status_id == Status.by_text('Chose not to download'): - score_reasons.append(_('File was not downloaded deliberately') + '. ' - + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.')) - return (None, None) - elif archival.is_broken is None and archival.status_id: - # i.e. 'Download failure' or 'System error during archival' - score_reasons.append(_('A system error occurred during downloading this file') + '. ' - + _('Reason') + ': %s. 
' % archival.reason + _('Using other methods to determine file openness.')) - return (None, None) - else: - score_reasons.append(_('This file had not been downloaded at the time of scoring it.')) - return (None, None) + score_reasons.append(_('Cache filepath does not exist: "%s".') % filepath) + return (None, None) + if filepath: + sniffed_format = sniff_file_format(filepath) + if delete_file: + try: + os.remove(filepath) + except OSError as e: + log.warn("Unable to remove temporary file %s: %s", filepath, e) + score = lib.resource_format_scores().get(sniffed_format['format']) \ + if sniffed_format else None + if sniffed_format: + score_reasons.append(_('Content of file appeared to be format "%s" which receives openness score: %s.') + % (sniffed_format['format'], score)) + return score, sniffed_format['format'] + else: + score_reasons.append(_('The format of the file was not recognized from its contents.')) + return (None, None) + else: + # No cache_url + if archival.status_id == Status.by_text('Chose not to download'): + score_reasons.append(_('File was not downloaded deliberately') + '. ' + + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.')) + return (None, None) + elif archival.is_broken is None and archival.status_id: + # i.e. 'Download failure' or 'System error during archival' + score_reasons.append(_('A system error occurred during downloading this file') + '. ' + + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.')) + return (None, None) + else: + score_reasons.append(_('This file had not been downloaded at the time of scoring it.')) + return (None, None) + + +def _download_url(url): + # check scheme + scheme = urlparse.urlsplit(url).scheme + if scheme not in ('http', 'https', 'ftp'): + raise IOError( + 'Only http, https, and ftp resources may be fetched.' 
+ ) + + # fetch the resource data + log.info('Fetching from: {0}'.format(url)) + tmp_file = get_tmp_file(url) + length = 0 + cl = None + try: + headers = {} + response = get_response(url, headers) + + # download the file to a tempfile on disk + for chunk in response.iter_content(CHUNK_SIZE): + length += len(chunk) + if length > MAX_CONTENT_LENGTH: + log.warn("File size exceeds length limit %s, truncating", MAX_CONTENT_LENGTH) + break + tmp_file.write(chunk) + + except requests.exceptions.HTTPError as error: + # status code error + log.debug('HTTP error: {}'.format(error)) + tmp_file.close() + os.remove(tmp_file.name) + raise HTTPError( + "Received a bad HTTP response when trying to download " + "the data file", status_code=error.response.status_code, + request_url=url, response=error) + except requests.exceptions.Timeout: + log.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT)) + tmp_file.close() + os.remove(tmp_file.name) + raise IOError('Connection timed out after {}s'.format( + DOWNLOAD_TIMEOUT)) + except requests.exceptions.RequestException as e: + try: + err_message = str(e.reason) + except AttributeError: + err_message = str(e) + log.warning('URL error: {}'.format(err_message)) + tmp_file.close() + os.remove(tmp_file.name) + raise HTTPError( + message=err_message, status_code=None, + request_url=url, response=None) + + log.info('Downloaded ok - %s', printable_file_size(length)) + tmp_file.seek(0) + return tmp_file + + +def get_response(url, headers): + def get_url(): + return requests.get( + url, + headers=headers, + timeout=DOWNLOAD_TIMEOUT, + verify=SSL_VERIFY, + stream=True, # just gets the headers for now + ) + response = get_url() + if response.status_code == 202: + # Seen: https://data-cdfw.opendata.arcgis.com/datasets + # In this case it means it's still processing, so do retries. + # 202 can mean other things, but there's no harm in retries. 
+ wait = 1 + while wait < 120 and response.status_code == 202: + # log.info('Retrying after {}s'.format(wait)) + time.sleep(wait) + response = get_url() + wait *= 3 + response.raise_for_status() + return response + + +def get_tmp_file(url): + filename = url.split('/')[-1].split('#')[0].split('?')[0] + tmp_file = tempfile.NamedTemporaryFile(suffix=filename, delete=False) + return tmp_file + + +def printable_file_size(size_bytes): + if size_bytes == 0: + return '0 bytes' + size_name = ('bytes', 'KB', 'MB', 'GB', 'TB') + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 1) + return "%s %s" % (s, size_name[i]) def score_by_url_extension(resource, score_reasons): From a3a3e401b68df4b71f843c1dd52dde0e428159e2 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Thu, 23 Jul 2020 16:06:56 +1000 Subject: [PATCH 02/28] [DQL2-6] use archival URL when not on disk, regardless of whether it's an upload - This appears to point to either the download URL, or the cache URL, as needed --- ckanext/qa/tasks.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index 38873591..b3ba6d6b 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -367,18 +367,15 @@ def score_by_sniffing_data(archival, resource, score_reasons): filepath = archival.cache_filepath delete_file = False if not os.path.exists(filepath): - log.debug("File not found on disk for resource %s", resource) - if resource.url_type == 'upload': - try: - resource_dict = toolkit.get_action('resource_show')(None, {'id': resource.id}) - filepath = _download_url(resource_dict['url']).name - delete_file = True - except Exception as e: - score_reasons.append(_('A system error occurred during downloading this file') + '. 
%s' % e) - return (None, None) - else: - score_reasons.append(_('Cache filepath does not exist: "%s".') % filepath) + log.debug("%s not found on disk, retrieving from URL %s", + filepath, archival.cache_url) + try: + filepath = _download_url(archival.cache_url).name + delete_file = True + except Exception as e: + score_reasons.append(_('A system error occurred during downloading this file') + '. %s' % e) return (None, None) + if filepath: sniffed_format = sniff_file_format(filepath) if delete_file: From 9e4e47ee523f1d2e7aa26dea02e0e9eccf37064d Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Thu, 23 Jul 2020 16:19:02 +1000 Subject: [PATCH 03/28] [DQL2-6] ensure that we try to clean up the temporary file even on error --- ckanext/qa/tasks.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index b3ba6d6b..f7515db0 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -377,12 +377,14 @@ def score_by_sniffing_data(archival, resource, score_reasons): return (None, None) if filepath: - sniffed_format = sniff_file_format(filepath) - if delete_file: - try: - os.remove(filepath) - except OSError as e: - log.warn("Unable to remove temporary file %s: %s", filepath, e) + try: + sniffed_format = sniff_file_format(filepath) + finally: + if delete_file: + try: + os.remove(filepath) + except OSError as e: + log.warn("Unable to remove temporary file %s: %s", filepath, e) score = lib.resource_format_scores().get(sniffed_format['format']) \ if sniffed_format else None if sniffed_format: From d02993ccad8a4748604a514be072de97df0fb4dd Mon Sep 17 00:00:00 2001 From: william dutton Date: Thu, 30 Jul 2020 11:34:22 +1000 Subject: [PATCH 04/28] Use utc timezone so storage and helper functions work correctly --- ckanext/qa/bin/migrate_task_status.py | 3 ++- ckanext/qa/bin/running_stats.py | 6 ++++-- ckanext/qa/model.py | 5 +++-- ckanext/qa/tasks.py | 4 +++- 4 files changed, 12 insertions(+), 6 deletions(-) diff 
--git a/ckanext/qa/bin/migrate_task_status.py b/ckanext/qa/bin/migrate_task_status.py index f57b1bf5..71a254f7 100644 --- a/ckanext/qa/bin/migrate_task_status.py +++ b/ckanext/qa/bin/migrate_task_status.py @@ -12,6 +12,7 @@ import datetime import common +import pytz from running_stats import StatsList # pip install 'ProgressBar==2.3' @@ -19,7 +20,7 @@ START_OF_TIME = datetime.datetime(1980, 1, 1) END_OF_TIME = datetime.datetime(9999, 12, 31) -TODAY = datetime.datetime.now() +TODAY = datetime.datetime.now(tzinfo=pytz.utc) # NB put no CKAN imports here, or logging breaks diff --git a/ckanext/qa/bin/running_stats.py b/ckanext/qa/bin/running_stats.py index 947797aa..53abd07c 100644 --- a/ckanext/qa/bin/running_stats.py +++ b/ckanext/qa/bin/running_stats.py @@ -35,6 +35,8 @@ import copy import datetime +import pytz + class StatsCount(dict): # {category:count} @@ -42,7 +44,7 @@ class StatsCount(dict): report_value_limit = 150 def __init__(self, *args, **kwargs): - self._start_time = datetime.datetime.now() + self._start_time = datetime.datetime.now(tzinfo=pytz.utc) super(StatsCount, self).__init__(*args, **kwargs) def _init_category(self, category): @@ -80,7 +82,7 @@ def report(self, indent=1, order_by_title=False, show_time_taken=True): lines = [indent_str + 'None'] if show_time_taken: - time_taken = datetime.datetime.now() - self._start_time + time_taken = datetime.datetime.now(tzinfo=pytz.utc) - self._start_time lines.append(indent_str + 'Time taken (h:m:s): %s' % time_taken) return '\n'.join(lines) diff --git a/ckanext/qa/model.py b/ckanext/qa/model.py index 9e6b97a1..4fc4a801 100644 --- a/ckanext/qa/model.py +++ b/ckanext/qa/model.py @@ -1,6 +1,7 @@ import uuid import datetime +import pytz from sqlalchemy import Column from sqlalchemy import types from sqlalchemy.ext.declarative import declarative_base @@ -35,8 +36,8 @@ class QA(Base): openness_score_reason = Column(types.UnicodeText) format = Column(types.UnicodeText) - created = Column(types.DateTime, 
default=datetime.datetime.now) - updated = Column(types.DateTime, default=datetime.datetime.now) + created = Column(types.DateTime, default=datetime.datetime.now(tzinfo=pytz.utc)) + updated = Column(types.DateTime, default=datetime.datetime.now(tzinfo=pytz.utc)) def __repr__(self): summary = 'score=%s format=%s' % (self.openness_score, self.format) diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index f7515db0..3922b740 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -9,6 +9,8 @@ import tempfile import time import traceback + +import pytz import urlparse import routes @@ -606,7 +608,7 @@ def save_qa_result(resource, qa_result): import ckan.model as model from ckanext.qa.model import QA - now = datetime.datetime.now() + now = datetime.datetime.now(tzinfo=pytz.utc) qa = QA.get_for_resource(resource.id) if not qa: From ad18da232de717f1557e57c9f840252a8de7702c Mon Sep 17 00:00:00 2001 From: william dutton Date: Thu, 30 Jul 2020 12:01:46 +1000 Subject: [PATCH 05/28] Use utc timezone so storage and helper functions work correctly --- ckanext/qa/model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ckanext/qa/model.py b/ckanext/qa/model.py index 4fc4a801..160b0ad4 100644 --- a/ckanext/qa/model.py +++ b/ckanext/qa/model.py @@ -1,7 +1,6 @@ import uuid import datetime -import pytz from sqlalchemy import Column from sqlalchemy import types from sqlalchemy.ext.declarative import declarative_base @@ -36,8 +35,8 @@ class QA(Base): openness_score_reason = Column(types.UnicodeText) format = Column(types.UnicodeText) - created = Column(types.DateTime, default=datetime.datetime.now(tzinfo=pytz.utc)) - updated = Column(types.DateTime, default=datetime.datetime.now(tzinfo=pytz.utc)) + created = Column(types.DateTime, default=datetime.datetime.utcnow) + updated = Column(types.DateTime, default=datetime.datetime.utcnow) def __repr__(self): summary = 'score=%s format=%s' % (self.openness_score, self.format) From 
3529c722a9c53beaf380f919646803bdbf049c97 Mon Sep 17 00:00:00 2001 From: william dutton Date: Thu, 30 Jul 2020 13:19:44 +1000 Subject: [PATCH 06/28] Use utc timezone so storage and helper functions work correctly --- ckanext/qa/bin/migrate_task_status.py | 2 +- ckanext/qa/bin/running_stats.py | 4 ++-- ckanext/qa/tasks.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/qa/bin/migrate_task_status.py b/ckanext/qa/bin/migrate_task_status.py index 71a254f7..7b58e83c 100644 --- a/ckanext/qa/bin/migrate_task_status.py +++ b/ckanext/qa/bin/migrate_task_status.py @@ -20,7 +20,7 @@ START_OF_TIME = datetime.datetime(1980, 1, 1) END_OF_TIME = datetime.datetime(9999, 12, 31) -TODAY = datetime.datetime.now(tzinfo=pytz.utc) +TODAY = datetime.datetime.now(tz=pytz.utc) # NB put no CKAN imports here, or logging breaks diff --git a/ckanext/qa/bin/running_stats.py b/ckanext/qa/bin/running_stats.py index 53abd07c..c6e6d538 100644 --- a/ckanext/qa/bin/running_stats.py +++ b/ckanext/qa/bin/running_stats.py @@ -44,7 +44,7 @@ class StatsCount(dict): report_value_limit = 150 def __init__(self, *args, **kwargs): - self._start_time = datetime.datetime.now(tzinfo=pytz.utc) + self._start_time = datetime.datetime.now(tz=pytz.utc) super(StatsCount, self).__init__(*args, **kwargs) def _init_category(self, category): @@ -82,7 +82,7 @@ def report(self, indent=1, order_by_title=False, show_time_taken=True): lines = [indent_str + 'None'] if show_time_taken: - time_taken = datetime.datetime.now(tzinfo=pytz.utc) - self._start_time + time_taken = datetime.datetime.now(tz=pytz.utc) - self._start_time lines.append(indent_str + 'Time taken (h:m:s): %s' % time_taken) return '\n'.join(lines) diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index 3922b740..5118276f 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -608,7 +608,7 @@ def save_qa_result(resource, qa_result): import ckan.model as model from ckanext.qa.model import QA - now = 
datetime.datetime.now(tzinfo=pytz.utc) + now = datetime.datetime.now(tz=pytz.utc) qa = QA.get_for_resource(resource.id) if not qa: From 41c108de9e03c4c79e5ac17a812a4fb0da672c12 Mon Sep 17 00:00:00 2001 From: william dutton Date: Thu, 30 Jul 2020 13:36:03 +1000 Subject: [PATCH 07/28] all internal datetimes in ckan are utc --- ckanext/qa/bin/migrate_task_status.py | 3 +-- ckanext/qa/bin/running_stats.py | 7 ++----- ckanext/qa/tasks.py | 3 +-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/ckanext/qa/bin/migrate_task_status.py b/ckanext/qa/bin/migrate_task_status.py index 7b58e83c..f9c7d59e 100644 --- a/ckanext/qa/bin/migrate_task_status.py +++ b/ckanext/qa/bin/migrate_task_status.py @@ -12,7 +12,6 @@ import datetime import common -import pytz from running_stats import StatsList # pip install 'ProgressBar==2.3' @@ -20,7 +19,7 @@ START_OF_TIME = datetime.datetime(1980, 1, 1) END_OF_TIME = datetime.datetime(9999, 12, 31) -TODAY = datetime.datetime.now(tz=pytz.utc) +TODAY = datetime.datetime.utcnow() # NB put no CKAN imports here, or logging breaks diff --git a/ckanext/qa/bin/running_stats.py b/ckanext/qa/bin/running_stats.py index c6e6d538..f4abe5ba 100644 --- a/ckanext/qa/bin/running_stats.py +++ b/ckanext/qa/bin/running_stats.py @@ -35,16 +35,13 @@ import copy import datetime -import pytz - - class StatsCount(dict): # {category:count} _init_value = 0 report_value_limit = 150 def __init__(self, *args, **kwargs): - self._start_time = datetime.datetime.now(tz=pytz.utc) + self._start_time = datetime.datetime.utcnow() super(StatsCount, self).__init__(*args, **kwargs) def _init_category(self, category): @@ -82,7 +79,7 @@ def report(self, indent=1, order_by_title=False, show_time_taken=True): lines = [indent_str + 'None'] if show_time_taken: - time_taken = datetime.datetime.now(tz=pytz.utc) - self._start_time + time_taken = datetime.datetime.utcnow() - self._start_time lines.append(indent_str + 'Time taken (h:m:s): %s' % time_taken) return '\n'.join(lines) 
diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index 5118276f..538d7b43 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -10,7 +10,6 @@ import time import traceback -import pytz import urlparse import routes @@ -608,7 +607,7 @@ def save_qa_result(resource, qa_result): import ckan.model as model from ckanext.qa.model import QA - now = datetime.datetime.now(tz=pytz.utc) + now = datetime.datetime.utcnow() qa = QA.get_for_resource(resource.id) if not qa: From fefb61d1918b6948e575312bb2416d31df01c7d2 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Thu, 11 Feb 2021 12:57:49 +1000 Subject: [PATCH 08/28] [QOL-6491] enable file downloads to go through a proxy if needed --- ckanext/qa/tasks.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index 538d7b43..d96c954e 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -470,13 +470,12 @@ def _download_url(url): def get_response(url, headers): def get_url(): - return requests.get( - url, - headers=headers, - timeout=DOWNLOAD_TIMEOUT, - verify=SSL_VERIFY, - stream=True, # just gets the headers for now - ) + kwargs = {'headers': headers, 'timeout': DOWNLOAD_TIMEOUT, + 'verify': SSL_VERIFY, 'stream': True} # just gets the headers for now + if 'ckan.download_proxy' in config: + proxy = config.get('ckan.download_proxy') + kwargs['proxies'] = {'http': proxy, 'https': proxy} + return requests.get(url, **kwargs) response = get_url() if response.status_code == 202: # Seen: https://data-cdfw.opendata.arcgis.com/datasets From dc4430b978b83e2748146dae38ac8ab87181d6a6 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Thu, 11 Feb 2021 14:03:11 +1000 Subject: [PATCH 09/28] [QOL-6491] cleanup - add Flake8 config and make it pass - use 'six' to prepare for Python 3 --- .flake8 | 20 +++++++++ ckanext/qa/bin/common.py | 2 +- ckanext/qa/bin/migrate_task_status.py | 10 ++--- ckanext/qa/bin/running_stats.py | 9 ++-- ckanext/qa/commands.py | 57 
++++++++++++------------- ckanext/qa/controllers.py | 2 +- ckanext/qa/lib.py | 4 +- ckanext/qa/logic/action.py | 2 +- ckanext/qa/model.py | 5 ++- ckanext/qa/plugin.py | 6 +-- ckanext/qa/reports.py | 10 ++--- ckanext/qa/sniff_format.py | 58 ++++++++++++++------------ ckanext/qa/tasks.py | 30 ++++++------- ckanext/qa/tests/fake_ckan.py | 4 +- ckanext/qa/tests/mock_remote_server.py | 9 ++-- ckanext/qa/tests/test_link_checker.py | 4 +- ckanext/qa/tests/test_sniff_format.py | 4 +- ckanext/qa/tests/test_tasks.py | 8 ++-- 18 files changed, 136 insertions(+), 108 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..a89a787b --- /dev/null +++ b/.flake8 @@ -0,0 +1,20 @@ +[flake8] +# @see https://flake8.pycqa.org/en/latest/user/configuration.html?highlight=.flake8 + +exclude = + ckan + scripts + +# Extended output format. +format = pylint + +# Show the source of errors. +show_source = True + +max-complexity = 10 + +# List ignore rules one per line. +ignore = + E501 + C901 + W503 diff --git a/ckanext/qa/bin/common.py b/ckanext/qa/bin/common.py index 0ace784e..75e884fc 100644 --- a/ckanext/qa/bin/common.py +++ b/ckanext/qa/bin/common.py @@ -48,5 +48,5 @@ def get_resources(state='active', publisher_ref=None, resource_id=None, dataset_ resources = resources.filter(model.Resource.id == resource_id) criteria.append('Resource:%s' % resource_id) resources = resources.all() - print '%i resources (%s)' % (len(resources), ' '.join(criteria)) + print('%i resources (%s)' % (len(resources), ' '.join(criteria))) return resources diff --git a/ckanext/qa/bin/migrate_task_status.py b/ckanext/qa/bin/migrate_task_status.py index f9c7d59e..125190b1 100644 --- a/ckanext/qa/bin/migrate_task_status.py +++ b/ckanext/qa/bin/migrate_task_status.py @@ -59,7 +59,7 @@ def migrate(options): # time, so some timezone nonesense going on. Can't do much. 
archival = Archival.get_for_resource(res.id) if not archival: - print add_stat('QA but no Archival data', res, stats) + print(add_stat('QA but no Archival data', res, stats)) continue archival_date = archival.updated # the state of the resource was as it was archived on the date of @@ -112,10 +112,10 @@ def migrate(options): model.Session.add(qa) add_stat('Added to QA table', res, stats) - print 'Summary\n', stats.report() + print('Summary\n', stats.report()) if options.write: model.repo.commit_and_remove() - print 'Written' + print('Written') def add_stat(outcome, res, stats, extra_info=None): @@ -154,10 +154,10 @@ def date_str_to_datetime_or_none(date_str): if len(args) != 1: parser.error('Wrong number of arguments (%i)' % len(args)) config_ini = args[0] - print 'Loading CKAN config...' + print('Loading CKAN config...') common.load_config(config_ini) common.register_translator() - print 'Done' + print('Done') # Setup logging to print debug out for local only rootLogger = logging.getLogger() rootLogger.setLevel(logging.WARNING) diff --git a/ckanext/qa/bin/running_stats.py b/ckanext/qa/bin/running_stats.py index f4abe5ba..fbb0a635 100644 --- a/ckanext/qa/bin/running_stats.py +++ b/ckanext/qa/bin/running_stats.py @@ -14,7 +14,7 @@ package_stats.increment('deleted') else: package_stats.increment('not deleted') -print package_stats.report() +print(package_stats.report()) > deleted: 30 > not deleted: 70 @@ -26,7 +26,7 @@ package_stats.add('deleted', package.name) else: package_stats.add('not deleted' package.name) -print package_stats.report() +print(package_stats.report()) > deleted: 30 pollution-uk, flood-regions, river-quality, ... > not deleted: 70 spending-bristol, ... 
@@ -35,6 +35,7 @@ import copy import datetime + class StatsCount(dict): # {category:count} _init_value = 0 @@ -109,6 +110,6 @@ def report_value(self, category): package_stats.add('Success', 'good3') package_stats.add('Success', 'good4') package_stats.add('Failure', 'bad1') - print package_stats.report() + print(package_stats.report()) - print StatsList().report() + print(StatsList().report()) diff --git a/ckanext/qa/commands.py b/ckanext/qa/commands.py index 992fb0cd..d0b0b5ea 100644 --- a/ckanext/qa/commands.py +++ b/ckanext/qa/commands.py @@ -1,4 +1,5 @@ import logging +import six import sys from sqlalchemy import or_ @@ -65,7 +66,7 @@ def command(self): Parse command line arguments and call appropriate method. """ if not self.args or self.args[0] in ['--help', '-h', 'help']: - print QACommand.__doc__ + print(QACommand.__doc__) return cmd = self.args[0] @@ -177,44 +178,44 @@ def sniff(self): from ckanext.qa.sniff_format import sniff_file_format if len(self.args) < 2: - print 'Not enough arguments', self.args + print('Not enough arguments', self.args) sys.exit(1) for filepath in self.args[1:]: format_ = sniff_file_format( filepath, logging.getLogger('ckanext.qa.sniffer')) if format_: - print 'Detected as: %s - %s' % (format_['display_name'], - filepath) + print('Detected as: %s - %s' % (format_['display_name'], + filepath)) else: - print 'ERROR: Could not recognise format of: %s' % filepath + print('ERROR: Could not recognise format of: %s' % filepath) def view(self, package_ref=None): from ckan import model q = model.Session.query(model.TaskStatus).filter_by(task_type='qa') - print 'QA records - %i TaskStatus rows' % q.count() - print ' across %i Resources' % q.distinct('entity_id').count() + print('QA records - %i TaskStatus rows' % q.count()) + print(' across %i Resources' % q.distinct('entity_id').count()) if package_ref: pkg = model.Package.get(package_ref) - print 'Package %s %s' % (pkg.name, pkg.id) + print('Package %s %s' % (pkg.name, pkg.id)) for res in 
pkg.resources: - print 'Resource %s' % res.id + print('Resource %s' % res.id) for row in q.filter_by(entity_id=res.id): - print '* %s = %r error=%r' % (row.key, row.value, - row.error) + print('* %s = %r error=%r' % (row.key, row.value, + row.error)) def clean(self): from ckan import model - print 'Before:' + print('Before:') self.view() q = model.Session.query(model.TaskStatus).filter_by(task_type='qa') q.delete() model.Session.commit() - print 'After:' + print('After:') self.view() def migrate1(self): @@ -223,32 +224,32 @@ def migrate1(self): q_status = model.Session.query(model.TaskStatus) \ .filter_by(task_type='qa') \ .filter_by(key='status') - print '* %s with "status" will be deleted e.g. %s' % (q_status.count(), - q_status.first()) + print('* %s with "status" will be deleted e.g. %s' % (q_status.count(), + q_status.first())) q_failures = model.Session.query(model.TaskStatus) \ .filter_by(task_type='qa') \ .filter_by(key='openness_score_failure_count') - print '* %s with openness_score_failure_count to be deleted e.g.\n%s'\ - % (q_failures.count(), q_failures.first()) + print('* %s with openness_score_failure_count to be deleted e.g.\n%s' + % (q_failures.count(), q_failures.first())) q_score = model.Session.query(model.TaskStatus) \ .filter_by(task_type='qa') \ .filter_by(key='openness_score') - print '* %s with openness_score to migrate e.g.\n%s' % \ - (q_score.count(), q_score.first()) + print('* %s with openness_score to migrate e.g.\n%s' % + (q_score.count(), q_score.first())) q_reason = model.Session.query(model.TaskStatus) \ .filter_by(task_type='qa') \ .filter_by(key='openness_score_reason') - print '* %s with openness_score_reason to migrate e.g.\n%s' % \ - (q_reason.count(), q_reason.first()) - raw_input('Press Enter to continue') + print('* %s with openness_score_reason to migrate e.g.\n%s' % + (q_reason.count(), q_reason.first())) + six.input('Press Enter to continue') q_status.delete() model.Session.commit() - print '..."status" deleted' + 
print('..."status" deleted') q_failures.delete() model.Session.commit() - print '..."openness_score_failure_count" deleted' + print('..."openness_score_failure_count" deleted') for task_status in q_score: reason_task_status = q_reason \ @@ -265,15 +266,15 @@ def migrate1(self): 'reason': reason, 'format': None, 'is_broken': None, - }) + }) model.Session.commit() - print '..."openness_score" and "openness_score_reason" migrated' + print('..."openness_score" and "openness_score_reason" migrated') count = q_reason.count() q_reason.delete() model.Session.commit() - print '... %i remaining "openness_score_reason" deleted' % count + print('... %i remaining "openness_score_reason" deleted' % count) model.Session.flush() model.Session.remove() - print 'Migration succeeded' + print('Migration succeeded') diff --git a/ckanext/qa/controllers.py b/ckanext/qa/controllers.py index 493eed7f..4cedcbb5 100644 --- a/ckanext/qa/controllers.py +++ b/ckanext/qa/controllers.py @@ -102,7 +102,7 @@ def _check_link(self, url): result['mimetype'] = self._extract_mimetype(headers) result['size'] = headers.get('content-length', '') result['last_modified'] = self._parse_and_format_date(headers.get('last-modified', '')) - except LinkCheckerError, e: + except LinkCheckerError as e: result['url_errors'].append(str(e)) return result diff --git a/ckanext/qa/lib.py b/ckanext/qa/lib.py index 2113badd..712a8741 100644 --- a/ckanext/qa/lib.py +++ b/ckanext/qa/lib.py @@ -55,7 +55,7 @@ def resource_format_scores(): with open(json_filepath) as format_file: try: file_resource_formats = json.loads(format_file.read()) - except ValueError, e: + except ValueError as e: # includes simplejson.decoder.JSONDecodeError raise ValueError('Invalid JSON syntax in %s: %s' % (json_filepath, e)) @@ -90,7 +90,7 @@ def create_qa_update_package_task(package, queue): from pylons import config ckan_ini_filepath = os.path.abspath(config.__file__) - compat_enqueue('qa.update_package', tasks.update_package, queue, 
args=[ckan_ini_filepath, package.id]) + compat_enqueue('qa.update_package', tasks.update_package, queue, args=[ckan_ini_filepath, package.id]) log.debug('QA of package put into celery queue %s: %s', queue, package.name) diff --git a/ckanext/qa/logic/action.py b/ckanext/qa/logic/action.py index 8914c670..e176a7d4 100644 --- a/ckanext/qa/logic/action.py +++ b/ckanext/qa/logic/action.py @@ -30,7 +30,7 @@ def qa_resource_show(context, data_dict): 'name': pkg.name, 'title': pkg.title, 'id': res.id - } + } return_dict['archival'] = archival.as_dict() return_dict.update(qa.as_dict()) return return_dict diff --git a/ckanext/qa/model.py b/ckanext/qa/model.py index 160b0ad4..94eafdc7 100644 --- a/ckanext/qa/model.py +++ b/ckanext/qa/model.py @@ -1,5 +1,6 @@ import uuid import datetime +import six from sqlalchemy import Column from sqlalchemy import types @@ -15,7 +16,7 @@ def make_uuid(): - return unicode(uuid.uuid4()) + return six.text_type(uuid.uuid4()) class QA(Base): @@ -40,7 +41,7 @@ class QA(Base): def __repr__(self): summary = 'score=%s format=%s' % (self.openness_score, self.format) - details = unicode(self.openness_score_reason).encode('unicode_escape') + details = six.text_type(self.openness_score_reason).encode('unicode_escape') package = model.Package.get(self.package_id) package_name = package.name if package else '?%s?' 
% self.package_id return '' % \ diff --git a/ckanext/qa/plugin.py b/ckanext/qa/plugin.py index 876459d1..cfd92766 100644 --- a/ckanext/qa/plugin.py +++ b/ckanext/qa/plugin.py @@ -67,7 +67,7 @@ def get_actions(self): return { 'qa_resource_show': action.qa_resource_show, 'qa_package_openness_show': action.qa_package_openness_show, - } + } # IAuthFunctions @@ -75,7 +75,7 @@ def get_auth_functions(self): return { 'qa_resource_show': auth.qa_resource_show, 'qa_package_openness_show': auth.qa_package_openness_show, - } + } # ITemplateHelpers @@ -85,7 +85,7 @@ def get_helpers(self): helpers.qa_openness_stars_resource_html, 'qa_openness_stars_dataset_html': helpers.qa_openness_stars_dataset_html, - } + } # IPackageController diff --git a/ckanext/qa/reports.py b/ckanext/qa/reports.py index c50b56de..9da09a64 100644 --- a/ckanext/qa/reports.py +++ b/ckanext/qa/reports.py @@ -72,7 +72,7 @@ def openness_index(include_sub_organizations=False): table = [] for org_name, org_counts in results.iteritems(): - total_stars = sum([k*v for k, v in org_counts['score_counts'].items() if k]) + total_stars = sum([k * v for k, v in org_counts['score_counts'].items() if k]) num_pkgs_scored = sum([v for k, v in org_counts['score_counts'].items() if k is not None]) average_stars = round(float(total_stars) / num_pkgs_scored, 1) \ @@ -82,7 +82,7 @@ def openness_index(include_sub_organizations=False): ('organization_name', org_name), ('total_stars', total_stars), ('average_stars', average_stars), - )) + )) row.update(jsonify_counter(org_counts['score_counts'])) table.append(row) @@ -136,10 +136,10 @@ def openness_for_organization(organization=None, include_sub_organizations=False ('organization_title', org.title), ('openness_score', qa['openness_score']), ('openness_score_reason', qa['openness_score_reason']), - ))) + ))) score_counts[qa['openness_score']] += 1 - total_stars = sum([k*v for k, v in score_counts.items() if k]) + total_stars = sum([k * v for k, v in score_counts.items() if k]) 
num_pkgs_with_stars = sum([v for k, v in score_counts.items() if k is not None]) average_stars = round(float(total_stars) / num_pkgs_with_stars, 1) \ @@ -172,7 +172,7 @@ def openness_report_combinations(): 'option_combinations': openness_report_combinations, 'generate': openness_report, 'template': 'report/openness.html', - } +} def jsonify_counter(counter): diff --git a/ckanext/qa/sniff_format.py b/ckanext/qa/sniff_format.py index 856447fa..13b4dc6e 100644 --- a/ckanext/qa/sniff_format.py +++ b/ckanext/qa/sniff_format.py @@ -1,7 +1,9 @@ +# encoding: utf-8 import re import zipfile import os from collections import defaultdict +import six import subprocess import StringIO @@ -16,6 +18,7 @@ log = logging.getLogger(__name__) + def sniff_file_format(filepath): '''For a given filepath, work out what file format it is. @@ -33,12 +36,13 @@ def sniff_file_format(filepath): ''' format_ = None log.info('Sniffing file format of: %s', filepath) - filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \ + filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, six.text_types) \ else filepath mime_type = magic.from_file(filepath_utf8, mime=True) log.info('Magic detects file as: %s', mime_type) if mime_type: - if mime_type == 'application/xml': + # some operating systems magic mime xml as text/xml + if mime_type == 'application/xml' or mime_type == 'text/xml': with open(filepath) as f: buf = f.read(5000) format_ = get_xml_variant_including_xml_declaration(buf) @@ -139,14 +143,14 @@ def is_json(buf): JSON format.''' string = '"[^"]*"' string_re = re.compile(string) - number_re = re.compile('-?\d+(\.\d+)?([eE][+-]?\d+)?') - extra_values_re = re.compile('true|false|null') - object_start_re = re.compile('{%s:\s?' % string) - object_middle_re = re.compile('%s:\s?' 
% string) - object_end_re = re.compile('}') - comma_re = re.compile(',\s?') - array_start_re = re.compile('\[') - array_end_re = re.compile('\]') + number_re = re.compile(r'-?\d+(\.\d+)?([eE][+-]?\d+)?') + extra_values_re = re.compile(r'true|false|null') + object_start_re = re.compile(r'{%s:\s?' % string) + object_middle_re = re.compile(r'%s:\s?' % string) + object_end_re = re.compile(r'}') + comma_re = re.compile(r',\s?') + array_start_re = re.compile(r'\[') + array_end_re = re.compile(r'\]') any_value_regexs = [string_re, number_re, object_start_re, array_start_re, extra_values_re] # simplified state machine - just looks at stack of object/array and @@ -256,7 +260,7 @@ def get_cells_per_row(num_cells, num_rows): def is_html(buf): '''If this buffer is HTML, return that format type, else None.''' - xml_re = '.{0,3}\s*(<\?xml[^>]*>\s*)?(]*>\s*)?]*>' + xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(]*>\s*)?]*>' match = re.match(xml_re, buf, re.IGNORECASE) if match: log.info('HTML tag detected') @@ -266,7 +270,7 @@ def is_html(buf): def is_iati(buf): '''If this buffer is IATI format, return that format type, else None.''' - xml_re = '.{0,3}\s*(<\?xml[^>]*>\s*)?(]*>\s*)?]*>' + xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(]*>\s*)?]*>' match = re.match(xml_re, buf, re.IGNORECASE) if match: log.info('IATI tag detected') @@ -277,13 +281,13 @@ def is_iati(buf): def is_xml_but_without_declaration(buf): '''Decides if this is a buffer of XML, but missing the usual tag.''' - xml_re = '.{0,3}\s*(<\?xml[^>]*>\s*)?(]*>\s*)?<([^>\s]*)([^>]*)>' + xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(]*>\s*)?<([^>\s]*)([^>]*)>' match = re.match(xml_re, buf, re.IGNORECASE) if match: top_level_tag_name, top_level_tag_attributes = match.groups()[-2:] - if 'xmlns:' not in top_level_tag_attributes and \ - (len(top_level_tag_name) > 20 or - len(top_level_tag_attributes) > 200): + if ('xmlns:' not in top_level_tag_attributes + and (len(top_level_tag_name) > 20 + or len(top_level_tag_attributes) > 200)): log.debug('Not 
XML (without declaration) - unlikely length first tag: <%s %s>', top_level_tag_name, top_level_tag_attributes) return False @@ -318,9 +322,9 @@ def start_element(name, attrs): p.StartElementHandler = start_element try: p.Parse(buf) - except GotFirstTag, e: - top_level_tag_name = str(e).lower() - except xml.sax.SAXException, e: + except GotFirstTag as e: + top_level_tag_name = six.text_type(e).lower() + except xml.sax.SAXException as e: log.info('Sax parse error: %s %s', e, buf) return {'format': 'XML'} @@ -354,8 +358,8 @@ def has_rdfa(buf): return False # more rigorous check for them as tag attributes - about_re = '<[^>]+\sabout="[^"]+"[^>]*>' - property_re = '<[^>]+\sproperty="[^"]+"[^>]*>' + about_re = r'<[^>]+\sabout="[^"]+"[^>]*>' + property_re = r'<[^>]+\sproperty="[^"]+"[^>]*>' # remove CR to catch tags spanning more than one line # buf = re.sub('\r\n', ' ', buf) if not re.search(about_re, buf): @@ -381,11 +385,11 @@ def get_zipped_format(filepath): filepaths = zip.namelist() finally: zip.close() - except zipfile.BadZipfile, e: + except zipfile.BadZipfile as e: log.info('Zip file open raised error %s: %s', e, e.args) return - except Exception, e: + except Exception as e: log.warning('Zip file open raised exception %s: %s', e, e.args) return @@ -438,7 +442,7 @@ def get_zipped_format(filepath): def is_excel(filepath): try: xlrd.open_workbook(filepath) - except Exception, e: + except Exception as e: log.info('Not Excel - failed to load: %s %s', e, e.args) return False else: @@ -534,12 +538,12 @@ def turtle_regex(): ''' global turtle_regex_ if not turtle_regex_: - rdf_term = '(<[^ >]+>|_:\S+|".+?"(@\w+)?(\^\^\S+)?|\'.+?\'(@\w+)?(\^\^\S+)?|""".+?"""(@\w+)' \ - '?(\^\^\S+)?|\'\'\'.+?\'\'\'(@\w+)?(\^\^\S+)?|[+-]?([0-9]+|[0-9]*\.[0-9]+)(E[+-]?[0-9]+)?|false|true)' + rdf_term = r'(<[^ >]+>|_:\S+|".+?"(@\w+)?(\^\^\S+)?|\'.+?\'(@\w+)?(\^\^\S+)?|""".+?"""(@\w+)' \ + r'?(\^\^\S+)?|\'\'\'.+?\'\'\'(@\w+)?(\^\^\S+)?|[+-]?([0-9]+|[0-9]*\.[0-9]+)(E[+-]?[0-9]+)?|false|true)' # 
simple case is: triple_re = '^T T T \.$'.replace('T', rdf_term) # but extend to deal with multiple predicate-objects: # triple = '^T T T\s*(;\s*T T\s*)*\.\s*$'.replace('T', rdf_term).replace(' ', '\s+') - triple = '(^T|;)\s*T T\s*(;|\.\s*$)'.replace('T', rdf_term).replace(' ', '\s+') + triple = r'(^T|;)\s*T T\s*(;|\.\s*$)'.replace('T', rdf_term).replace(' ', r'\s+') turtle_regex_ = re.compile(triple, re.MULTILINE) return turtle_regex_ diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index d96c954e..a9a857e8 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -6,6 +6,7 @@ import json import math import os +import six import tempfile import time import traceback @@ -129,9 +130,9 @@ def update_package(ckan_ini_filepath, package_id): try: update_package_(package_id) - except Exception, e: + except Exception as e: log.error('Exception occurred during QA update_package: %s: %s', - e.__class__.__name__, unicode(e)) + e.__class__.__name__, e) raise @@ -168,9 +169,9 @@ def update(ckan_ini_filepath, resource_id): load_config(ckan_ini_filepath) try: update_resource_(resource_id) - except Exception, e: + except Exception as e: log.error('Exception occurred during QA update_resource: %s: %s', - e.__class__.__name__, unicode(e)) + e.__class__.__name__, e) raise @@ -267,10 +268,10 @@ def resource_score(resource): format_ = get_qa_format(resource.id) score_reason = ' '.join(score_reasons) format_ = format_ or None - except Exception, e: + except Exception as e: log.error('Unexpected error while calculating openness score %s: %s\nException: %s', - e.__class__.__name__, unicode(e), traceback.format_exc()) - score_reason = _("Unknown error: %s") % str(e) + e.__class__.__name__, e, traceback.format_exc()) + score_reason = _("Unknown error: %s") % e raise # Even if we can get the link, we should still treat the resource @@ -310,7 +311,7 @@ def format_date(date): else: return '' messages = [_('File could not be downloaded.'), - _('Reason') + ':', unicode(archival.status) 
+ '.', + _('Reason') + ':', six.text_type(archival.status) + '.', _('Error details: %s.') % archival.reason, _('Attempted on %s.') % format_date(archival.updated)] last_success = format_date(archival.last_success) @@ -423,7 +424,6 @@ def _download_url(url): log.info('Fetching from: {0}'.format(url)) tmp_file = get_tmp_file(url) length = 0 - cl = None try: headers = {} response = get_response(url, headers) @@ -441,16 +441,16 @@ def _download_url(url): log.debug('HTTP error: {}'.format(error)) tmp_file.close() os.remove(tmp_file.name) - raise HTTPError( - "Received a bad HTTP response when trying to download " - "the data file", status_code=error.response.status_code, + raise requests.exceptions.HTTPError( + "Received a bad HTTP response when trying to download the data file", + status_code=error.response.status_code, request_url=url, response=error) except requests.exceptions.Timeout: log.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT)) tmp_file.close() os.remove(tmp_file.name) raise IOError('Connection timed out after {}s'.format( - DOWNLOAD_TIMEOUT)) + DOWNLOAD_TIMEOUT)) except requests.exceptions.RequestException as e: try: err_message = str(e.reason) @@ -459,7 +459,7 @@ def _download_url(url): log.warning('URL error: {}'.format(err_message)) tmp_file.close() os.remove(tmp_file.name) - raise HTTPError( + raise requests.exceptions.HTTPError( message=err_message, status_code=None, request_url=url, response=None) @@ -471,7 +471,7 @@ def _download_url(url): def get_response(url, headers): def get_url(): kwargs = {'headers': headers, 'timeout': DOWNLOAD_TIMEOUT, - 'verify': SSL_VERIFY, 'stream': True} # just gets the headers for now + 'verify': SSL_VERIFY, 'stream': True} # just gets the headers for now if 'ckan.download_proxy' in config: proxy = config.get('ckan.download_proxy') kwargs['proxies'] = {'http': proxy, 'https': proxy} diff --git a/ckanext/qa/tests/fake_ckan.py b/ckanext/qa/tests/fake_ckan.py index 30b85601..c8434cbc 100644 --- 
a/ckanext/qa/tests/fake_ckan.py +++ b/ckanext/qa/tests/fake_ckan.py @@ -10,12 +10,12 @@ 'last_success': '2008-10-01', 'first_failure': '', 'failure_count': 0, - }), + }), 'stack': '', 'last_updated': '2008-10-10T19:30:37.536836', } } - ) +) request_store = [] task_status = {'archiver': TASK_STATUS_ARCHIVER_OK, diff --git a/ckanext/qa/tests/mock_remote_server.py b/ckanext/qa/tests/mock_remote_server.py index b43fb77d..9c59ed5f 100644 --- a/ckanext/qa/tests/mock_remote_server.py +++ b/ckanext/qa/tests/mock_remote_server.py @@ -7,6 +7,7 @@ from time import sleep from wsgiref.simple_server import make_server import urllib2 +import six import socket @@ -37,7 +38,7 @@ def serve(self, host='localhost', port_range=(8000, 9000)): This uses context manager to make sure the server is stopped:: >>> with MockTestServer().serve() as addr: - ... print urllib2.urlopen('%s/?content=hello+world').read() + ... print(urllib2.urlopen('%s/?content=hello+world').read()) ... 'hello world' """ @@ -80,8 +81,8 @@ def get_content(cls, varspec): called and its return value used. 
""" modpath, var = varspec.split(':') - mod = reduce(getattr, modpath.split('.')[1:], __import__(modpath)) - var = reduce(getattr, var.split('.'), mod) + mod = six.moves.reduce(getattr, modpath.split('.')[1:], __import__(modpath)) + var = six.moves.reduce(getattr, var.split('.'), mod) try: return var() except TypeError: @@ -116,7 +117,7 @@ def __call__(self, environ, start_response): else: content = request.str_params.get('content', '') - if isinstance(content, unicode): + if isinstance(content, six.string_types): raise TypeError("Expected raw byte string for content") headers = [ diff --git a/ckanext/qa/tests/test_link_checker.py b/ckanext/qa/tests/test_link_checker.py index 550a016e..cd8a79a6 100644 --- a/ckanext/qa/tests/test_link_checker.py +++ b/ckanext/qa/tests/test_link_checker.py @@ -121,12 +121,12 @@ def test_colon_in_query_string(self, url): # accept, because browsers accept this # see discussion: http://trac.ckan.org/ticket/318 result = self.check_link(url) - print result + print(result) assert_equal(result['url_errors'], []) @with_mock_url('?status=200 ') def test_trailing_whitespace(self, url): # accept, because browsers accept this result = self.check_link(url) - print result + print(result) assert_equal(result['url_errors'], []) diff --git a/ckanext/qa/tests/test_sniff_format.py b/ckanext/qa/tests/test_sniff_format.py index f7b86577..c3b7f279 100644 --- a/ckanext/qa/tests/test_sniff_format.py +++ b/ckanext/qa/tests/test_sniff_format.py @@ -292,5 +292,5 @@ def test_turtle_regex(): def test_is_ttl__num_triples(): triple = ' ; .' 
- assert not is_ttl('\n'.join([triple]*2)) - assert is_ttl('\n'.join([triple]*5)) + assert not is_ttl('\n'.join([triple] * 2)) + assert is_ttl('\n'.join([triple] * 5)) diff --git a/ckanext/qa/tests/test_tasks.py b/ckanext/qa/tests/test_tasks.py index 4a9bf3bd..9f59f207 100644 --- a/ckanext/qa/tests/test_tasks.py +++ b/ckanext/qa/tests/test_tasks.py @@ -78,7 +78,7 @@ def test_trigger_on_archival(cls): context = {'model': model, 'ignore_auth': True, 'session': model.Session, 'user': 'test'} pkg = {'name': 'testpkg', 'license_id': 'uk-ogl', 'resources': [ {'url': 'http://test.com/', 'format': 'CSV', 'description': 'Test'} - ]} + ]} pkg = get_action('package_create')(context, pkg) resource_dict = pkg['resources'][0] res_id = resource_dict['id'] @@ -304,7 +304,7 @@ def get_qa_result(cls, **kwargs): 'openness_score_reason': 'Detected as CSV which scores 3', 'format': 'CSV', 'archival_timestamp': datetime.datetime(2015, 12, 16), - } + } qa_result.update(kwargs) return qa_result @@ -335,7 +335,7 @@ def test_simple(self): 'url': 'http://example.com/file.csv', 'title': 'Some data', 'format': '', - } + } dataset = ckan_factories.Dataset(resources=[resource]) resource = model.Resource.get(dataset['resources'][0]['id']) @@ -359,7 +359,7 @@ def test_simple(self): 'url': 'http://example.com/file.csv', 'title': 'Some data', 'format': '', - } + } dataset = ckan_factories.Dataset(resources=[resource]) resource = model.Resource.get(dataset['resources'][0]['id']) From c9a605b116d490b3039292cb8e28f9e7eb301570 Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 14:05:24 +1000 Subject: [PATCH 10/28] update requirements - patch xlrd - don't force versions of ckanext-archiver and ckanext-report, just verify they're present --- requirements.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 70da4e40..0c9c70c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,10 @@ -xlrd==1.0.0 
-python-magic==0.4.12 +xlrd==1.1.0 +#python-magic==0.4.15 #in ckancore messytables==0.15.2 progressbar==2.3 +#SQLAlchemy>=0.6.6 #in ckancore +#requests==2.11.1 #in ckancore +six>=1.0.0 #in ckancore + +ckanext-archiver +ckanext-report From ab0889a7f5d317639fc6b59b38f302fdd90185c1 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Thu, 11 Feb 2021 15:49:14 +1000 Subject: [PATCH 11/28] [QOL-6491] fix HTTPError syntax --- ckanext/qa/tasks.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index a9a857e8..1142e6ea 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -442,9 +442,8 @@ def _download_url(url): tmp_file.close() os.remove(tmp_file.name) raise requests.exceptions.HTTPError( - "Received a bad HTTP response when trying to download the data file", - status_code=error.response.status_code, - request_url=url, response=error) + url, error.response.status_code, + "Received a bad HTTP response when trying to download the data file") except requests.exceptions.Timeout: log.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT)) tmp_file.close() @@ -459,9 +458,7 @@ def _download_url(url): log.warning('URL error: {}'.format(err_message)) tmp_file.close() os.remove(tmp_file.name) - raise requests.exceptions.HTTPError( - message=err_message, status_code=None, - request_url=url, response=None) + raise requests.exceptions.HTTPError(url, None, err_message) log.info('Downloaded ok - %s', printable_file_size(length)) tmp_file.seek(0) From ea73fb8618929348d1344d8201789b093f3ed040 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Thu, 11 Feb 2021 16:07:33 +1000 Subject: [PATCH 12/28] [QOL-6491] fix HTTPError argument order --- ckanext/qa/tasks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py index 1142e6ea..c33d612e 100644 --- a/ckanext/qa/tasks.py +++ b/ckanext/qa/tasks.py @@ -442,8 +442,9 @@ def _download_url(url): tmp_file.close() 
os.remove(tmp_file.name) raise requests.exceptions.HTTPError( - url, error.response.status_code, - "Received a bad HTTP response when trying to download the data file") + error.response.status_code, + "Received a bad HTTP response when trying to download the data file", + url) except requests.exceptions.Timeout: log.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT)) tmp_file.close() @@ -458,7 +459,7 @@ def _download_url(url): log.warning('URL error: {}'.format(err_message)) tmp_file.close() os.remove(tmp_file.name) - raise requests.exceptions.HTTPError(url, None, err_message) + raise requests.exceptions.HTTPError(None, err_message, url) log.info('Downloaded ok - %s', printable_file_size(length)) tmp_file.seek(0) From b19afa4cdfa1ac98d55891e55c2100f21725ccd5 Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 12 Feb 2021 09:04:32 +1000 Subject: [PATCH 13/28] [QOL-6491] oops fix six field name --- ckanext/qa/sniff_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/qa/sniff_format.py b/ckanext/qa/sniff_format.py index 13b4dc6e..e1d5869a 100644 --- a/ckanext/qa/sniff_format.py +++ b/ckanext/qa/sniff_format.py @@ -36,7 +36,7 @@ def sniff_file_format(filepath): ''' format_ = None log.info('Sniffing file format of: %s', filepath) - filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, six.text_types) \ + filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, six.string_types) \ else filepath mime_type = magic.from_file(filepath_utf8, mime=True) log.info('Magic detects file as: %s', mime_type) From cf9bc028b97975dc255998cdf6e19b8e3bef09bc Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 12 Feb 2021 12:27:27 +1000 Subject: [PATCH 14/28] sync mock server with the one from ckanext-archiver, including fixing text vs binary typing --- ckanext/qa/tests/mock_remote_server.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/ckanext/qa/tests/mock_remote_server.py 
b/ckanext/qa/tests/mock_remote_server.py index 9c59ed5f..3761a90d 100644 --- a/ckanext/qa/tests/mock_remote_server.py +++ b/ckanext/qa/tests/mock_remote_server.py @@ -97,7 +97,8 @@ class MockEchoTestServer(MockHTTPServer): a 500 error response: 'http://localhost/?status=500' a 200 OK response, returning the function's docstring: - 'http://localhost/?status=200;content-type=text/plain;content_var=ckan.tests.lib.test_package_search:test_wsgi_app.__doc__' + 'http://localhost/?status=200;content-type=text/plain;content_var + =ckan.tests.lib.test_package_search:test_wsgi_app.__doc__' To specify content, use: @@ -114,10 +115,16 @@ def __call__(self, environ, start_response): if 'content_var' in request.str_params: content = request.str_params.get('content_var') content = self.get_content(content) + elif 'content_long' in request.str_params: + content = '*' * 1000001 else: content = request.str_params.get('content', '') + if 'method' in request.str_params \ + and request.method.lower() != request.str_params['method'].lower(): + content = '' + status = 405 - if isinstance(content, six.string_types): + if isinstance(content, six.text_type): raise TypeError("Expected raw byte string for content") headers = [ @@ -125,8 +132,11 @@ def __call__(self, environ, start_response): for item in request.str_params.items() if item[0] not in ('content', 'status') ] - if content: - headers += [('Content-Length', str(len(content)))] + if 'length' in request.str_params: + cl = request.str_params.get('length') + headers += [('Content-Length', cl)] + elif content and 'no-content-length' not in request.str_params: + headers += [('Content-Length', six.binary_type(len(content)))] start_response( '%d %s' % (status, responses[status]), headers From 1454f6e986ebef8bf82b2043f93c9472127d7a6d Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 12 Feb 2021 13:02:35 +1000 Subject: [PATCH 15/28] improve assertion error message --- ckanext/qa/tests/test_sniff_format.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/ckanext/qa/tests/test_sniff_format.py b/ckanext/qa/tests/test_sniff_format.py index c3b7f279..beb7b7b5 100644 --- a/ckanext/qa/tests/test_sniff_format.py +++ b/ckanext/qa/tests/test_sniff_format.py @@ -30,7 +30,7 @@ def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath): '''Given a filepath, checks the sniffed format matches the format_extension.''' expected_format = format_extension sniffed_format = sniff_file_format(filepath) - assert sniffed_format, expected_format + assert sniffed_format, "Expected {} but failed to sniff any format: {}".format(expected_format, sniffed_format) expected_format_without_zip = expected_format.replace('.zip', '') assert_equal(sniffed_format['format'].lower(), expected_format_without_zip) From 3ad03ff59915f7c15ea86a57c54bf38fda76088c Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 12 Feb 2021 13:27:45 +1000 Subject: [PATCH 16/28] improve error messages for testing file sniffing --- ckanext/qa/tests/test_sniff_format.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ckanext/qa/tests/test_sniff_format.py b/ckanext/qa/tests/test_sniff_format.py index beb7b7b5..2d1a35de 100644 --- a/ckanext/qa/tests/test_sniff_format.py +++ b/ckanext/qa/tests/test_sniff_format.py @@ -30,7 +30,7 @@ def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath): '''Given a filepath, checks the sniffed format matches the format_extension.''' expected_format = format_extension sniffed_format = sniff_file_format(filepath) - assert sniffed_format, "Expected {} but failed to sniff any format: {}".format(expected_format, sniffed_format) + assert sniffed_format, "Expected {} but failed to sniff any format for file: {}".format(expected_format, filepath) expected_format_without_zip = expected_format.replace('.zip', '') assert_equal(sniffed_format['format'].lower(), expected_format_without_zip) @@ -49,16 +49,11 @@ def assert_file_has_format_sniffed_correctly(cls, 
format_extension, filepath): def check_format(cls, format, filename=None): for format_extension, filepath in cls.fixture_files: if format_extension == format: - if filename: - if filename in filepath: - break - else: - continue - else: - break + if not filename or filename in filepath: + cls.assert_file_has_format_sniffed_correctly(format_extension, filepath) + break else: assert 0, format # Could not find fixture for format - cls.assert_file_has_format_sniffed_correctly(format_extension, filepath) def test_xls(self): self.check_format('xls', '10-p108-data-results') From 0773ee70dde1d9189a92c8df42b09fa18b25f9ad Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 12 Feb 2021 13:40:53 +1000 Subject: [PATCH 17/28] oops fix loop termination logic --- ckanext/qa/tests/test_sniff_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/qa/tests/test_sniff_format.py b/ckanext/qa/tests/test_sniff_format.py index 2d1a35de..86039bc8 100644 --- a/ckanext/qa/tests/test_sniff_format.py +++ b/ckanext/qa/tests/test_sniff_format.py @@ -51,7 +51,7 @@ def check_format(cls, format, filename=None): if format_extension == format: if not filename or filename in filepath: cls.assert_file_has_format_sniffed_correctly(format_extension, filepath) - break + break else: assert 0, format # Could not find fixture for format From 7533f13ef1eafac353dbd11f3ab1bcb7f3cbec0d Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 12 Feb 2021 14:53:46 +1000 Subject: [PATCH 18/28] use BSD 'file' fallback more consistently when other methods fail --- ckanext/qa/sniff_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/qa/sniff_format.py b/ckanext/qa/sniff_format.py index e1d5869a..2a3b8a20 100644 --- a/ckanext/qa/sniff_format.py +++ b/ckanext/qa/sniff_format.py @@ -124,7 +124,7 @@ def sniff_file_format(filepath): if has_rdfa(buf): format_ = {'format': 'RDFa'} - else: + if not format_: # Excel files sometimes not picked up by magic, so try alternative if 
is_excel(filepath): format_ = {'format': 'XLS'} From 7fcf5096d1e0e1b6673797457e3179a6ba126453 Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 14:14:37 +1000 Subject: [PATCH 19/28] move requirements to a separate file instead of setup.py --- setup.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index f6012055..779aed3d 100644 --- a/setup.py +++ b/setup.py @@ -17,20 +17,16 @@ include_package_data=True, zip_safe=False, install_requires=[ - 'ckanext-archiver>=2.0', - 'ckanext-report', - 'SQLAlchemy>=0.6.6', - 'requests', - 'xlrd>=0.8.0', - 'messytables>=0.8', - 'python-magic>=0.4', - 'progressbar', - 'six>=1.9' # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301 + # CKAN extensions should not list dependencies here, but in a separate + # ``requirements.txt`` file. + # + # http://docs.ckan.org/en/latest/extensions/best-practices.html#add-third-party-libraries-to-requirements-txt ], tests_require=[ - 'nose', - 'mock', - 'flask' + # CKAN extensions should not list dependencies here, but in a separate + # ``dev-requirements.txt`` file. 
+ # + # http://docs.ckan.org/en/latest/extensions/best-practices.html#add-third-party-libraries-to-requirements-txt ], entry_points=''' [paste.paster_command] From 5c753fbb241697c222b67d7bcc7d56d6f1b0e944 Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 14:25:00 +1000 Subject: [PATCH 20/28] update version number - new minor version, because it introduces the download proxy and retrieving remote cached files as new features --- ckanext/qa/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/qa/__init__.py b/ckanext/qa/__init__.py index 53fd0507..21f26a28 100644 --- a/ckanext/qa/__init__.py +++ b/ckanext/qa/__init__.py @@ -6,4 +6,4 @@ import pkgutil __path__ = pkgutil.extend_path(__path__, __name__) -__version__ = '2.0' +__version__ = '2.1.0-rc1' From 6ca9dc4a518648e121f86278d14015a222f08f4d Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 14:41:46 +1000 Subject: [PATCH 21/28] fix Travis build scripts - install newer setuptools if needed - install transitive dependencies of ckanext-archiver and ckanext-report - start to recognise Python 3 --- bin/travis-build.bash | 48 ++++++++++++++++++++++++++++++------------- bin/travis-run.sh | 2 +- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/bin/travis-build.bash b/bin/travis-build.bash index fa1b072c..284ebb82 100644 --- a/bin/travis-build.bash +++ b/bin/travis-build.bash @@ -6,7 +6,16 @@ echo "This is travis-build.bash..." echo "Installing the packages that CKAN requires..." sudo apt-get update -qq -sudo apt-get install solr-jetty libcommons-fileupload-java +sudo apt-get install -y solr-jetty libcommons-fileupload-java + +ver=$(python -c"import sys; print(sys.version_info.major)") +if [ $ver -eq 2 ]; then + echo "python version 2" +elif [ $ver -eq 3 ]; then + echo "python version 3" +else + echo "Unknown python version: $ver" +fi echo "Upgrading libmagic for ckanext-qa..." 
# appears to upgrade it from 5.09-2 to 5.09-2ubuntu0.6 which seems to help the tests @@ -16,6 +25,10 @@ echo "Installing CKAN and its Python dependencies..." git clone https://github.com/ckan/ckan cd ckan +if [ $ver -eq 3 ]; then + pip install -r requirement-setuptools.txt +fi + if [ $CKANVERSION == 'master' ] then echo "CKAN version: master" @@ -25,14 +38,14 @@ else echo "CKAN version: ${CKAN_TAG#ckan-}" fi -python setup.py develop -if [ -f requirements-py2.txt ] +if [ -f requirements-py2.txt ] && [ $ver -eq 2 ] then pip install -r requirements-py2.txt else pip install -r requirements.txt fi pip install -r dev-requirements.txt --allow-all-external +python setup.py develop cd - echo "Setting up Solr..." @@ -54,22 +67,29 @@ paster db init -c test-core.ini cd - echo "Installing dependency ckanext-report and its requirements..." -pip install -e git+https://github.com/datagovuk/ckanext-report.git#egg=ckanext-report +git clone --depth=50 https://github.com/datagovuk/ckanext-report.git +cd ckanext-report + if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]; then + pip install -r requirements-py2.txt + elif [ -f requirements.txt ]; then + pip install -r requirements.txt + fi + pip install --no-deps -e . +cd - echo "Installing dependency ckanext-archiver and its requirements..." -git clone https://github.com/ckan/ckanext-archiver.git +git clone --depth=50 https://github.com/ckan/ckanext-archiver.git cd ckanext-archiver -pip install -e . -pip install -r requirements.txt + if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]; then + pip install -r requirements-py2.txt + elif [ -f requirements.txt ]; then + pip install -r requirements.txt + fi + pip install --no-deps -e . cd - -echo "Installing ckanext-qa and its requirements..." -python setup.py develop -pip install -r requirements.txt -pip install -r dev-requirements.txt - echo "Moving test-core.ini into a subdir..." -mkdir subdir -mv test-core.ini subdir +mkdir -p subdir +cp test-core.ini subdir echo "travis-build.bash is done." 
diff --git a/bin/travis-run.sh b/bin/travis-run.sh index 5c4022b7..1a6e7ef3 100644 --- a/bin/travis-run.sh +++ b/bin/travis-run.sh @@ -3,4 +3,4 @@ echo "NO_START=0\nJETTY_HOST=127.0.0.1\nJETTY_PORT=8983\nJAVA_HOME=$JAVA_HOME" | sudo tee /etc/default/jetty sudo cp ckan/ckan/config/solr/schema.xml /etc/solr/conf/schema.xml sudo service jetty restart -nosetests --with-pylons=subdir/test-core.ini --with-coverage --cover-package=ckanext.archiver --cover-inclusive --cover-erase --cover-tests +nosetests --with-pylons=subdir/test-core.ini --with-coverage --cover-package=ckanext.qa --cover-inclusive --cover-erase --cover-tests From e65cb4f72a0cae3479efdabf3c81cd7e6c9b4725 Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 15:06:00 +1000 Subject: [PATCH 22/28] fix condition on updating setuptools for TravisCI --- bin/travis-build.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/travis-build.bash b/bin/travis-build.bash index 284ebb82..4687ec49 100644 --- a/bin/travis-build.bash +++ b/bin/travis-build.bash @@ -25,7 +25,7 @@ echo "Installing CKAN and its Python dependencies..." git clone https://github.com/ckan/ckan cd ckan -if [ $ver -eq 3 ]; then +if [ -f requirement-setuptools.txt ]; then pip install -r requirement-setuptools.txt fi From b2a0c23a90dea9f0ff40a20b9fbbf4d105bcb569 Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 15:15:55 +1000 Subject: [PATCH 23/28] fix paster reference to ckan for testing --- bin/travis-build.bash | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bin/travis-build.bash b/bin/travis-build.bash index 4687ec49..1a57798e 100644 --- a/bin/travis-build.bash +++ b/bin/travis-build.bash @@ -62,9 +62,7 @@ sudo -u postgres psql -c "CREATE USER ckan_default WITH PASSWORD 'pass';" sudo -u postgres psql -c 'CREATE DATABASE ckan_test WITH OWNER ckan_default;' echo "Initialising the database..." 
-cd ckan -paster db init -c test-core.ini -cd - +paster --plugin=ckan db init -c test-core.ini echo "Installing dependency ckanext-report and its requirements..." git clone --depth=50 https://github.com/datagovuk/ckanext-report.git @@ -88,6 +86,11 @@ cd ckanext-archiver pip install --no-deps -e . cd - +echo "Installing ckanext-qa and its requirements..." +pip install -r requirements.txt +pip install -r dev-requirements.txt +python setup.py develop + echo "Moving test-core.ini into a subdir..." mkdir -p subdir cp test-core.ini subdir From 511b2d2e7ea088bf610d9ae179325d78dd01675e Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 15:30:53 +1000 Subject: [PATCH 24/28] move DB operations together in TravisCI --- bin/travis-build.bash | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/bin/travis-build.bash b/bin/travis-build.bash index 1a57798e..d44eee00 100644 --- a/bin/travis-build.bash +++ b/bin/travis-build.bash @@ -23,7 +23,7 @@ sudo apt-get install libmagic1 echo "Installing CKAN and its Python dependencies..." git clone https://github.com/ckan/ckan -cd ckan +pushd ckan if [ -f requirement-setuptools.txt ]; then pip install -r requirement-setuptools.txt @@ -46,7 +46,15 @@ else fi pip install -r dev-requirements.txt --allow-all-external python setup.py develop -cd - + +echo "Creating the PostgreSQL user and database..." +sudo -u postgres psql -c "CREATE USER ckan_default WITH PASSWORD 'pass';" +sudo -u postgres psql -c 'CREATE DATABASE ckan_test WITH OWNER ckan_default;' + +echo "Initialising the database..." +paster db init -c test-core.ini + +popd echo "Setting up Solr..." # solr is multicore for tests on ckan master now, but it's easier to run tests @@ -57,34 +65,27 @@ printf "NO_START=0\nJETTY_HOST=127.0.0.1\nJETTY_PORT=8983\nJAVA_HOME=$JAVA_HOME" sudo cp ckan/ckan/config/solr/schema.xml /etc/solr/conf/schema.xml sudo service jetty restart -echo "Creating the PostgreSQL user and database..." 
-sudo -u postgres psql -c "CREATE USER ckan_default WITH PASSWORD 'pass';" -sudo -u postgres psql -c 'CREATE DATABASE ckan_test WITH OWNER ckan_default;' - -echo "Initialising the database..." -paster --plugin=ckan db init -c test-core.ini - echo "Installing dependency ckanext-report and its requirements..." git clone --depth=50 https://github.com/datagovuk/ckanext-report.git -cd ckanext-report +pushd ckanext-report if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]; then pip install -r requirements-py2.txt elif [ -f requirements.txt ]; then pip install -r requirements.txt fi pip install --no-deps -e . -cd - +popd echo "Installing dependency ckanext-archiver and its requirements..." git clone --depth=50 https://github.com/ckan/ckanext-archiver.git -cd ckanext-archiver +pushd ckanext-archiver if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]; then pip install -r requirements-py2.txt elif [ -f requirements.txt ]; then pip install -r requirements.txt fi pip install --no-deps -e . -cd - +popd echo "Installing ckanext-qa and its requirements..." 
pip install -r requirements.txt From e187a1d36f8638654f059941f9ee02c2dfab06f7 Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 15:49:00 +1000 Subject: [PATCH 25/28] use 'ckan' instead of 'paster' for CKAN 2.9+ --- bin/travis-build.bash | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bin/travis-build.bash b/bin/travis-build.bash index d44eee00..b6a21d86 100644 --- a/bin/travis-build.bash +++ b/bin/travis-build.bash @@ -32,13 +32,15 @@ fi if [ $CKANVERSION == 'master' ] then echo "CKAN version: master" + export CKAN_MINOR_VERSION=100 else + export CKAN_MINOR_VERSION=${CKANVERSION##*.} CKAN_TAG=$(git tag | grep ^ckan-$CKANVERSION | sort --version-sort | tail -n 1) git checkout $CKAN_TAG echo "CKAN version: ${CKAN_TAG#ckan-}" fi -if [ -f requirements-py2.txt ] && [ $ver -eq 2 ] +if (( $CKAN_MINOR_VERSION >= 9 )) && (( $ver == 2 )) then pip install -r requirements-py2.txt else @@ -52,7 +54,12 @@ sudo -u postgres psql -c "CREATE USER ckan_default WITH PASSWORD 'pass';" sudo -u postgres psql -c 'CREATE DATABASE ckan_test WITH OWNER ckan_default;' echo "Initialising the database..." 
-paster db init -c test-core.ini +if (( $CKAN_MINOR_VERSION >= 9 )) +then + ckan -c test-core.ini db init +else + paster db init -c test-core.ini +fi popd From 686a71dad3cf5a9bd6af0e37034b3f98ea795776 Mon Sep 17 00:00:00 2001 From: antuarc Date: Fri, 5 Mar 2021 15:57:02 +1000 Subject: [PATCH 26/28] use 'master' if CKANVERSION is unspecified --- bin/travis-build.bash | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/travis-build.bash b/bin/travis-build.bash index b6a21d86..1805f7ab 100644 --- a/bin/travis-build.bash +++ b/bin/travis-build.bash @@ -29,7 +29,7 @@ if [ -f requirement-setuptools.txt ]; then pip install -r requirement-setuptools.txt fi -if [ $CKANVERSION == 'master' ] +if [ ${CKANVERSION:-master} == 'master' ] then echo "CKAN version: master" export CKAN_MINOR_VERSION=100 @@ -40,7 +40,7 @@ else echo "CKAN version: ${CKAN_TAG#ckan-}" fi -if (( $CKAN_MINOR_VERSION >= 9 )) && (( $ver == 2 )) +if (( "$CKAN_MINOR_VERSION" >= 9 )) && (( $ver == 2 )) then pip install -r requirements-py2.txt else From ab60ac8bacd63c4baef674ae03fe5527b0954124 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 8 Mar 2021 13:08:13 +1000 Subject: [PATCH 27/28] add draft Github Actions workflow - needs a bit more debugging but is close to working --- .github/workflows/test.yml | 123 +++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..2df30efb --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,123 @@ +--- +#based on https://raw.githubusercontent.com/ckan/ckanext-scheming/master/.github/workflows/test.yml +# alternative https://github.com/ckan/ckan/blob/master/contrib/cookiecutter/ckan_extension/%7B%7Bcookiecutter.project%7D%7D/.github/workflows/test.yml +name: Tests +on: [push, pull_request] +env: + CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test + 
CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test + CKAN_DATASTORE_READ_URL: postgresql://datastore_read:pass@postgres/datastore_test + CKAN_SOLR_URL: http://solr:8983/solr/ckan + CKAN_REDIS_URL: redis://redis:6379/1 +jobs: + + + + + lint: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.6' + - name: Install requirements + run: pip install flake8 pycodestyle + - name: Check syntax + run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --exclude ckan + + test: + needs: lint + strategy: + matrix: + # not ready for CKAN 2.9 yet + # ckan-version: [2.9, 2.9-py2, 2.8, 2.7] + ckan-version: [2.8, 2.7] + env: + - { ARCHIVER_GIT_REPO: "ckan", ARCHIVER_BRANCH: "master", REPORT_GIT_REPO: "datagovuk", REPORT_BRANCH: "master" } + - { ARCHIVER_GIT_REPO: "qld-gov-au", ARCHIVER_BRANCH: "2.1.0-qgov.1", REPORT_GIT_REPO: "qld-gov-au", REPORT_BRANCH: "0.1" } + - { ARCHIVER_GIT_REPO: "qld-gov-au", ARCHIVER_BRANCH: "develop", REPORT_GIT_REPO: "qld-gov-au", REPORT_BRANCH: "develop" } + fail-fast: false + + name: CKAN ${{ matrix.ckan-version }} + runs-on: ubuntu-18.04 + container: + image: openknowledge/ckan-dev:${{ matrix.ckan-version }} + services: + solr: + image: ckan/ckan-solr-dev:${{ matrix.ckan-version }} + postgres: + image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }} + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + redis: + image: redis:3 + env: ${{ matrix.env }} + + steps: + - uses: actions/checkout@v2 + + - name: Install report and archiver plugins + run: | + echo "Installing dependency ckanext-report and its requirements..." + if [ ! 
-d ckanext-report ]; then + git clone --depth=50 --branch=$REPORT_BRANCH https://github.com/$REPORT_GIT_REPO/ckanext-report ckanext-report + fi + cd ckanext-report + if [ -f pip-requirements.txt ]; then + pip install -r pip-requirements.txt + fi + if [ -f dev-requirements.txt ]; then + pip install -r dev-requirements.txt + fi + + if [ -f requirements.txt ]; then + pip install -r requirements.txt + fi + pip install --no-deps -e . + cd .. + + echo "Installing dependency ckanext-archiver and its requirements..." + if [ ! -d ckanext-archiver ]; then + git clone --depth=50 --branch=$ARCHIVER_BRANCH https://github.com/$ARCHIVER_GIT_REPO/ckanext-archiver ckanext-archiver + fi + cd ckanext-archiver + if [ -f pip-requirements.txt ]; then + pip install -r pip-requirements.txt + fi + if [ -f dev-requirements.txt ]; then + pip install -r dev-requirements.txt + fi + + if [ -f requirements.txt ]; then + pip install -r requirements.txt + fi + pip install --no-deps -e . + cd .. + + - name: Install requirements + run: | + pip install -r dev-requirements.txt + pip install -r pip-requirements.txt + pip install -r requirements.txt + pip install -e . 
+ apk add file + # Replace default path to CKAN core config file with the one on the container + sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini + + - name: Setup extension (CKAN >= 2.9) + if: ${{ matrix.ckan-version != '2.7' && matrix.ckan-version != '2.8' }} + run: | + ckan -c test.ini db init + - name: Setup extension (CKAN < 2.9) + if: ${{ matrix.ckan-version == '2.7' || matrix.ckan-version == '2.8' }} + run: | + paster --plugin=ckan db init -c test.ini + - name: Run all tests + run: | + nosetests --with-pylons=test.ini --with-coverage --cover-package=ckanext.qa --cover-inclusive --cover-erase --cover-tests + + From a0a4ba3467903297f90e353dbe2bf43731515539 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 8 Mar 2021 13:20:36 +1000 Subject: [PATCH 28/28] only install from pip-requirements file if present --- .github/workflows/test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2df30efb..311db9ab 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -101,7 +101,9 @@ jobs: - name: Install requirements run: | pip install -r dev-requirements.txt - pip install -r pip-requirements.txt + if [ -f pip-requirements.txt ]; then + pip install -r pip-requirements.txt + fi pip install -r requirements.txt pip install -e . apk add file