From ee87f714a6bbec5e39ad7e00d871bedbb9d8cb45 Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Thu, 23 Jul 2020 14:48:59 +1000
Subject: [PATCH 01/28] [DQL2-6] download archive to a temporary file so we can
 do type sniffing

- The download uses the resource's external URL without an API key, so it will not work on
  private datasets, but it is otherwise fairly reliable.
---
 ckanext/qa/tasks.py | 179 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 152 insertions(+), 27 deletions(-)

diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index 26fe8289..38873591 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -4,15 +4,23 @@
 '''
 import datetime
 import json
+import math
 import os
+import tempfile
+import time
 import traceback
 import urlparse
 import routes
 
-from ckan.common import _
+import requests
 
+from ckan.common import _
 from ckan.lib import i18n
 from ckan.plugins import toolkit
+try:
+    from ckan.plugins.toolkit import config
+except ImportError:
+    from pylons import config
 import ckan.lib.helpers as ckan_helpers
 from sniff_format import sniff_file_format
 import lib
@@ -22,6 +30,11 @@
 
 log = logging.getLogger(__name__)
 
+SSL_VERIFY = True
+MAX_CONTENT_LENGTH = int(config.get('ckanext.qa.max_content_length', 1e7))
+CHUNK_SIZE = 16 * 1024  # 16kb
+DOWNLOAD_TIMEOUT = 30
+
 if toolkit.check_ckan_version(max_version='2.6.99'):
     from ckan.lib import celery_app
 
@@ -352,35 +365,147 @@ def score_by_sniffing_data(archival, resource, score_reasons):
         return (None, None)
     # Analyse the cached file
     filepath = archival.cache_filepath
+    delete_file = False
     if not os.path.exists(filepath):
-        score_reasons.append(_('Cache filepath does not exist: "%s".') % filepath)
-        return (None, None)
-    else:
-        if filepath:
-            sniffed_format = sniff_file_format(filepath)
-            score = lib.resource_format_scores().get(sniffed_format['format']) \
-                if sniffed_format else None
-            if sniffed_format:
-                score_reasons.append(_('Content of file appeared to be format "%s" which receives openness score: %s.')
-                                     % (sniffed_format['format'], score))
-                return score, sniffed_format['format']
-            else:
-                score_reasons.append(_('The format of the file was not recognized from its contents.'))
+        log.debug("File not found on disk for resource %s", resource)
+        if resource.url_type == 'upload':
+            try:
+                resource_dict = toolkit.get_action('resource_show')(None, {'id': resource.id})
+                filepath = _download_url(resource_dict['url']).name
+                delete_file = True
+            except Exception as e:
+                score_reasons.append(_('A system error occurred during downloading this file') + '. %s' % e)
                 return (None, None)
         else:
-            # No cache_url
-            if archival.status_id == Status.by_text('Chose not to download'):
-                score_reasons.append(_('File was not downloaded deliberately') + '. '
-                                     + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.'))
-                return (None, None)
-            elif archival.is_broken is None and archival.status_id:
-                # i.e. 'Download failure' or 'System error during archival'
-                score_reasons.append(_('A system error occurred during downloading this file') + '. '
-                                     + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.'))
-                return (None, None)
-            else:
-                score_reasons.append(_('This file had not been downloaded at the time of scoring it.'))
-                return (None, None)
+            score_reasons.append(_('Cache filepath does not exist: "%s".') % filepath)
+            return (None, None)
+    if filepath:
+        sniffed_format = sniff_file_format(filepath)
+        if delete_file:
+            try:
+                os.remove(filepath)
+            except OSError as e:
+                log.warn("Unable to remove temporary file %s: %s", filepath, e)
+        score = lib.resource_format_scores().get(sniffed_format['format']) \
+            if sniffed_format else None
+        if sniffed_format:
+            score_reasons.append(_('Content of file appeared to be format "%s" which receives openness score: %s.')
+                                 % (sniffed_format['format'], score))
+            return score, sniffed_format['format']
+        else:
+            score_reasons.append(_('The format of the file was not recognized from its contents.'))
+            return (None, None)
+    else:
+        # No cache_url
+        if archival.status_id == Status.by_text('Chose not to download'):
+            score_reasons.append(_('File was not downloaded deliberately') + '. '
+                                 + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.'))
+            return (None, None)
+        elif archival.is_broken is None and archival.status_id:
+            # i.e. 'Download failure' or 'System error during archival'
+            score_reasons.append(_('A system error occurred during downloading this file') + '. '
+                                 + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.'))
+            return (None, None)
+        else:
+            score_reasons.append(_('This file had not been downloaded at the time of scoring it.'))
+            return (None, None)
+
+
+def _download_url(url):
+    # check scheme
+    scheme = urlparse.urlsplit(url).scheme
+    if scheme not in ('http', 'https', 'ftp'):
+        raise IOError(
+            'Only http, https, and ftp resources may be fetched.'
+        )
+
+    # fetch the resource data
+    log.info('Fetching from: {0}'.format(url))
+    tmp_file = get_tmp_file(url)
+    length = 0
+    cl = None
+    try:
+        headers = {}
+        response = get_response(url, headers)
+
+        # download the file to a tempfile on disk
+        for chunk in response.iter_content(CHUNK_SIZE):
+            length += len(chunk)
+            if length > MAX_CONTENT_LENGTH:
+                log.warn("File size exceeds length limit %s, truncating", MAX_CONTENT_LENGTH)
+                break
+            tmp_file.write(chunk)
+
+    except requests.exceptions.HTTPError as error:
+        # status code error
+        log.debug('HTTP error: {}'.format(error))
+        tmp_file.close()
+        os.remove(tmp_file.name)
+        raise HTTPError(
+            "Received a bad HTTP response when trying to download "
+            "the data file", status_code=error.response.status_code,
+            request_url=url, response=error)
+    except requests.exceptions.Timeout:
+        log.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
+        tmp_file.close()
+        os.remove(tmp_file.name)
+        raise IOError('Connection timed out after {}s'.format(
+                       DOWNLOAD_TIMEOUT))
+    except requests.exceptions.RequestException as e:
+        try:
+            err_message = str(e.reason)
+        except AttributeError:
+            err_message = str(e)
+        log.warning('URL error: {}'.format(err_message))
+        tmp_file.close()
+        os.remove(tmp_file.name)
+        raise HTTPError(
+            message=err_message, status_code=None,
+            request_url=url, response=None)
+
+    log.info('Downloaded ok - %s', printable_file_size(length))
+    tmp_file.seek(0)
+    return tmp_file
+
+
+def get_response(url, headers):
+    def get_url():
+        return requests.get(
+            url,
+            headers=headers,
+            timeout=DOWNLOAD_TIMEOUT,
+            verify=SSL_VERIFY,
+            stream=True,  # just gets the headers for now
+        )
+    response = get_url()
+    if response.status_code == 202:
+        # Seen: https://data-cdfw.opendata.arcgis.com/datasets
+        # In this case it means it's still processing, so do retries.
+        # 202 can mean other things, but there's no harm in retries.
+        wait = 1
+        while wait < 120 and response.status_code == 202:
+            # log.info('Retrying after {}s'.format(wait))
+            time.sleep(wait)
+            response = get_url()
+            wait *= 3
+    response.raise_for_status()
+    return response
+
+
+def get_tmp_file(url):
+    filename = url.split('/')[-1].split('#')[0].split('?')[0]
+    tmp_file = tempfile.NamedTemporaryFile(suffix=filename, delete=False)
+    return tmp_file
+
+
+def printable_file_size(size_bytes):
+    if size_bytes == 0:
+        return '0 bytes'
+    size_name = ('bytes', 'KB', 'MB', 'GB', 'TB')
+    i = int(math.floor(math.log(size_bytes, 1024)))
+    p = math.pow(1024, i)
+    s = round(size_bytes / p, 1)
+    return "%s %s" % (s, size_name[i])
 
 
 def score_by_url_extension(resource, score_reasons):

From a3a3e401b68df4b71f843c1dd52dde0e428159e2 Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Thu, 23 Jul 2020 16:06:56 +1000
Subject: [PATCH 02/28] [DQL2-6] use archival URL when not on disk, regardless
 of whether it's an upload

- The archival URL (archival.cache_url) appears to point to either the download URL or the cache URL, as needed
---
 ckanext/qa/tasks.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index 38873591..b3ba6d6b 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -367,18 +367,15 @@ def score_by_sniffing_data(archival, resource, score_reasons):
     filepath = archival.cache_filepath
     delete_file = False
     if not os.path.exists(filepath):
-        log.debug("File not found on disk for resource %s", resource)
-        if resource.url_type == 'upload':
-            try:
-                resource_dict = toolkit.get_action('resource_show')(None, {'id': resource.id})
-                filepath = _download_url(resource_dict['url']).name
-                delete_file = True
-            except Exception as e:
-                score_reasons.append(_('A system error occurred during downloading this file') + '. %s' % e)
-                return (None, None)
-        else:
-            score_reasons.append(_('Cache filepath does not exist: "%s".') % filepath)
+        log.debug("%s not found on disk, retrieving from URL %s",
+                  filepath, archival.cache_url)
+        try:
+            filepath = _download_url(archival.cache_url).name
+            delete_file = True
+        except Exception as e:
+            score_reasons.append(_('A system error occurred during downloading this file') + '. %s' % e)
             return (None, None)
+
     if filepath:
         sniffed_format = sniff_file_format(filepath)
         if delete_file:

From 9e4e47ee523f1d2e7aa26dea02e0e9eccf37064d Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Thu, 23 Jul 2020 16:19:02 +1000
Subject: [PATCH 03/28] [DQL2-6] ensure that we try to clean up the temporary
 file even on error

---
 ckanext/qa/tasks.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index b3ba6d6b..f7515db0 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -377,12 +377,14 @@ def score_by_sniffing_data(archival, resource, score_reasons):
             return (None, None)
 
     if filepath:
-        sniffed_format = sniff_file_format(filepath)
-        if delete_file:
-            try:
-                os.remove(filepath)
-            except OSError as e:
-                log.warn("Unable to remove temporary file %s: %s", filepath, e)
+        try:
+            sniffed_format = sniff_file_format(filepath)
+        finally:
+            if delete_file:
+                try:
+                    os.remove(filepath)
+                except OSError as e:
+                    log.warn("Unable to remove temporary file %s: %s", filepath, e)
         score = lib.resource_format_scores().get(sniffed_format['format']) \
             if sniffed_format else None
         if sniffed_format:

From d02993ccad8a4748604a514be072de97df0fb4dd Mon Sep 17 00:00:00 2001
From: william dutton <will.dutt@gmail.com>
Date: Thu, 30 Jul 2020 11:34:22 +1000
Subject: [PATCH 04/28] Use utc timezone so storage and helper functions work
 correctly

---
 ckanext/qa/bin/migrate_task_status.py | 3 ++-
 ckanext/qa/bin/running_stats.py       | 6 ++++--
 ckanext/qa/model.py                   | 5 +++--
 ckanext/qa/tasks.py                   | 4 +++-
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/ckanext/qa/bin/migrate_task_status.py b/ckanext/qa/bin/migrate_task_status.py
index f57b1bf5..71a254f7 100644
--- a/ckanext/qa/bin/migrate_task_status.py
+++ b/ckanext/qa/bin/migrate_task_status.py
@@ -12,6 +12,7 @@
 import datetime
 
 import common
+import pytz
 from running_stats import StatsList
 
 # pip install 'ProgressBar==2.3'
@@ -19,7 +20,7 @@
 
 START_OF_TIME = datetime.datetime(1980, 1, 1)
 END_OF_TIME = datetime.datetime(9999, 12, 31)
-TODAY = datetime.datetime.now()
+TODAY = datetime.datetime.now(tzinfo=pytz.utc)
 
 # NB put no CKAN imports here, or logging breaks
 
diff --git a/ckanext/qa/bin/running_stats.py b/ckanext/qa/bin/running_stats.py
index 947797aa..53abd07c 100644
--- a/ckanext/qa/bin/running_stats.py
+++ b/ckanext/qa/bin/running_stats.py
@@ -35,6 +35,8 @@
 import copy
 import datetime
 
+import pytz
+
 
 class StatsCount(dict):
     # {category:count}
@@ -42,7 +44,7 @@ class StatsCount(dict):
     report_value_limit = 150
 
     def __init__(self, *args, **kwargs):
-        self._start_time = datetime.datetime.now()
+        self._start_time = datetime.datetime.now(tzinfo=pytz.utc)
         super(StatsCount, self).__init__(*args, **kwargs)
 
     def _init_category(self, category):
@@ -80,7 +82,7 @@ def report(self, indent=1, order_by_title=False, show_time_taken=True):
             lines = [indent_str + 'None']
 
         if show_time_taken:
-            time_taken = datetime.datetime.now() - self._start_time
+            time_taken = datetime.datetime.now(tzinfo=pytz.utc) - self._start_time
             lines.append(indent_str + 'Time taken (h:m:s): %s' % time_taken)
         return '\n'.join(lines)
 
diff --git a/ckanext/qa/model.py b/ckanext/qa/model.py
index 9e6b97a1..4fc4a801 100644
--- a/ckanext/qa/model.py
+++ b/ckanext/qa/model.py
@@ -1,6 +1,7 @@
 import uuid
 import datetime
 
+import pytz
 from sqlalchemy import Column
 from sqlalchemy import types
 from sqlalchemy.ext.declarative import declarative_base
@@ -35,8 +36,8 @@ class QA(Base):
     openness_score_reason = Column(types.UnicodeText)
     format = Column(types.UnicodeText)
 
-    created = Column(types.DateTime, default=datetime.datetime.now)
-    updated = Column(types.DateTime, default=datetime.datetime.now)
+    created = Column(types.DateTime, default=datetime.datetime.now(tzinfo=pytz.utc))
+    updated = Column(types.DateTime, default=datetime.datetime.now(tzinfo=pytz.utc))
 
     def __repr__(self):
         summary = 'score=%s format=%s' % (self.openness_score, self.format)
diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index f7515db0..3922b740 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -9,6 +9,8 @@
 import tempfile
 import time
 import traceback
+
+import pytz
 import urlparse
 import routes
 
@@ -606,7 +608,7 @@ def save_qa_result(resource, qa_result):
     import ckan.model as model
     from ckanext.qa.model import QA
 
-    now = datetime.datetime.now()
+    now = datetime.datetime.now(tzinfo=pytz.utc)
 
     qa = QA.get_for_resource(resource.id)
     if not qa:

From ad18da232de717f1557e57c9f840252a8de7702c Mon Sep 17 00:00:00 2001
From: william dutton <will.dutt@gmail.com>
Date: Thu, 30 Jul 2020 12:01:46 +1000
Subject: [PATCH 05/28] Use utc timezone so storage and helper functions work
 correctly

---
 ckanext/qa/model.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ckanext/qa/model.py b/ckanext/qa/model.py
index 4fc4a801..160b0ad4 100644
--- a/ckanext/qa/model.py
+++ b/ckanext/qa/model.py
@@ -1,7 +1,6 @@
 import uuid
 import datetime
 
-import pytz
 from sqlalchemy import Column
 from sqlalchemy import types
 from sqlalchemy.ext.declarative import declarative_base
@@ -36,8 +35,8 @@ class QA(Base):
     openness_score_reason = Column(types.UnicodeText)
     format = Column(types.UnicodeText)
 
-    created = Column(types.DateTime, default=datetime.datetime.now(tzinfo=pytz.utc))
-    updated = Column(types.DateTime, default=datetime.datetime.now(tzinfo=pytz.utc))
+    created = Column(types.DateTime, default=datetime.datetime.utcnow)
+    updated = Column(types.DateTime, default=datetime.datetime.utcnow)
 
     def __repr__(self):
         summary = 'score=%s format=%s' % (self.openness_score, self.format)

From 3529c722a9c53beaf380f919646803bdbf049c97 Mon Sep 17 00:00:00 2001
From: william dutton <will.dutt@gmail.com>
Date: Thu, 30 Jul 2020 13:19:44 +1000
Subject: [PATCH 06/28] Use utc timezone so storage and helper functions work
 correctly

---
 ckanext/qa/bin/migrate_task_status.py | 2 +-
 ckanext/qa/bin/running_stats.py       | 4 ++--
 ckanext/qa/tasks.py                   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ckanext/qa/bin/migrate_task_status.py b/ckanext/qa/bin/migrate_task_status.py
index 71a254f7..7b58e83c 100644
--- a/ckanext/qa/bin/migrate_task_status.py
+++ b/ckanext/qa/bin/migrate_task_status.py
@@ -20,7 +20,7 @@
 
 START_OF_TIME = datetime.datetime(1980, 1, 1)
 END_OF_TIME = datetime.datetime(9999, 12, 31)
-TODAY = datetime.datetime.now(tzinfo=pytz.utc)
+TODAY = datetime.datetime.now(tz=pytz.utc)
 
 # NB put no CKAN imports here, or logging breaks
 
diff --git a/ckanext/qa/bin/running_stats.py b/ckanext/qa/bin/running_stats.py
index 53abd07c..c6e6d538 100644
--- a/ckanext/qa/bin/running_stats.py
+++ b/ckanext/qa/bin/running_stats.py
@@ -44,7 +44,7 @@ class StatsCount(dict):
     report_value_limit = 150
 
     def __init__(self, *args, **kwargs):
-        self._start_time = datetime.datetime.now(tzinfo=pytz.utc)
+        self._start_time = datetime.datetime.now(tz=pytz.utc)
         super(StatsCount, self).__init__(*args, **kwargs)
 
     def _init_category(self, category):
@@ -82,7 +82,7 @@ def report(self, indent=1, order_by_title=False, show_time_taken=True):
             lines = [indent_str + 'None']
 
         if show_time_taken:
-            time_taken = datetime.datetime.now(tzinfo=pytz.utc) - self._start_time
+            time_taken = datetime.datetime.now(tz=pytz.utc) - self._start_time
             lines.append(indent_str + 'Time taken (h:m:s): %s' % time_taken)
         return '\n'.join(lines)
 
diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index 3922b740..5118276f 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -608,7 +608,7 @@ def save_qa_result(resource, qa_result):
     import ckan.model as model
     from ckanext.qa.model import QA
 
-    now = datetime.datetime.now(tzinfo=pytz.utc)
+    now = datetime.datetime.now(tz=pytz.utc)
 
     qa = QA.get_for_resource(resource.id)
     if not qa:

From 41c108de9e03c4c79e5ac17a812a4fb0da672c12 Mon Sep 17 00:00:00 2001
From: william dutton <will.dutt@gmail.com>
Date: Thu, 30 Jul 2020 13:36:03 +1000
Subject: [PATCH 07/28] all internal datetimes in ckan are utc

---
 ckanext/qa/bin/migrate_task_status.py | 3 +--
 ckanext/qa/bin/running_stats.py       | 7 ++-----
 ckanext/qa/tasks.py                   | 3 +--
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/ckanext/qa/bin/migrate_task_status.py b/ckanext/qa/bin/migrate_task_status.py
index 7b58e83c..f9c7d59e 100644
--- a/ckanext/qa/bin/migrate_task_status.py
+++ b/ckanext/qa/bin/migrate_task_status.py
@@ -12,7 +12,6 @@
 import datetime
 
 import common
-import pytz
 from running_stats import StatsList
 
 # pip install 'ProgressBar==2.3'
@@ -20,7 +19,7 @@
 
 START_OF_TIME = datetime.datetime(1980, 1, 1)
 END_OF_TIME = datetime.datetime(9999, 12, 31)
-TODAY = datetime.datetime.now(tz=pytz.utc)
+TODAY = datetime.datetime.utcnow()
 
 # NB put no CKAN imports here, or logging breaks
 
diff --git a/ckanext/qa/bin/running_stats.py b/ckanext/qa/bin/running_stats.py
index c6e6d538..f4abe5ba 100644
--- a/ckanext/qa/bin/running_stats.py
+++ b/ckanext/qa/bin/running_stats.py
@@ -35,16 +35,13 @@
 import copy
 import datetime
 
-import pytz
-
-
 class StatsCount(dict):
     # {category:count}
     _init_value = 0
     report_value_limit = 150
 
     def __init__(self, *args, **kwargs):
-        self._start_time = datetime.datetime.now(tz=pytz.utc)
+        self._start_time = datetime.datetime.utcnow()
         super(StatsCount, self).__init__(*args, **kwargs)
 
     def _init_category(self, category):
@@ -82,7 +79,7 @@ def report(self, indent=1, order_by_title=False, show_time_taken=True):
             lines = [indent_str + 'None']
 
         if show_time_taken:
-            time_taken = datetime.datetime.now(tz=pytz.utc) - self._start_time
+            time_taken = datetime.datetime.utcnow() - self._start_time
             lines.append(indent_str + 'Time taken (h:m:s): %s' % time_taken)
         return '\n'.join(lines)
 
diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index 5118276f..538d7b43 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -10,7 +10,6 @@
 import time
 import traceback
 
-import pytz
 import urlparse
 import routes
 
@@ -608,7 +607,7 @@ def save_qa_result(resource, qa_result):
     import ckan.model as model
     from ckanext.qa.model import QA
 
-    now = datetime.datetime.now(tz=pytz.utc)
+    now = datetime.datetime.utcnow()
 
     qa = QA.get_for_resource(resource.id)
     if not qa:

From fefb61d1918b6948e575312bb2416d31df01c7d2 Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Thu, 11 Feb 2021 12:57:49 +1000
Subject: [PATCH 08/28] [QOL-6491] enable file downloads to go through a proxy
 if needed

---
 ckanext/qa/tasks.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index 538d7b43..d96c954e 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -470,13 +470,12 @@ def _download_url(url):
 
 def get_response(url, headers):
     def get_url():
-        return requests.get(
-            url,
-            headers=headers,
-            timeout=DOWNLOAD_TIMEOUT,
-            verify=SSL_VERIFY,
-            stream=True,  # just gets the headers for now
-        )
+        kwargs = {'headers': headers, 'timeout': DOWNLOAD_TIMEOUT,
+                  'verify': SSL_VERIFY, 'stream': True} # just gets the headers for now
+        if 'ckan.download_proxy' in config:
+            proxy = config.get('ckan.download_proxy')
+            kwargs['proxies'] = {'http': proxy, 'https': proxy}
+        return requests.get(url, **kwargs)
     response = get_url()
     if response.status_code == 202:
         # Seen: https://data-cdfw.opendata.arcgis.com/datasets

From dc4430b978b83e2748146dae38ac8ab87181d6a6 Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Thu, 11 Feb 2021 14:03:11 +1000
Subject: [PATCH 09/28] [QOL-6491] cleanup

- add Flake8 config and make it pass
- use 'six' to prepare for Python 3
---
 .flake8                                | 20 +++++++++
 ckanext/qa/bin/common.py               |  2 +-
 ckanext/qa/bin/migrate_task_status.py  | 10 ++---
 ckanext/qa/bin/running_stats.py        |  9 ++--
 ckanext/qa/commands.py                 | 57 ++++++++++++-------------
 ckanext/qa/controllers.py              |  2 +-
 ckanext/qa/lib.py                      |  4 +-
 ckanext/qa/logic/action.py             |  2 +-
 ckanext/qa/model.py                    |  5 ++-
 ckanext/qa/plugin.py                   |  6 +--
 ckanext/qa/reports.py                  | 10 ++---
 ckanext/qa/sniff_format.py             | 58 ++++++++++++++------------
 ckanext/qa/tasks.py                    | 30 ++++++-------
 ckanext/qa/tests/fake_ckan.py          |  4 +-
 ckanext/qa/tests/mock_remote_server.py |  9 ++--
 ckanext/qa/tests/test_link_checker.py  |  4 +-
 ckanext/qa/tests/test_sniff_format.py  |  4 +-
 ckanext/qa/tests/test_tasks.py         |  8 ++--
 18 files changed, 136 insertions(+), 108 deletions(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..a89a787b
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,20 @@
+[flake8]
+# @see https://flake8.pycqa.org/en/latest/user/configuration.html?highlight=.flake8
+
+exclude =
+    ckan
+    scripts
+
+# Extended output format.
+format = pylint
+
+# Show the source of errors.
+show_source = True
+
+max-complexity = 10
+
+# List ignore rules one per line.
+ignore =
+    E501
+    C901
+    W503
diff --git a/ckanext/qa/bin/common.py b/ckanext/qa/bin/common.py
index 0ace784e..75e884fc 100644
--- a/ckanext/qa/bin/common.py
+++ b/ckanext/qa/bin/common.py
@@ -48,5 +48,5 @@ def get_resources(state='active', publisher_ref=None, resource_id=None, dataset_
         resources = resources.filter(model.Resource.id == resource_id)
         criteria.append('Resource:%s' % resource_id)
     resources = resources.all()
-    print '%i resources (%s)' % (len(resources), ' '.join(criteria))
+    print('%i resources (%s)' % (len(resources), ' '.join(criteria)))
     return resources
diff --git a/ckanext/qa/bin/migrate_task_status.py b/ckanext/qa/bin/migrate_task_status.py
index f9c7d59e..125190b1 100644
--- a/ckanext/qa/bin/migrate_task_status.py
+++ b/ckanext/qa/bin/migrate_task_status.py
@@ -59,7 +59,7 @@ def migrate(options):
         # time, so some timezone nonesense going on. Can't do much.
         archival = Archival.get_for_resource(res.id)
         if not archival:
-            print add_stat('QA but no Archival data', res, stats)
+            print(add_stat('QA but no Archival data', res, stats))
             continue
         archival_date = archival.updated
         # the state of the resource was as it was archived on the date of
@@ -112,10 +112,10 @@ def migrate(options):
                 model.Session.add(qa)
             add_stat('Added to QA table', res, stats)
 
-    print 'Summary\n', stats.report()
+    print('Summary\n', stats.report())
     if options.write:
         model.repo.commit_and_remove()
-        print 'Written'
+        print('Written')
 
 
 def add_stat(outcome, res, stats, extra_info=None):
@@ -154,10 +154,10 @@ def date_str_to_datetime_or_none(date_str):
     if len(args) != 1:
         parser.error('Wrong number of arguments (%i)' % len(args))
     config_ini = args[0]
-    print 'Loading CKAN config...'
+    print('Loading CKAN config...')
     common.load_config(config_ini)
     common.register_translator()
-    print 'Done'
+    print('Done')
     # Setup logging to print debug out for local only
     rootLogger = logging.getLogger()
     rootLogger.setLevel(logging.WARNING)
diff --git a/ckanext/qa/bin/running_stats.py b/ckanext/qa/bin/running_stats.py
index f4abe5ba..fbb0a635 100644
--- a/ckanext/qa/bin/running_stats.py
+++ b/ckanext/qa/bin/running_stats.py
@@ -14,7 +14,7 @@
         package_stats.increment('deleted')
     else:
         package_stats.increment('not deleted')
-print package_stats.report()
+print(package_stats.report())
 > deleted: 30
 > not deleted: 70
 
@@ -26,7 +26,7 @@
         package_stats.add('deleted', package.name)
     else:
         package_stats.add('not deleted' package.name)
-print package_stats.report()
+print(package_stats.report())
 > deleted: 30 pollution-uk, flood-regions, river-quality, ...
 > not deleted: 70 spending-bristol, ...
 
@@ -35,6 +35,7 @@
 import copy
 import datetime
 
+
 class StatsCount(dict):
     # {category:count}
     _init_value = 0
@@ -109,6 +110,6 @@ def report_value(self, category):
     package_stats.add('Success', 'good3')
     package_stats.add('Success', 'good4')
     package_stats.add('Failure', 'bad1')
-    print package_stats.report()
+    print(package_stats.report())
 
-    print StatsList().report()
+    print(StatsList().report())
diff --git a/ckanext/qa/commands.py b/ckanext/qa/commands.py
index 992fb0cd..d0b0b5ea 100644
--- a/ckanext/qa/commands.py
+++ b/ckanext/qa/commands.py
@@ -1,4 +1,5 @@
 import logging
+import six
 import sys
 
 from sqlalchemy import or_
@@ -65,7 +66,7 @@ def command(self):
         Parse command line arguments and call appropriate method.
         """
         if not self.args or self.args[0] in ['--help', '-h', 'help']:
-            print QACommand.__doc__
+            print(QACommand.__doc__)
             return
 
         cmd = self.args[0]
@@ -177,44 +178,44 @@ def sniff(self):
         from ckanext.qa.sniff_format import sniff_file_format
 
         if len(self.args) < 2:
-            print 'Not enough arguments', self.args
+            print('Not enough arguments', self.args)
             sys.exit(1)
         for filepath in self.args[1:]:
             format_ = sniff_file_format(
                 filepath, logging.getLogger('ckanext.qa.sniffer'))
             if format_:
-                print 'Detected as: %s - %s' % (format_['display_name'],
-                                                filepath)
+                print('Detected as: %s - %s' % (format_['display_name'],
+                                                filepath))
             else:
-                print 'ERROR: Could not recognise format of: %s' % filepath
+                print('ERROR: Could not recognise format of: %s' % filepath)
 
     def view(self, package_ref=None):
         from ckan import model
 
         q = model.Session.query(model.TaskStatus).filter_by(task_type='qa')
-        print 'QA records - %i TaskStatus rows' % q.count()
-        print '      across %i Resources' % q.distinct('entity_id').count()
+        print('QA records - %i TaskStatus rows' % q.count())
+        print('      across %i Resources' % q.distinct('entity_id').count())
 
         if package_ref:
             pkg = model.Package.get(package_ref)
-            print 'Package %s %s' % (pkg.name, pkg.id)
+            print('Package %s %s' % (pkg.name, pkg.id))
             for res in pkg.resources:
-                print 'Resource %s' % res.id
+                print('Resource %s' % res.id)
                 for row in q.filter_by(entity_id=res.id):
-                    print '* %s = %r error=%r' % (row.key, row.value,
-                                                  row.error)
+                    print('* %s = %r error=%r' % (row.key, row.value,
+                                                  row.error))
 
     def clean(self):
         from ckan import model
 
-        print 'Before:'
+        print('Before:')
         self.view()
 
         q = model.Session.query(model.TaskStatus).filter_by(task_type='qa')
         q.delete()
         model.Session.commit()
 
-        print 'After:'
+        print('After:')
         self.view()
 
     def migrate1(self):
@@ -223,32 +224,32 @@ def migrate1(self):
         q_status = model.Session.query(model.TaskStatus) \
             .filter_by(task_type='qa') \
             .filter_by(key='status')
-        print '* %s with "status" will be deleted e.g. %s' % (q_status.count(),
-                                                              q_status.first())
+        print('* %s with "status" will be deleted e.g. %s' % (q_status.count(),
+                                                              q_status.first()))
         q_failures = model.Session.query(model.TaskStatus) \
             .filter_by(task_type='qa') \
             .filter_by(key='openness_score_failure_count')
-        print '* %s with openness_score_failure_count to be deleted e.g.\n%s'\
-            % (q_failures.count(), q_failures.first())
+        print('* %s with openness_score_failure_count to be deleted e.g.\n%s'
+              % (q_failures.count(), q_failures.first()))
         q_score = model.Session.query(model.TaskStatus) \
             .filter_by(task_type='qa') \
             .filter_by(key='openness_score')
-        print '* %s with openness_score to migrate e.g.\n%s' % \
-            (q_score.count(), q_score.first())
+        print('* %s with openness_score to migrate e.g.\n%s' %
+              (q_score.count(), q_score.first()))
         q_reason = model.Session.query(model.TaskStatus) \
             .filter_by(task_type='qa') \
             .filter_by(key='openness_score_reason')
-        print '* %s with openness_score_reason to migrate e.g.\n%s' % \
-            (q_reason.count(), q_reason.first())
-        raw_input('Press Enter to continue')
+        print('* %s with openness_score_reason to migrate e.g.\n%s' %
+              (q_reason.count(), q_reason.first()))
+        six.input('Press Enter to continue')
 
         q_status.delete()
         model.Session.commit()
-        print '..."status" deleted'
+        print('..."status" deleted')
 
         q_failures.delete()
         model.Session.commit()
-        print '..."openness_score_failure_count" deleted'
+        print('..."openness_score_failure_count" deleted')
 
         for task_status in q_score:
             reason_task_status = q_reason \
@@ -265,15 +266,15 @@ def migrate1(self):
                 'reason': reason,
                 'format': None,
                 'is_broken': None,
-                })
+            })
             model.Session.commit()
-        print '..."openness_score" and "openness_score_reason" migrated'
+        print('..."openness_score" and "openness_score_reason" migrated')
 
         count = q_reason.count()
         q_reason.delete()
         model.Session.commit()
-        print '... %i remaining "openness_score_reason" deleted' % count
+        print('... %i remaining "openness_score_reason" deleted' % count)
 
         model.Session.flush()
         model.Session.remove()
-        print 'Migration succeeded'
+        print('Migration succeeded')
diff --git a/ckanext/qa/controllers.py b/ckanext/qa/controllers.py
index 493eed7f..4cedcbb5 100644
--- a/ckanext/qa/controllers.py
+++ b/ckanext/qa/controllers.py
@@ -102,7 +102,7 @@ def _check_link(self, url):
             result['mimetype'] = self._extract_mimetype(headers)
             result['size'] = headers.get('content-length', '')
             result['last_modified'] = self._parse_and_format_date(headers.get('last-modified', ''))
-        except LinkCheckerError, e:
+        except LinkCheckerError as e:
             result['url_errors'].append(str(e))
         return result
 
diff --git a/ckanext/qa/lib.py b/ckanext/qa/lib.py
index 2113badd..712a8741 100644
--- a/ckanext/qa/lib.py
+++ b/ckanext/qa/lib.py
@@ -55,7 +55,7 @@ def resource_format_scores():
         with open(json_filepath) as format_file:
             try:
                 file_resource_formats = json.loads(format_file.read())
-            except ValueError, e:
+            except ValueError as e:
                 # includes simplejson.decoder.JSONDecodeError
                 raise ValueError('Invalid JSON syntax in %s: %s' %
                                  (json_filepath, e))
@@ -90,7 +90,7 @@ def create_qa_update_package_task(package, queue):
     from pylons import config
     ckan_ini_filepath = os.path.abspath(config.__file__)
 
-    compat_enqueue('qa.update_package', tasks.update_package, queue,  args=[ckan_ini_filepath, package.id])
+    compat_enqueue('qa.update_package', tasks.update_package, queue, args=[ckan_ini_filepath, package.id])
     log.debug('QA of package put into celery queue %s: %s',
               queue, package.name)
 
diff --git a/ckanext/qa/logic/action.py b/ckanext/qa/logic/action.py
index 8914c670..e176a7d4 100644
--- a/ckanext/qa/logic/action.py
+++ b/ckanext/qa/logic/action.py
@@ -30,7 +30,7 @@ def qa_resource_show(context, data_dict):
         'name': pkg.name,
         'title': pkg.title,
         'id': res.id
-        }
+    }
     return_dict['archival'] = archival.as_dict()
     return_dict.update(qa.as_dict())
     return return_dict
diff --git a/ckanext/qa/model.py b/ckanext/qa/model.py
index 160b0ad4..94eafdc7 100644
--- a/ckanext/qa/model.py
+++ b/ckanext/qa/model.py
@@ -1,5 +1,6 @@
 import uuid
 import datetime
+import six
 
 from sqlalchemy import Column
 from sqlalchemy import types
@@ -15,7 +16,7 @@
 
 
 def make_uuid():
-    return unicode(uuid.uuid4())
+    return six.text_type(uuid.uuid4())
 
 
 class QA(Base):
@@ -40,7 +41,7 @@ class QA(Base):
 
     def __repr__(self):
         summary = 'score=%s format=%s' % (self.openness_score, self.format)
-        details = unicode(self.openness_score_reason).encode('unicode_escape')
+        details = six.text_type(self.openness_score_reason).encode('unicode_escape')
         package = model.Package.get(self.package_id)
         package_name = package.name if package else '?%s?' % self.package_id
         return '<QA %s /dataset/%s/resource/%s %s>' % \
diff --git a/ckanext/qa/plugin.py b/ckanext/qa/plugin.py
index 876459d1..cfd92766 100644
--- a/ckanext/qa/plugin.py
+++ b/ckanext/qa/plugin.py
@@ -67,7 +67,7 @@ def get_actions(self):
         return {
             'qa_resource_show': action.qa_resource_show,
             'qa_package_openness_show': action.qa_package_openness_show,
-            }
+        }
 
     # IAuthFunctions
 
@@ -75,7 +75,7 @@ def get_auth_functions(self):
         return {
             'qa_resource_show': auth.qa_resource_show,
             'qa_package_openness_show': auth.qa_package_openness_show,
-            }
+        }
 
     # ITemplateHelpers
 
@@ -85,7 +85,7 @@ def get_helpers(self):
             helpers.qa_openness_stars_resource_html,
             'qa_openness_stars_dataset_html':
             helpers.qa_openness_stars_dataset_html,
-            }
+        }
 
     # IPackageController
 
diff --git a/ckanext/qa/reports.py b/ckanext/qa/reports.py
index c50b56de..9da09a64 100644
--- a/ckanext/qa/reports.py
+++ b/ckanext/qa/reports.py
@@ -72,7 +72,7 @@ def openness_index(include_sub_organizations=False):
 
     table = []
     for org_name, org_counts in results.iteritems():
-        total_stars = sum([k*v for k, v in org_counts['score_counts'].items() if k])
+        total_stars = sum([k * v for k, v in org_counts['score_counts'].items() if k])
         num_pkgs_scored = sum([v for k, v in org_counts['score_counts'].items()
                               if k is not None])
         average_stars = round(float(total_stars) / num_pkgs_scored, 1) \
@@ -82,7 +82,7 @@ def openness_index(include_sub_organizations=False):
             ('organization_name', org_name),
             ('total_stars', total_stars),
             ('average_stars', average_stars),
-            ))
+        ))
         row.update(jsonify_counter(org_counts['score_counts']))
         table.append(row)
 
@@ -136,10 +136,10 @@ def openness_for_organization(organization=None, include_sub_organizations=False
                 ('organization_title', org.title),
                 ('openness_score', qa['openness_score']),
                 ('openness_score_reason', qa['openness_score_reason']),
-                )))
+            )))
             score_counts[qa['openness_score']] += 1
 
-    total_stars = sum([k*v for k, v in score_counts.items() if k])
+    total_stars = sum([k * v for k, v in score_counts.items() if k])
     num_pkgs_with_stars = sum([v for k, v in score_counts.items()
                                if k is not None])
     average_stars = round(float(total_stars) / num_pkgs_with_stars, 1) \
@@ -172,7 +172,7 @@ def openness_report_combinations():
     'option_combinations': openness_report_combinations,
     'generate': openness_report,
     'template': 'report/openness.html',
-    }
+}
 
 
 def jsonify_counter(counter):
diff --git a/ckanext/qa/sniff_format.py b/ckanext/qa/sniff_format.py
index 856447fa..13b4dc6e 100644
--- a/ckanext/qa/sniff_format.py
+++ b/ckanext/qa/sniff_format.py
@@ -1,7 +1,9 @@
+# encoding: utf-8
 import re
 import zipfile
 import os
 from collections import defaultdict
+import six
 import subprocess
 import StringIO
 
@@ -16,6 +18,7 @@
 
 log = logging.getLogger(__name__)
 
+
 def sniff_file_format(filepath):
     '''For a given filepath, work out what file format it is.
 
@@ -33,12 +36,13 @@ def sniff_file_format(filepath):
     '''
     format_ = None
     log.info('Sniffing file format of: %s', filepath)
-    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
+    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, six.text_types) \
         else filepath
     mime_type = magic.from_file(filepath_utf8, mime=True)
     log.info('Magic detects file as: %s', mime_type)
     if mime_type:
-        if mime_type == 'application/xml':
+        # on some operating systems, magic reports XML files as text/xml
+        if mime_type == 'application/xml' or mime_type == 'text/xml':
             with open(filepath) as f:
                 buf = f.read(5000)
             format_ = get_xml_variant_including_xml_declaration(buf)
@@ -139,14 +143,14 @@ def is_json(buf):
     JSON format.'''
     string = '"[^"]*"'
     string_re = re.compile(string)
-    number_re = re.compile('-?\d+(\.\d+)?([eE][+-]?\d+)?')
-    extra_values_re = re.compile('true|false|null')
-    object_start_re = re.compile('{%s:\s?' % string)
-    object_middle_re = re.compile('%s:\s?' % string)
-    object_end_re = re.compile('}')
-    comma_re = re.compile(',\s?')
-    array_start_re = re.compile('\[')
-    array_end_re = re.compile('\]')
+    number_re = re.compile(r'-?\d+(\.\d+)?([eE][+-]?\d+)?')
+    extra_values_re = re.compile(r'true|false|null')
+    object_start_re = re.compile(r'{%s:\s?' % string)
+    object_middle_re = re.compile(r'%s:\s?' % string)
+    object_end_re = re.compile(r'}')
+    comma_re = re.compile(r',\s?')
+    array_start_re = re.compile(r'\[')
+    array_end_re = re.compile(r'\]')
     any_value_regexs = [string_re, number_re, object_start_re, array_start_re, extra_values_re]
 
     # simplified state machine - just looks at stack of object/array and
@@ -256,7 +260,7 @@ def get_cells_per_row(num_cells, num_rows):
 
 def is_html(buf):
     '''If this buffer is HTML, return that format type, else None.'''
-    xml_re = '.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<html[^>]*>'
+    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<html[^>]*>'
     match = re.match(xml_re, buf, re.IGNORECASE)
     if match:
         log.info('HTML tag detected')
@@ -266,7 +270,7 @@ def is_html(buf):
 
 def is_iati(buf):
     '''If this buffer is IATI format, return that format type, else None.'''
-    xml_re = '.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<iati-(activities|organisations)[^>]*>'
+    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<iati-(activities|organisations)[^>]*>'
     match = re.match(xml_re, buf, re.IGNORECASE)
     if match:
         log.info('IATI tag detected')
@@ -277,13 +281,13 @@ def is_iati(buf):
 def is_xml_but_without_declaration(buf):
     '''Decides if this is a buffer of XML, but missing the usual <?xml ...?>
     tag.'''
-    xml_re = '.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<([^>\s]*)([^>]*)>'
+    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<([^>\s]*)([^>]*)>'
     match = re.match(xml_re, buf, re.IGNORECASE)
     if match:
         top_level_tag_name, top_level_tag_attributes = match.groups()[-2:]
-        if 'xmlns:' not in top_level_tag_attributes and \
-            (len(top_level_tag_name) > 20 or
-             len(top_level_tag_attributes) > 200):
+        if ('xmlns:' not in top_level_tag_attributes
+                and (len(top_level_tag_name) > 20
+                     or len(top_level_tag_attributes) > 200)):
             log.debug('Not XML (without declaration) - unlikely length first tag: <%s %s>',
                       top_level_tag_name, top_level_tag_attributes)
             return False
@@ -318,9 +322,9 @@ def start_element(name, attrs):
     p.StartElementHandler = start_element
     try:
         p.Parse(buf)
-    except GotFirstTag, e:
-        top_level_tag_name = str(e).lower()
-    except xml.sax.SAXException, e:
+    except GotFirstTag as e:
+        top_level_tag_name = six.text_type(e).lower()
+    except xml.sax.SAXException as e:
         log.info('Sax parse error: %s %s', e, buf)
         return {'format': 'XML'}
 
@@ -354,8 +358,8 @@ def has_rdfa(buf):
         return False
 
     # more rigorous check for them as tag attributes
-    about_re = '<[^>]+\sabout="[^"]+"[^>]*>'
-    property_re = '<[^>]+\sproperty="[^"]+"[^>]*>'
+    about_re = r'<[^>]+\sabout="[^"]+"[^>]*>'
+    property_re = r'<[^>]+\sproperty="[^"]+"[^>]*>'
     # remove CR to catch tags spanning more than one line
     # buf = re.sub('\r\n', ' ', buf)
     if not re.search(about_re, buf):
@@ -381,11 +385,11 @@ def get_zipped_format(filepath):
             filepaths = zip.namelist()
         finally:
             zip.close()
-    except zipfile.BadZipfile, e:
+    except zipfile.BadZipfile as e:
         log.info('Zip file open raised error %s: %s',
                  e, e.args)
         return
-    except Exception, e:
+    except Exception as e:
         log.warning('Zip file open raised exception %s: %s',
                     e, e.args)
         return
@@ -438,7 +442,7 @@ def get_zipped_format(filepath):
 def is_excel(filepath):
     try:
         xlrd.open_workbook(filepath)
-    except Exception, e:
+    except Exception as e:
         log.info('Not Excel - failed to load: %s %s', e, e.args)
         return False
     else:
@@ -534,12 +538,12 @@ def turtle_regex():
     '''
     global turtle_regex_
     if not turtle_regex_:
-        rdf_term = '(<[^ >]+>|_:\S+|".+?"(@\w+)?(\^\^\S+)?|\'.+?\'(@\w+)?(\^\^\S+)?|""".+?"""(@\w+)' \
-                   '?(\^\^\S+)?|\'\'\'.+?\'\'\'(@\w+)?(\^\^\S+)?|[+-]?([0-9]+|[0-9]*\.[0-9]+)(E[+-]?[0-9]+)?|false|true)'
+        rdf_term = r'(<[^ >]+>|_:\S+|".+?"(@\w+)?(\^\^\S+)?|\'.+?\'(@\w+)?(\^\^\S+)?|""".+?"""(@\w+)' \
+                   r'?(\^\^\S+)?|\'\'\'.+?\'\'\'(@\w+)?(\^\^\S+)?|[+-]?([0-9]+|[0-9]*\.[0-9]+)(E[+-]?[0-9]+)?|false|true)'
 
         # simple case is: triple_re = '^T T T \.$'.replace('T', rdf_term)
         # but extend to deal with multiple predicate-objects:
         # triple = '^T T T\s*(;\s*T T\s*)*\.\s*$'.replace('T', rdf_term).replace(' ', '\s+')
-        triple = '(^T|;)\s*T T\s*(;|\.\s*$)'.replace('T', rdf_term).replace(' ', '\s+')
+        triple = r'(^T|;)\s*T T\s*(;|\.\s*$)'.replace('T', rdf_term).replace(' ', r'\s+')
         turtle_regex_ = re.compile(triple, re.MULTILINE)
     return turtle_regex_
diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index d96c954e..a9a857e8 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -6,6 +6,7 @@
 import json
 import math
 import os
+import six
 import tempfile
 import time
 import traceback
@@ -129,9 +130,9 @@ def update_package(ckan_ini_filepath, package_id):
 
     try:
         update_package_(package_id)
-    except Exception, e:
+    except Exception as e:
         log.error('Exception occurred during QA update_package: %s: %s',
-                  e.__class__.__name__,  unicode(e))
+                  e.__class__.__name__, e)
         raise
 
 
@@ -168,9 +169,9 @@ def update(ckan_ini_filepath, resource_id):
     load_config(ckan_ini_filepath)
     try:
         update_resource_(resource_id)
-    except Exception, e:
+    except Exception as e:
         log.error('Exception occurred during QA update_resource: %s: %s',
-                  e.__class__.__name__,  unicode(e))
+                  e.__class__.__name__, e)
         raise
 
 
@@ -267,10 +268,10 @@ def resource_score(resource):
                             format_ = get_qa_format(resource.id)
         score_reason = ' '.join(score_reasons)
         format_ = format_ or None
-    except Exception, e:
+    except Exception as e:
         log.error('Unexpected error while calculating openness score %s: %s\nException: %s',
-                  e.__class__.__name__,  unicode(e), traceback.format_exc())
-        score_reason = _("Unknown error: %s") % str(e)
+                  e.__class__.__name__, e, traceback.format_exc())
+        score_reason = _("Unknown error: %s") % e
         raise
 
     # Even if we can get the link, we should still treat the resource
@@ -310,7 +311,7 @@ def format_date(date):
         else:
             return ''
     messages = [_('File could not be downloaded.'),
-                _('Reason') + ':', unicode(archival.status) + '.',
+                _('Reason') + ':', six.text_type(archival.status) + '.',
                 _('Error details: %s.') % archival.reason,
                 _('Attempted on %s.') % format_date(archival.updated)]
     last_success = format_date(archival.last_success)
@@ -423,7 +424,6 @@ def _download_url(url):
     log.info('Fetching from: {0}'.format(url))
     tmp_file = get_tmp_file(url)
     length = 0
-    cl = None
     try:
         headers = {}
         response = get_response(url, headers)
@@ -441,16 +441,16 @@ def _download_url(url):
         log.debug('HTTP error: {}'.format(error))
         tmp_file.close()
         os.remove(tmp_file.name)
-        raise HTTPError(
-            "Received a bad HTTP response when trying to download "
-            "the data file", status_code=error.response.status_code,
+        raise requests.exceptions.HTTPError(
+            "Received a bad HTTP response when trying to download the data file",
+            status_code=error.response.status_code,
             request_url=url, response=error)
     except requests.exceptions.Timeout:
         log.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
         tmp_file.close()
         os.remove(tmp_file.name)
         raise IOError('Connection timed out after {}s'.format(
-                       DOWNLOAD_TIMEOUT))
+                      DOWNLOAD_TIMEOUT))
     except requests.exceptions.RequestException as e:
         try:
             err_message = str(e.reason)
@@ -459,7 +459,7 @@ def _download_url(url):
         log.warning('URL error: {}'.format(err_message))
         tmp_file.close()
         os.remove(tmp_file.name)
-        raise HTTPError(
+        raise requests.exceptions.HTTPError(
             message=err_message, status_code=None,
             request_url=url, response=None)
 
@@ -471,7 +471,7 @@ def _download_url(url):
 def get_response(url, headers):
     def get_url():
         kwargs = {'headers': headers, 'timeout': DOWNLOAD_TIMEOUT,
-                  'verify': SSL_VERIFY, 'stream': True} # just gets the headers for now
+                  'verify': SSL_VERIFY, 'stream': True}  # just gets the headers for now
         if 'ckan.download_proxy' in config:
             proxy = config.get('ckan.download_proxy')
             kwargs['proxies'] = {'http': proxy, 'https': proxy}
diff --git a/ckanext/qa/tests/fake_ckan.py b/ckanext/qa/tests/fake_ckan.py
index 30b85601..c8434cbc 100644
--- a/ckanext/qa/tests/fake_ckan.py
+++ b/ckanext/qa/tests/fake_ckan.py
@@ -10,12 +10,12 @@
                     'last_success': '2008-10-01',
                     'first_failure': '',
                     'failure_count': 0,
-                    }),
+                }),
                 'stack': '',
                 'last_updated': '2008-10-10T19:30:37.536836',
                 }
      }
-    )
+)
 
 request_store = []
 task_status = {'archiver': TASK_STATUS_ARCHIVER_OK,
diff --git a/ckanext/qa/tests/mock_remote_server.py b/ckanext/qa/tests/mock_remote_server.py
index b43fb77d..9c59ed5f 100644
--- a/ckanext/qa/tests/mock_remote_server.py
+++ b/ckanext/qa/tests/mock_remote_server.py
@@ -7,6 +7,7 @@
 from time import sleep
 from wsgiref.simple_server import make_server
 import urllib2
+import six
 import socket
 
 
@@ -37,7 +38,7 @@ def serve(self, host='localhost', port_range=(8000, 9000)):
         This uses context manager to make sure the server is stopped::
 
             >>> with MockTestServer().serve() as addr:
-            ...     print urllib2.urlopen('%s/?content=hello+world').read()
+            ...     print(urllib2.urlopen('%s/?content=hello+world').read())
             ...
             'hello world'
         """
@@ -80,8 +81,8 @@ def get_content(cls, varspec):
         called and its return value used.
         """
         modpath, var = varspec.split(':')
-        mod = reduce(getattr, modpath.split('.')[1:], __import__(modpath))
-        var = reduce(getattr, var.split('.'), mod)
+        mod = six.moves.reduce(getattr, modpath.split('.')[1:], __import__(modpath))
+        var = six.moves.reduce(getattr, var.split('.'), mod)
         try:
             return var()
         except TypeError:
@@ -116,7 +117,7 @@ def __call__(self, environ, start_response):
         else:
             content = request.str_params.get('content', '')
 
-        if isinstance(content, unicode):
+        if isinstance(content, six.string_types):
             raise TypeError("Expected raw byte string for content")
 
         headers = [
diff --git a/ckanext/qa/tests/test_link_checker.py b/ckanext/qa/tests/test_link_checker.py
index 550a016e..cd8a79a6 100644
--- a/ckanext/qa/tests/test_link_checker.py
+++ b/ckanext/qa/tests/test_link_checker.py
@@ -121,12 +121,12 @@ def test_colon_in_query_string(self, url):
         # accept, because browsers accept this
         # see discussion: http://trac.ckan.org/ticket/318
         result = self.check_link(url)
-        print result
+        print(result)
         assert_equal(result['url_errors'], [])
 
     @with_mock_url('?status=200 ')
     def test_trailing_whitespace(self, url):
         # accept, because browsers accept this
         result = self.check_link(url)
-        print result
+        print(result)
         assert_equal(result['url_errors'], [])
diff --git a/ckanext/qa/tests/test_sniff_format.py b/ckanext/qa/tests/test_sniff_format.py
index f7b86577..c3b7f279 100644
--- a/ckanext/qa/tests/test_sniff_format.py
+++ b/ckanext/qa/tests/test_sniff_format.py
@@ -292,5 +292,5 @@ def test_turtle_regex():
 
 def test_is_ttl__num_triples():
     triple = '<subject> <predicate> <object>; <predicate> <object>.'
-    assert not is_ttl('\n'.join([triple]*2))
-    assert is_ttl('\n'.join([triple]*5))
+    assert not is_ttl('\n'.join([triple] * 2))
+    assert is_ttl('\n'.join([triple] * 5))
diff --git a/ckanext/qa/tests/test_tasks.py b/ckanext/qa/tests/test_tasks.py
index 4a9bf3bd..9f59f207 100644
--- a/ckanext/qa/tests/test_tasks.py
+++ b/ckanext/qa/tests/test_tasks.py
@@ -78,7 +78,7 @@ def test_trigger_on_archival(cls):
         context = {'model': model, 'ignore_auth': True, 'session': model.Session, 'user': 'test'}
         pkg = {'name': 'testpkg', 'license_id': 'uk-ogl', 'resources': [
             {'url': 'http://test.com/', 'format': 'CSV', 'description': 'Test'}
-            ]}
+        ]}
         pkg = get_action('package_create')(context, pkg)
         resource_dict = pkg['resources'][0]
         res_id = resource_dict['id']
@@ -304,7 +304,7 @@ def get_qa_result(cls, **kwargs):
             'openness_score_reason': 'Detected as CSV which scores 3',
             'format': 'CSV',
             'archival_timestamp': datetime.datetime(2015, 12, 16),
-            }
+        }
         qa_result.update(kwargs)
         return qa_result
 
@@ -335,7 +335,7 @@ def test_simple(self):
             'url': 'http://example.com/file.csv',
             'title': 'Some data',
             'format': '',
-            }
+        }
         dataset = ckan_factories.Dataset(resources=[resource])
         resource = model.Resource.get(dataset['resources'][0]['id'])
 
@@ -359,7 +359,7 @@ def test_simple(self):
             'url': 'http://example.com/file.csv',
             'title': 'Some data',
             'format': '',
-            }
+        }
         dataset = ckan_factories.Dataset(resources=[resource])
         resource = model.Resource.get(dataset['resources'][0]['id'])
 

From c9a605b116d490b3039292cb8e28f9e7eb301570 Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 14:05:24 +1000
Subject: [PATCH 10/28] update requirements

- patch xlrd
- don't force versions of ckanext-archiver and ckanext-report, just verify they're present
---
 requirements.txt | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 70da4e40..0c9c70c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,10 @@
-xlrd==1.0.0
-python-magic==0.4.12
+xlrd==1.1.0
+#python-magic==0.4.15 #in ckancore
 messytables==0.15.2
 progressbar==2.3
+#SQLAlchemy>=0.6.6 #in ckancore
+#requests==2.11.1 #in ckancore
+six>=1.0.0 #in ckancore
+
+ckanext-archiver
+ckanext-report

From ab0889a7f5d317639fc6b59b38f302fdd90185c1 Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Thu, 11 Feb 2021 15:49:14 +1000
Subject: [PATCH 11/28] [QOL-6491] fix HTTPError syntax

---
 ckanext/qa/tasks.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index a9a857e8..1142e6ea 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -442,9 +442,8 @@ def _download_url(url):
         tmp_file.close()
         os.remove(tmp_file.name)
         raise requests.exceptions.HTTPError(
-            "Received a bad HTTP response when trying to download the data file",
-            status_code=error.response.status_code,
-            request_url=url, response=error)
+            url, error.response.status_code,
+            "Received a bad HTTP response when trying to download the data file")
     except requests.exceptions.Timeout:
         log.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
         tmp_file.close()
@@ -459,9 +458,7 @@ def _download_url(url):
         log.warning('URL error: {}'.format(err_message))
         tmp_file.close()
         os.remove(tmp_file.name)
-        raise requests.exceptions.HTTPError(
-            message=err_message, status_code=None,
-            request_url=url, response=None)
+        raise requests.exceptions.HTTPError(url, None, err_message)
 
     log.info('Downloaded ok - %s', printable_file_size(length))
     tmp_file.seek(0)

From ea73fb8618929348d1344d8201789b093f3ed040 Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Thu, 11 Feb 2021 16:07:33 +1000
Subject: [PATCH 12/28] [QOL-6491] fix HTTPError argument order

---
 ckanext/qa/tasks.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ckanext/qa/tasks.py b/ckanext/qa/tasks.py
index 1142e6ea..c33d612e 100644
--- a/ckanext/qa/tasks.py
+++ b/ckanext/qa/tasks.py
@@ -442,8 +442,9 @@ def _download_url(url):
         tmp_file.close()
         os.remove(tmp_file.name)
         raise requests.exceptions.HTTPError(
-            url, error.response.status_code,
-            "Received a bad HTTP response when trying to download the data file")
+            error.response.status_code,
+            "Received a bad HTTP response when trying to download the data file",
+            url)
     except requests.exceptions.Timeout:
         log.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
         tmp_file.close()
@@ -458,7 +459,7 @@ def _download_url(url):
         log.warning('URL error: {}'.format(err_message))
         tmp_file.close()
         os.remove(tmp_file.name)
-        raise requests.exceptions.HTTPError(url, None, err_message)
+        raise requests.exceptions.HTTPError(None, err_message, url)
 
     log.info('Downloaded ok - %s', printable_file_size(length))
     tmp_file.seek(0)

From b19afa4cdfa1ac98d55891e55c2100f21725ccd5 Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 12 Feb 2021 09:04:32 +1000
Subject: [PATCH 13/28] [QOL-6491] oops fix six field name

---
 ckanext/qa/sniff_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckanext/qa/sniff_format.py b/ckanext/qa/sniff_format.py
index 13b4dc6e..e1d5869a 100644
--- a/ckanext/qa/sniff_format.py
+++ b/ckanext/qa/sniff_format.py
@@ -36,7 +36,7 @@ def sniff_file_format(filepath):
     '''
     format_ = None
     log.info('Sniffing file format of: %s', filepath)
-    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, six.text_types) \
+    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, six.string_types) \
         else filepath
     mime_type = magic.from_file(filepath_utf8, mime=True)
     log.info('Magic detects file as: %s', mime_type)

From cf9bc028b97975dc255998cdf6e19b8e3bef09bc Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 12 Feb 2021 12:27:27 +1000
Subject: [PATCH 14/28] sync mock server with the one from ckanext-archiver,
 including fixing text vs binary typing

---
 ckanext/qa/tests/mock_remote_server.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/ckanext/qa/tests/mock_remote_server.py b/ckanext/qa/tests/mock_remote_server.py
index 9c59ed5f..3761a90d 100644
--- a/ckanext/qa/tests/mock_remote_server.py
+++ b/ckanext/qa/tests/mock_remote_server.py
@@ -97,7 +97,8 @@ class MockEchoTestServer(MockHTTPServer):
         a 500 error response: 'http://localhost/?status=500'
 
         a 200 OK response, returning the function's docstring:
-         'http://localhost/?status=200;content-type=text/plain;content_var=ckan.tests.lib.test_package_search:test_wsgi_app.__doc__'
+        'http://localhost/?status=200;content-type=text/plain;content_var
+        =ckan.tests.lib.test_package_search:test_wsgi_app.__doc__'
 
     To specify content, use:
 
@@ -114,10 +115,16 @@ def __call__(self, environ, start_response):
         if 'content_var' in request.str_params:
             content = request.str_params.get('content_var')
             content = self.get_content(content)
+        elif 'content_long' in request.str_params:
+            content = '*' * 1000001
         else:
             content = request.str_params.get('content', '')
+        if 'method' in request.str_params \
+                and request.method.lower() != request.str_params['method'].lower():
+            content = ''
+            status = 405
 
-        if isinstance(content, six.string_types):
+        if isinstance(content, six.text_type):
             raise TypeError("Expected raw byte string for content")
 
         headers = [
@@ -125,8 +132,11 @@ def __call__(self, environ, start_response):
             for item in request.str_params.items()
             if item[0] not in ('content', 'status')
         ]
-        if content:
-            headers += [('Content-Length', str(len(content)))]
+        if 'length' in request.str_params:
+            cl = request.str_params.get('length')
+            headers += [('Content-Length', cl)]
+        elif content and 'no-content-length' not in request.str_params:
+            headers += [('Content-Length', str(len(content)))]
         start_response(
             '%d %s' % (status, responses[status]),
             headers

From 1454f6e986ebef8bf82b2043f93c9472127d7a6d Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 12 Feb 2021 13:02:35 +1000
Subject: [PATCH 15/28] improve assertion error message

---
 ckanext/qa/tests/test_sniff_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckanext/qa/tests/test_sniff_format.py b/ckanext/qa/tests/test_sniff_format.py
index c3b7f279..beb7b7b5 100644
--- a/ckanext/qa/tests/test_sniff_format.py
+++ b/ckanext/qa/tests/test_sniff_format.py
@@ -30,7 +30,7 @@ def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath):
         '''Given a filepath, checks the sniffed format matches the format_extension.'''
         expected_format = format_extension
         sniffed_format = sniff_file_format(filepath)
-        assert sniffed_format, expected_format
+        assert sniffed_format, "Expected {} but failed to sniff any format: {}".format(expected_format, sniffed_format)
         expected_format_without_zip = expected_format.replace('.zip', '')
         assert_equal(sniffed_format['format'].lower(), expected_format_without_zip)
 

From 3ad03ff59915f7c15ea86a57c54bf38fda76088c Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 12 Feb 2021 13:27:45 +1000
Subject: [PATCH 16/28] improve error messages for testing file sniffing

---
 ckanext/qa/tests/test_sniff_format.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/ckanext/qa/tests/test_sniff_format.py b/ckanext/qa/tests/test_sniff_format.py
index beb7b7b5..2d1a35de 100644
--- a/ckanext/qa/tests/test_sniff_format.py
+++ b/ckanext/qa/tests/test_sniff_format.py
@@ -30,7 +30,7 @@ def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath):
         '''Given a filepath, checks the sniffed format matches the format_extension.'''
         expected_format = format_extension
         sniffed_format = sniff_file_format(filepath)
-        assert sniffed_format, "Expected {} but failed to sniff any format: {}".format(expected_format, sniffed_format)
+        assert sniffed_format, "Expected {} but failed to sniff any format for file: {}".format(expected_format, filepath)
         expected_format_without_zip = expected_format.replace('.zip', '')
         assert_equal(sniffed_format['format'].lower(), expected_format_without_zip)
 
@@ -49,16 +49,11 @@ def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath):
     def check_format(cls, format, filename=None):
         for format_extension, filepath in cls.fixture_files:
             if format_extension == format:
-                if filename:
-                    if filename in filepath:
-                        break
-                    else:
-                        continue
-                else:
-                    break
+                if not filename or filename in filepath:
+                    cls.assert_file_has_format_sniffed_correctly(format_extension, filepath)
+                break
         else:
             assert 0, format  # Could not find fixture for format
-        cls.assert_file_has_format_sniffed_correctly(format_extension, filepath)
 
     def test_xls(self):
         self.check_format('xls', '10-p108-data-results')

From 0773ee70dde1d9189a92c8df42b09fa18b25f9ad Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 12 Feb 2021 13:40:53 +1000
Subject: [PATCH 17/28] oops fix loop termination logic

---
 ckanext/qa/tests/test_sniff_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckanext/qa/tests/test_sniff_format.py b/ckanext/qa/tests/test_sniff_format.py
index 2d1a35de..86039bc8 100644
--- a/ckanext/qa/tests/test_sniff_format.py
+++ b/ckanext/qa/tests/test_sniff_format.py
@@ -51,7 +51,7 @@ def check_format(cls, format, filename=None):
             if format_extension == format:
                 if not filename or filename in filepath:
                     cls.assert_file_has_format_sniffed_correctly(format_extension, filepath)
-                break
+                    break
         else:
             assert 0, format  # Could not find fixture for format
 

From 7533f13ef1eafac353dbd11f3ab1bcb7f3cbec0d Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 12 Feb 2021 14:53:46 +1000
Subject: [PATCH 18/28] use BSD 'file' fallback more consistently when other
 methods fail

---
 ckanext/qa/sniff_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckanext/qa/sniff_format.py b/ckanext/qa/sniff_format.py
index e1d5869a..2a3b8a20 100644
--- a/ckanext/qa/sniff_format.py
+++ b/ckanext/qa/sniff_format.py
@@ -124,7 +124,7 @@ def sniff_file_format(filepath):
                 if has_rdfa(buf):
                     format_ = {'format': 'RDFa'}
 
-    else:
+    if not format_:
         # Excel files sometimes not picked up by magic, so try alternative
         if is_excel(filepath):
             format_ = {'format': 'XLS'}

From 7fcf5096d1e0e1b6673797457e3179a6ba126453 Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 14:14:37 +1000
Subject: [PATCH 19/28] move requirements to a separate file instead of
 setup.py

---
 setup.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/setup.py b/setup.py
index f6012055..779aed3d 100644
--- a/setup.py
+++ b/setup.py
@@ -17,20 +17,16 @@
     include_package_data=True,
     zip_safe=False,
     install_requires=[
-        'ckanext-archiver>=2.0',
-        'ckanext-report',
-        'SQLAlchemy>=0.6.6',
-        'requests',
-        'xlrd>=0.8.0',
-        'messytables>=0.8',
-        'python-magic>=0.4',
-        'progressbar',
-        'six>=1.9' # until messytables->html5lib releases https://github.com/html5lib/html5lib-python/pull/301
+      # CKAN extensions should not list dependencies here, but in a separate
+      # ``requirements.txt`` file.
+      #
+      # http://docs.ckan.org/en/latest/extensions/best-practices.html#add-third-party-libraries-to-requirements-txt
     ],
     tests_require=[
-        'nose',
-        'mock',
-        'flask'
+      # CKAN extensions should not list dependencies here, but in a separate
+      # ``dev-requirements.txt`` file.
+      #
+      # http://docs.ckan.org/en/latest/extensions/best-practices.html#add-third-party-libraries-to-requirements-txt
     ],
     entry_points='''
     [paste.paster_command]

From 5c753fbb241697c222b67d7bcc7d56d6f1b0e944 Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 14:25:00 +1000
Subject: [PATCH 20/28] update version number

- new minor version, because it introduces the download proxy and retrieving remote cached files as new features
---
 ckanext/qa/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckanext/qa/__init__.py b/ckanext/qa/__init__.py
index 53fd0507..21f26a28 100644
--- a/ckanext/qa/__init__.py
+++ b/ckanext/qa/__init__.py
@@ -6,4 +6,4 @@
     import pkgutil
     __path__ = pkgutil.extend_path(__path__, __name__)
 
-__version__ = '2.0'
+__version__ = '2.1.0-rc1'

From 6ca9dc4a518648e121f86278d14015a222f08f4d Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 14:41:46 +1000
Subject: [PATCH 21/28] fix Travis build scripts

- install newer setuptools if needed
- install transitive dependencies of ckanext-archiver and ckanext-report
- start to recognise Python 3
---
 bin/travis-build.bash | 48 ++++++++++++++++++++++++++++++-------------
 bin/travis-run.sh     |  2 +-
 2 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/bin/travis-build.bash b/bin/travis-build.bash
index fa1b072c..284ebb82 100644
--- a/bin/travis-build.bash
+++ b/bin/travis-build.bash
@@ -6,7 +6,16 @@ echo "This is travis-build.bash..."
 
 echo "Installing the packages that CKAN requires..."
 sudo apt-get update -qq
-sudo apt-get install solr-jetty libcommons-fileupload-java
+sudo apt-get install -y solr-jetty libcommons-fileupload-java
+
+ver=$(python -c"import sys; print(sys.version_info.major)")
+if [ $ver -eq 2 ]; then
+    echo "python version 2"
+elif [ $ver -eq 3 ]; then
+    echo "python version 3"
+else
+    echo "Unknown python version: $ver"
+fi
 
 echo "Upgrading libmagic for ckanext-qa..."
 # appears to upgrade it from 5.09-2 to 5.09-2ubuntu0.6 which seems to help the tests
@@ -16,6 +25,10 @@ echo "Installing CKAN and its Python dependencies..."
 git clone https://github.com/ckan/ckan
 cd ckan
 
+if [ $ver -eq 3 ]; then
+    pip install -r requirement-setuptools.txt
+fi
+
 if [ $CKANVERSION == 'master' ]
 then
     echo "CKAN version: master"
@@ -25,14 +38,14 @@ else
     echo "CKAN version: ${CKAN_TAG#ckan-}"
 fi
 
-python setup.py develop
-if [ -f requirements-py2.txt ]
+if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]
 then
     pip install -r requirements-py2.txt
 else
     pip install -r requirements.txt
 fi
 pip install -r dev-requirements.txt --allow-all-external
+python setup.py develop
 cd -
 
 echo "Setting up Solr..."
@@ -54,22 +67,29 @@ paster db init -c test-core.ini
 cd -
 
 echo "Installing dependency ckanext-report and its requirements..."
-pip install -e git+https://github.com/datagovuk/ckanext-report.git#egg=ckanext-report
+git clone --depth=50 https://github.com/datagovuk/ckanext-report.git
+cd ckanext-report
+  if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]; then
+    pip install -r requirements-py2.txt
+  elif [ -f requirements.txt ]; then
+    pip install -r requirements.txt
+  fi
+  pip install --no-deps -e .
+cd -
 
 echo "Installing dependency ckanext-archiver and its requirements..."
-git clone https://github.com/ckan/ckanext-archiver.git
+git clone --depth=50 https://github.com/ckan/ckanext-archiver.git
 cd ckanext-archiver
-pip install -e .
-pip install -r requirements.txt
+  if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]; then
+    pip install -r requirements-py2.txt
+  elif [ -f requirements.txt ]; then
+    pip install -r requirements.txt
+  fi
+  pip install --no-deps -e .
 cd -
 
-echo "Installing ckanext-qa and its requirements..."
-python setup.py develop
-pip install -r requirements.txt
-pip install -r dev-requirements.txt
-
 echo "Moving test-core.ini into a subdir..."
-mkdir subdir
-mv test-core.ini subdir
+mkdir -p subdir
+cp test-core.ini subdir
 
 echo "travis-build.bash is done."
diff --git a/bin/travis-run.sh b/bin/travis-run.sh
index 5c4022b7..1a6e7ef3 100644
--- a/bin/travis-run.sh
+++ b/bin/travis-run.sh
@@ -3,4 +3,4 @@
 echo "NO_START=0\nJETTY_HOST=127.0.0.1\nJETTY_PORT=8983\nJAVA_HOME=$JAVA_HOME" | sudo tee /etc/default/jetty
 sudo cp ckan/ckan/config/solr/schema.xml /etc/solr/conf/schema.xml
 sudo service jetty restart
-nosetests --with-pylons=subdir/test-core.ini --with-coverage --cover-package=ckanext.archiver --cover-inclusive --cover-erase --cover-tests
+nosetests --with-pylons=subdir/test-core.ini --with-coverage --cover-package=ckanext.qa --cover-inclusive --cover-erase --cover-tests

From e65cb4f72a0cae3479efdabf3c81cd7e6c9b4725 Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 15:06:00 +1000
Subject: [PATCH 22/28] fix condition on updating setuptools for TravisCI

---
 bin/travis-build.bash | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/travis-build.bash b/bin/travis-build.bash
index 284ebb82..4687ec49 100644
--- a/bin/travis-build.bash
+++ b/bin/travis-build.bash
@@ -25,7 +25,7 @@ echo "Installing CKAN and its Python dependencies..."
 git clone https://github.com/ckan/ckan
 cd ckan
 
-if [ $ver -eq 3 ]; then
+if [ -f requirement-setuptools.txt ]; then
     pip install -r requirement-setuptools.txt
 fi
 

From b2a0c23a90dea9f0ff40a20b9fbbf4d105bcb569 Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 15:15:55 +1000
Subject: [PATCH 23/28] fix paster reference to ckan for testing

---
 bin/travis-build.bash | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/bin/travis-build.bash b/bin/travis-build.bash
index 4687ec49..1a57798e 100644
--- a/bin/travis-build.bash
+++ b/bin/travis-build.bash
@@ -62,9 +62,7 @@ sudo -u postgres psql -c "CREATE USER ckan_default WITH PASSWORD 'pass';"
 sudo -u postgres psql -c 'CREATE DATABASE ckan_test WITH OWNER ckan_default;'
 
 echo "Initialising the database..."
-cd ckan
-paster db init -c test-core.ini
-cd -
+paster --plugin=ckan db init -c test-core.ini
 
 echo "Installing dependency ckanext-report and its requirements..."
 git clone --depth=50 https://github.com/datagovuk/ckanext-report.git
@@ -88,6 +86,11 @@ cd ckanext-archiver
   pip install --no-deps -e .
 cd -
 
+echo "Installing ckanext-qa and its requirements..."
+pip install -r requirements.txt
+pip install -r dev-requirements.txt
+python setup.py develop
+
 echo "Moving test-core.ini into a subdir..."
 mkdir -p subdir
 cp test-core.ini subdir

From 511b2d2e7ea088bf610d9ae179325d78dd01675e Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 15:30:53 +1000
Subject: [PATCH 24/28] move DB operations together in TravisCI

---
 bin/travis-build.bash | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/bin/travis-build.bash b/bin/travis-build.bash
index 1a57798e..d44eee00 100644
--- a/bin/travis-build.bash
+++ b/bin/travis-build.bash
@@ -23,7 +23,7 @@ sudo apt-get install libmagic1
 
 echo "Installing CKAN and its Python dependencies..."
 git clone https://github.com/ckan/ckan
-cd ckan
+pushd ckan
 
 if [ -f requirement-setuptools.txt ]; then
     pip install -r requirement-setuptools.txt
@@ -46,7 +46,15 @@ else
 fi
 pip install -r dev-requirements.txt --allow-all-external
 python setup.py develop
-cd -
+
+echo "Creating the PostgreSQL user and database..."
+sudo -u postgres psql -c "CREATE USER ckan_default WITH PASSWORD 'pass';"
+sudo -u postgres psql -c 'CREATE DATABASE ckan_test WITH OWNER ckan_default;'
+
+echo "Initialising the database..."
+paster db init -c test-core.ini
+
+popd
 
 echo "Setting up Solr..."
 # solr is multicore for tests on ckan master now, but it's easier to run tests
@@ -57,34 +65,27 @@ printf "NO_START=0\nJETTY_HOST=127.0.0.1\nJETTY_PORT=8983\nJAVA_HOME=$JAVA_HOME"
 sudo cp ckan/ckan/config/solr/schema.xml /etc/solr/conf/schema.xml
 sudo service jetty restart
 
-echo "Creating the PostgreSQL user and database..."
-sudo -u postgres psql -c "CREATE USER ckan_default WITH PASSWORD 'pass';"
-sudo -u postgres psql -c 'CREATE DATABASE ckan_test WITH OWNER ckan_default;'
-
-echo "Initialising the database..."
-paster --plugin=ckan db init -c test-core.ini
-
 echo "Installing dependency ckanext-report and its requirements..."
 git clone --depth=50 https://github.com/datagovuk/ckanext-report.git
-cd ckanext-report
+pushd ckanext-report
   if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]; then
     pip install -r requirements-py2.txt
   elif [ -f requirements.txt ]; then
     pip install -r requirements.txt
   fi
   pip install --no-deps -e .
-cd -
+popd
 
 echo "Installing dependency ckanext-archiver and its requirements..."
 git clone --depth=50 https://github.com/ckan/ckanext-archiver.git
-cd ckanext-archiver
+pushd ckanext-archiver
   if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]; then
     pip install -r requirements-py2.txt
   elif [ -f requirements.txt ]; then
     pip install -r requirements.txt
   fi
   pip install --no-deps -e .
-cd -
+popd
 
 echo "Installing ckanext-qa and its requirements..."
 pip install -r requirements.txt

From e187a1d36f8638654f059941f9ee02c2dfab06f7 Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 15:49:00 +1000
Subject: [PATCH 25/28] use 'ckan' instead of 'paster' for CKAN 2.9+

---
 bin/travis-build.bash | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/bin/travis-build.bash b/bin/travis-build.bash
index d44eee00..b6a21d86 100644
--- a/bin/travis-build.bash
+++ b/bin/travis-build.bash
@@ -32,13 +32,15 @@ fi
 if [ $CKANVERSION == 'master' ]
 then
     echo "CKAN version: master"
+    export CKAN_MINOR_VERSION=100
 else
+    export CKAN_MINOR_VERSION=${CKANVERSION##*.}
     CKAN_TAG=$(git tag | grep ^ckan-$CKANVERSION | sort --version-sort | tail -n 1)
     git checkout $CKAN_TAG
     echo "CKAN version: ${CKAN_TAG#ckan-}"
 fi
 
-if [ -f requirements-py2.txt ] && [ $ver -eq 2 ]
+if (( $CKAN_MINOR_VERSION >= 9 )) && (( $ver = 2 ))
 then
     pip install -r requirements-py2.txt
 else
@@ -52,7 +54,12 @@ sudo -u postgres psql -c "CREATE USER ckan_default WITH PASSWORD 'pass';"
 sudo -u postgres psql -c 'CREATE DATABASE ckan_test WITH OWNER ckan_default;'
 
 echo "Initialising the database..."
-paster db init -c test-core.ini
+if (( $CKAN_MINOR_VERSION >= 9 ))
+then
+    ckan -c test-core.ini db init
+else
+    paster db init -c test-core.ini
+fi
 
 popd
 

From 686a71dad3cf5a9bd6af0e37034b3f98ea795776 Mon Sep 17 00:00:00 2001
From: antuarc <carl.antuar@smartservice.qld.gov.au>
Date: Fri, 5 Mar 2021 15:57:02 +1000
Subject: [PATCH 26/28] use 'master' if CKANVERSION is unspecified

---
 bin/travis-build.bash | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/travis-build.bash b/bin/travis-build.bash
index b6a21d86..1805f7ab 100644
--- a/bin/travis-build.bash
+++ b/bin/travis-build.bash
@@ -29,7 +29,7 @@ if [ -f requirement-setuptools.txt ]; then
     pip install -r requirement-setuptools.txt
 fi
 
-if [ $CKANVERSION == 'master' ]
+if [ ${CKANVERSION:-master} == 'master' ]
 then
     echo "CKAN version: master"
     export CKAN_MINOR_VERSION=100
@@ -40,7 +40,7 @@ else
     echo "CKAN version: ${CKAN_TAG#ckan-}"
 fi
 
-if (( $CKAN_MINOR_VERSION >= 9 )) && (( $ver = 2 ))
+if (( "$CKAN_MINOR_VERSION" >= 9 )) && (( $ver == 2 ))
 then
     pip install -r requirements-py2.txt
 else

From ab60ac8bacd63c4baef674ae03fe5527b0954124 Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Mon, 8 Mar 2021 13:08:13 +1000
Subject: [PATCH 27/28] add draft GitHub Actions workflow

- needs a bit more debugging but is close to working
---
 .github/workflows/test.yml | 123 +++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 .github/workflows/test.yml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 00000000..2df30efb
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,123 @@
+---
+#based on https://raw.githubusercontent.com/ckan/ckanext-scheming/master/.github/workflows/test.yml
+# alternative https://github.com/ckan/ckan/blob/master/contrib/cookiecutter/ckan_extension/%7B%7Bcookiecutter.project%7D%7D/.github/workflows/test.yml
+name: Tests
+on: [push, pull_request]
+env:
+  CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test
+  CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test
+  CKAN_DATASTORE_READ_URL: postgresql://datastore_read:pass@postgres/datastore_test
+  CKAN_SOLR_URL: http://solr:8983/solr/ckan
+  CKAN_REDIS_URL: redis://redis:6379/1
+jobs:
+
+
+
+
+  lint:
+    runs-on: ubuntu-18.04
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.6'
+      - name: Install requirements
+        run: pip install flake8 pycodestyle
+      - name: Check syntax
+        run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --exclude ckan
+
+  test:
+    needs: lint
+    strategy:
+      matrix:
+        # not ready for CKAN 2.9 yet
+        # ckan-version: [2.9, 2.9-py2, 2.8, 2.7]
+        ckan-version: [2.8, 2.7]
+        env:
+          - { ARCHIVER_GIT_REPO: "ckan", ARCHIVER_BRANCH: "master", REPORT_GIT_REPO: "datagovuk", REPORT_BRANCH: "master" }
+          - { ARCHIVER_GIT_REPO: "qld-gov-au", ARCHIVER_BRANCH: "2.1.0-qgov.1", REPORT_GIT_REPO: "qld-gov-au", REPORT_BRANCH: "0.1" }
+          - { ARCHIVER_GIT_REPO: "qld-gov-au", ARCHIVER_BRANCH: "develop", REPORT_GIT_REPO: "qld-gov-au", REPORT_BRANCH: "develop" }
+      fail-fast: false
+
+    name: CKAN ${{ matrix.ckan-version }}
+    runs-on: ubuntu-18.04
+    container:
+      image: openknowledge/ckan-dev:${{ matrix.ckan-version }}
+    services:
+      solr:
+        image: ckan/ckan-solr-dev:${{ matrix.ckan-version }}
+      postgres:
+        image: ckan/ckan-postgres-dev:${{ matrix.ckan-version }}
+        env:
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
+          POSTGRES_DB: postgres
+        options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
+      redis:
+          image: redis:3
+    env: ${{ matrix.env }}
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Install report and archiver plugins
+      run: |
+        echo "Installing dependency ckanext-report and its requirements..."
+        if [ ! -d ckanext-report ]; then
+          git clone --depth=50 --branch=$REPORT_BRANCH https://github.com/$REPORT_GIT_REPO/ckanext-report ckanext-report
+        fi
+        cd ckanext-report
+          if [ -f pip-requirements.txt ]; then
+            pip install -r pip-requirements.txt
+          fi
+          if [ -f dev-requirements.txt ]; then
+            pip install -r dev-requirements.txt
+          fi
+
+          if [ -f requirements.txt ]; then
+            pip install -r requirements.txt
+          fi
+          pip install --no-deps -e .
+        cd ..
+
+        echo "Installing dependency ckanext-archiver and its requirements..."
+        if [ ! -d ckanext-archiver ]; then
+          git clone --depth=50 --branch=$ARCHIVER_BRANCH https://github.com/$ARCHIVER_GIT_REPO/ckanext-archiver ckanext-archiver
+        fi
+        cd ckanext-archiver
+          if [ -f pip-requirements.txt ]; then
+            pip install -r pip-requirements.txt
+          fi
+          if [ -f dev-requirements.txt ]; then
+            pip install -r dev-requirements.txt
+          fi
+
+          if [ -f requirements.txt ]; then
+            pip install -r requirements.txt
+          fi
+          pip install --no-deps -e .
+        cd ..
+
+    - name: Install requirements
+      run: |
+        pip install -r dev-requirements.txt
+        pip install -r pip-requirements.txt
+        pip install -r requirements.txt
+        pip install -e .
+        apk add file
+        # Replace default path to CKAN core config file with the one on the container
+        sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
+
+    - name: Setup extension (CKAN >= 2.9)
+      if: ${{ matrix.ckan-version != '2.7' && matrix.ckan-version != '2.8' }}
+      run: |
+        ckan -c test.ini db init
+    - name: Setup extension (CKAN < 2.9)
+      if: ${{ matrix.ckan-version == '2.7' || matrix.ckan-version == '2.8' }}
+      run: |
+        paster --plugin=ckan db init -c test.ini
+    - name: Run all tests
+      run: |
+        nosetests --with-pylons=test.ini --with-coverage --cover-package=ckanext.qa --cover-inclusive --cover-erase --cover-tests
+
+

From a0a4ba3467903297f90e353dbe2bf43731515539 Mon Sep 17 00:00:00 2001
From: ThrawnCA <shell_layer-github@yahoo.com.au>
Date: Mon, 8 Mar 2021 13:20:36 +1000
Subject: [PATCH 28/28] only install from pip-requirements file if present

---
 .github/workflows/test.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2df30efb..311db9ab 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -101,7 +101,9 @@ jobs:
     - name: Install requirements
       run: |
         pip install -r dev-requirements.txt
-        pip install -r pip-requirements.txt
+        if [ -f pip-requirements.txt ]; then
+          pip install -r pip-requirements.txt
+        fi
         pip install -r requirements.txt
         pip install -e .
         apk add file