Commit

chore: move commands to common and cross reference in cli also
duttonw committed Dec 11, 2024
1 parent 8550942 commit 8755a78
Showing 3 changed files with 344 additions and 266 deletions.
81 changes: 74 additions & 7 deletions ckanext/validation/cli.py
@@ -2,7 +2,7 @@

import click

from ckanext.validation.model import create_tables, tables_exist
from ckanext.validation import common


def get_commands():
@@ -17,10 +17,77 @@ def validation():

@validation.command()
def init_db():
"""Creates the necessary tables in the database."""
if tables_exist():
print(u"Validation tables already exist")
sys.exit(0)
""" Initialize database tables.
"""
common.init_db()

create_tables()
print(u"Validation tables created")

@validation.command(name='run')
@click.option(u'-y', u'--yes',
help=u'Automatic yes to prompts. Assume "yes" as answer '
u'to all prompts and run non-interactively',
default=False)
@click.option('-r', '--resource',
multiple=True,
help=u'Run data validation on a particular resource (if the format is suitable).'
u'It can be defined multiple times. Not to be used with -d or -s')
@click.option('-d', '--dataset',
multiple=True,
help=u'Run data validation on all resources for a particular dataset (if the format is suitable).'
u' You can use the dataset id or name, and it can be defined multiple times. '
u'Not to be used with -r or -s')
@click.option('-s', '--search',
default=False,
help=u'Extra search parameters that will be used for getting the datasets to run '
u'validation on. It must be a JSON object like the one used by the `package_search` API call.'
u' Supported fields are `q`, `fq` and `fq_list`. Check the documentation for examples. '
u'Note that when using this you will have to specify the resource formats to target yourself.'
u' Not to be used with -r or -d.')
def run_validation(yes, resource, dataset, search):
'''Start asynchronous data validation on the site resources. If no
options are provided it will run validation on all resources of
the supported formats (`ckanext.validation.formats`). You can
specify particular datasets to run the validation on their
resources. You can also pass arbitrary search parameters to filter
the selected datasets.
'''
common.run_validation(yes, resource, dataset, search)
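
For context, the new Click group is exposed through CKAN's `ckan` entry point via `get_commands()`, so on CKAN 2.9+ the command should be invoked roughly as follows. The config path, dataset name and search query are illustrative placeholders, not part of this commit:

    ckan -c /etc/ckan/default/ckan.ini validation run -d my-dataset
    ckan -c /etc/ckan/default/ckan.ini validation run -s '{"fq": "res_format:CSV"}'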


@validation.command()
@click.option(u'-o', u'--output',
help=u'Location of the CSV validation report file on the relevant commands.',
default=u'validation_errors_report.csv')
def report(output):
'''Generate a report with all current data validation reports. This
will print an overview of the total number of tabular resources
and a breakdown of how many have a validation status of success,
failure or error. Additionally it will create a CSV report with all
failing resources, including the following fields:
* Dataset name
* Resource id
* Resource URL
* Status
* Validation report URL
'''
common.report(output)


@validation.command(name='report-full')
@click.option(u'-o', u'--output',
help=u'Location of the CSV validation report file on the relevant commands.',
default=u'validation_errors_report.csv')
def report_full(output):
'''Generate a detailed report. This is similar to 'report'
but on the CSV report it will add a row for each error found on the
validation report (limited to ten occurrences of the same error
type per file). So the fields in the generated CSV report will be:
* Dataset name
* Resource id
* Resource URL
* Status
* Error code
* Error message
'''
common.report(output, full=True)
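
The shared helpers these commands now delegate to live in `ckanext.validation.common` (presumably the third changed file, whose diff is not expanded above). The sketch below is an assumption inferred from the calls `common.init_db()`, `common.run_validation(...)` and `common.report(...)` and from the code removed from commands.py; it is not the committed implementation:

    # ckanext/validation/common.py -- hypothetical sketch, inferred from the diff
    import sys

    from ckanext.validation.model import create_tables, tables_exist


    def init_db():
        # Create the validation tables unless they already exist.
        if tables_exist():
            print(u"Validation tables already exist")
            sys.exit(0)
        create_tables()
        print(u"Validation tables created")


    def run_validation(assume_yes, resource_ids, dataset_ids, search_params):
        # Queue asynchronous validation jobs for the selected resources,
        # mirroring the batch logic previously in commands.py.
        ...


    def report(output_csv, full=False):
        # Build the CSV report of failing resources and print a summary,
        # mirroring the reporting logic previously in commands.py.
        ...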
267 changes: 9 additions & 258 deletions ckanext/validation/commands.py
@@ -1,25 +1,10 @@
# encoding: utf-8

import sys
import logging
import csv

from ckan.lib.cli import query_yes_no
from ckantoolkit import CkanCommand, get_action, config
from ckantoolkit import CkanCommand

from ckanext.validation import settings
from ckanext.validation.model import create_tables, tables_exist
from ckanext.validation.logic import _search_datasets


def error(msg):
'''
Print an error message to STDOUT and exit with return code 1.
'''
sys.stderr.write(msg)
if not msg.endswith('\n'):
sys.stderr.write('\n')
sys.exit(1)
from ckanext.validation import common


class Validation(CkanCommand):
@@ -111,9 +96,6 @@ def __init__(self, name):
help='''Location of the CSV validation
report file on the relevant commands.''')


_page_size = 100

def command(self):
self._load_config()

@@ -137,249 +119,18 @@ def command(self):
sys.exit(1)

def init_db(self):

if tables_exist():
print(u'Validation tables already exist')
sys.exit(0)

create_tables()

print(u'Validation tables created')
common.init_db()

def run_validation(self):

if self.options.resource_id:
for resource_id in self.options.resource_id:
resource = get_action('resource_show')({}, {'id': resource_id})
self._run_validation_on_resource(
resource['id'], resource['package_id'])
else:

query = _search_datasets()

if query['count'] == 0:
error('No suitable datasets, exiting...')

elif not self.options.assume_yes:

msg = ('\nYou are about to start validation for {0} datasets' +
'.\n Do you want to continue?')

confirm = query_yes_no(msg.format(query['count']))

if confirm == 'no':
error('Command aborted by user')

result = get_action('resource_validation_run_batch')(
{'ignore_auth': True},
{'dataset_ids': self.options.dataset_id,
'query': self.options.search_params}
)
print(result['output'])

def _run_validation_on_resource(self, resource_id, dataset_id):

log = logging.getLogger(__name__)

get_action(u'resource_validation_run')(
{u'ignore_auth': True},
{u'resource_id': resource_id,
u'async': True})

msg = ('Resource {} from dataset {} sent to ' +
'the validation queue')

log.debug(
msg.format(resource_id, dataset_id))

def _process_row(self, dataset, resource, writer):
resource_url = '{}/dataset/{}/resource/{}'.format(
config['ckan.site_url'],
dataset['name'],
resource['id'])

validation_url = resource_url + '/validation'

writer.writerow({
'dataset': dataset['name'],
'resource_id': resource['id'],
'format': resource['format'],
'url': resource_url,
'status': resource['validation_status'],
'validation_report_url': validation_url
})

return

def _process_row_full(self, dataset, resource, writer):

limit_per_error_type = 10

error_counts = {}

resource_url = '{}/dataset/{}/resource/{}'.format(
config['ckan.site_url'],
dataset['name'],
resource['id'])

# Get validation report
validation = get_action('resource_validation_show')(
{'ignore_auth': True}, {'resource_id': resource['id']})

if not validation.get('report'):
return
assume_yes = self.options.assume_yes
resource_ids = self.options.resource_id
dataset_ids = self.options.dataset_id
query = self.options.search_params

errors = validation['report']['tables'][0]['errors']

for error in errors:
if not error['code'] in error_counts:
error_counts[error['code']] = 1
else:
error_counts[error['code']] += 1

if error_counts[error['code']] > limit_per_error_type:
continue

writer.writerow({
'dataset': dataset['name'],
'resource_id': resource['id'],
'format': resource['format'],
'url': resource_url,
'status': resource['validation_status'],
'error_code': error['code'],
'error_message': error['message']
})

return error_counts
common.run_validation(assume_yes, resource_ids, dataset_ids, query)

def report(self, full=False):

log = logging.getLogger(__name__)

output_csv = self.options.output_file
if output_csv == 'validation_errors_report.csv' and full:
output_csv = 'validation_errors_report_full.csv'

outputs = {
'tabular_resources': 0,
'resources_failure': 0,
'resources_error': 0,
'resources_success': 0,
'datasets': 0,
'formats_success': {},
'formats_failure': {}
}
error_counts = {}

with open(output_csv, 'w') as fw:
if full:
fieldnames = [
'dataset', 'resource_id', 'format', 'url',
'status', 'error_code', 'error_message']
else:
fieldnames = [
'dataset', 'resource_id', 'format', 'url',
'status', 'validation_report_url']

writer = csv.DictWriter(fw, fieldnames=fieldnames)
writer.writeheader()

page = 1
while True:
query = _search_datasets(page)

if page == 1 and query['count'] == 0:
error('No suitable datasets, exiting...')

if query['results']:
for dataset in query['results']:

if not dataset.get('resources'):
continue

for resource in dataset['resources']:

if (not resource['format'].lower() in
settings.DEFAULT_SUPPORTED_FORMATS):
continue

outputs['tabular_resources'] += 1

if resource.get('validation_status'):
outputs['resources_' + resource['validation_status']] += 1

if resource.get('validation_status') in (
'failure', 'error'):
if full:
row_counts = self._process_row_full(dataset, resource, writer)
if not row_counts:
continue
for code, count in row_counts.iteritems():
if code not in error_counts:
error_counts[code] = count
else:
error_counts[code] += count
else:
self._process_row(dataset, resource, writer)

if resource['format'] in outputs['formats_failure']:
outputs['formats_failure'][resource['format']] += 1
else:
outputs['formats_failure'][resource['format']] = 1
else:
if resource['format'] in outputs['formats_success']:
outputs['formats_success'][resource['format']] += 1
else:
outputs['formats_success'][resource['format']] = 1


if len(query['results']) < self._page_size:
break

page += 1
else:
break

outputs['datasets'] = query['count']
outputs['output_csv'] = output_csv

outputs['formats_success_output'] = ''
for count, code in sorted([(v, k) for k, v in outputs['formats_success'].iteritems()], reverse=True):
outputs['formats_success_output'] += '* {}: {}\n'.format(code, count)

outputs['formats_failure_output'] = ''
for count, code in sorted([(v, k) for k, v in outputs['formats_failure'].iteritems()], reverse=True):
outputs['formats_failure_output'] += '* {}: {}\n'.format(code, count)

error_counts_output = ''
if full:
for count, code in sorted([(v, k) for k, v in error_counts.iteritems()], reverse=True):
error_counts_output += '* {}: {}\n'.format(code, count)

outputs['error_counts_output'] = error_counts_output

msg_errors = '''
Errors breakdown:
{}
'''.format(outputs['error_counts_output'])

outputs['msg_errors'] = msg_errors if full else ''

msg = '''
Done.
{datasets} datasets with tabular resources
{tabular_resources} tabular resources
{resources_success} resources - validation success
{resources_failure} resources - validation failure
{resources_error} resources - validation error
Formats breakdown (validation passed):
{formats_success_output}
Formats breakdown (validation failed or errored):
{formats_failure_output}
{msg_errors}
CSV Report stored in {output_csv}
'''.format(**outputs)


log.info(msg)
common.report(output_csv, full)
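
With both entry points now calling the same helpers, the legacy paster command and the new Click command should produce the same report. Assuming the paster command is registered as `validation` and its subcommands mirror the method names (neither is shown in this diff), the following invocations would be roughly equivalent; config paths are placeholders:

    paster --plugin=ckanext-validation validation report -c /etc/ckan/default/production.ini
    ckan -c /etc/ckan/default/ckan.ini validation report -o validation_errors_report.csv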
