Merge branch '138-vocabulary-terms' into 'dev.publicamundi.eu'

PublicaMundi · Jun 7, 2015 · 66db577 · 66db577
2 parents c84a49b + d533046
commit 66db577
Show file tree

Hide file tree

Showing 13 changed files with 164 additions and 76 deletions.
diff --git a/ckanext/publicamundi/controllers/api.py b/ckanext/publicamundi/controllers/api.py
@@ -121,15 +121,15 @@ def vocabulary_get(self, name):
         name = str(name)
         r = None
 
-        vocab = vocabularies.get_by_name(name)
+        vocab = vocabularies.get_by_name(name) 
         if vocab:
-            terms = vocab['vocabulary'].by_value
             r = {
                 'date_type': vocab.get('date_type'),
                 'reference_date': vocab.get('reference_date'),
                 'title': vocab.get('title'),
                 'name': vocab.get('name'),
-                'terms': [{ 'value': k, 'title': terms[k].title } for k in terms],
+                'terms': [{'token': t.token, 'value': t.value, 'title': t.title} 
+                    for t in vocab['vocabulary']],
             }
 
         response.headers['Content-Type'] = content_types['json']
@@ -149,7 +149,7 @@ def dataset_export(self, name_or_id):
         return
 
     def dataset_import(self):
-
+        
         post = request.params
 
         # Forward to the dataset_import action

diff --git a/ckanext/publicamundi/lib/metadata/base.py b/ckanext/publicamundi/lib/metadata/base.py
@@ -58,9 +58,9 @@ def flatten_field(field):
             'Only zope.schema.Choice supported for key_type'
         res = {}
         res1 = flatten_field(field.value_type)
-        for v in field.key_type.vocabulary:
+        for t in field.key_type.vocabulary:
             for k1, field1 in res1.items():
-                res[(v.token,) + k1] = field1
+                res[(t.value,) + k1] = field1
     else:
         res = { (): field }
 

diff --git a/ckanext/publicamundi/lib/metadata/schemata/inspire_metadata.py b/ckanext/publicamundi/lib/metadata/schemata/inspire_metadata.py
@@ -119,6 +119,15 @@ def check_keywords(obj):
                 raise zope.interface.Invalid(
                     'You need to select at least one keyword from INSPIRE data themes')
 
+    free_keywords = zope.schema.List(
+            title= u'Free Keywords',
+            description = u"The keyword value is a commonly used word, formalised word or phrase used to describe the subject. While the topic category is too coarse for detailed queries, keywords help narrowing a full text search and they allow for structured keyword search.",
+            required = False,
+            max_length = 10,
+            value_type = zope.schema.Object(IFreeKeyword,
+                title = u'Free Keyword'))
+    free_keywords.setTaggedValue('format:markup', { 'descend-if-dictized': False })
+
     # Geographic
 
     bounding_box = zope.schema.List(

diff --git a/ckanext/publicamundi/lib/metadata/types/_common.py b/ckanext/publicamundi/lib/metadata/types/_common.py
@@ -68,6 +68,17 @@ class FreeKeyword(Object):
     reference_date = None
     date_type = None
 
+    @classmethod
+    def normalize_keyword(cls, s):
+        from inflection import dasherize, underscore
+        return dasherize(underscore(unicode(s)))
+
+    def __init__(self, **kwargs):
+        value = kwargs.get('value')
+        if value:
+            kwargs['value'] = self.normalize_keyword(value)
+        super(FreeKeyword, self).__init__(**kwargs)
+
 @object_null_adapter()
 class GeographicBoundingBox(Object):
 

diff --git a/ckanext/publicamundi/lib/metadata/types/baz.py b/ckanext/publicamundi/lib/metadata/types/baz.py
@@ -7,7 +7,7 @@
 from ckanext.publicamundi.lib.metadata.types import Thesaurus, ThesaurusTerms
 from ckanext.publicamundi.lib.metadata.types._common import *
 
-thesaurus_gemet_themes = Thesaurus.make('keywords-gemet-themes')
+thesaurus_gemet_themes = Thesaurus.lookup('keywords-gemet-themes')
 
 class KeywordsFactory(object):
 

diff --git a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py
@@ -1,5 +1,7 @@
 import os
+import re
 import uuid
+import datetime
 import zope.interface
 import zope.schema
 from zope.schema.vocabulary import SimpleVocabulary
@@ -17,6 +19,8 @@
 from ckanext.publicamundi.lib.metadata.types.thesaurus import Thesaurus, ThesaurusTerms
 from ckanext.publicamundi.lib.metadata.types._common import *
 
+strptime = datetime.datetime.strptime
+
 class KeywordsFactory(object):
 
     __slots__ = ('_name',)
@@ -27,7 +31,7 @@ def __init__(self, thesaurus_name='keywords-gemet-inspire-themes'):
     def __call__(self):
         keywords = {}
         keywords[self._name] = ThesaurusTerms(
-            terms=[], thesaurus=Thesaurus.make(self._name))
+            terms=[], thesaurus=Thesaurus.lookup(self._name))
         return keywords
 
 class TemporalExtentFactory(object):
@@ -63,6 +67,7 @@ class InspireMetadata(BaseMetadata):
     topic_category = list
 
     keywords = KeywordsFactory()
+    free_keywords = list
 
     bounding_box = list
 
@@ -143,17 +148,13 @@ def to_xml(self, o=None, nsmap=None):
         return e
 
     def from_xml(self, e):
-        '''Build and return an InspireMetadata object serialized as an etree
-        Element e.
+        '''Build and return an InspireMetadata object from a (serialized) etree Element e.
         '''
 
-        def to_date(string):
-            if isinstance(string, str):
-                return datetime.datetime.strptime(string,'%Y-%m-%d').date()
-            else:
-                return None
+        def to_date(s):
+            return strptime(s, '%Y-%m-%d').date() if isinstance(s, str) else None
 
-        def to_resp_party(alist):
+        def to_responsible_party(alist):
             result = []
             for it in alist:
                 result.append(ResponsibleParty(
@@ -162,6 +163,8 @@ def to_resp_party(alist):
                     role = it.role))
             return result
 
+        # Parse the etree element e into an MD_Metadata instance
+
         md = MD_Metadata(e)
 
         datestamp = to_date(md.datestamp)
@@ -176,25 +179,46 @@ def to_resp_party(alist):
         for topic in md.identification.topiccategory:
             topic_list.append(topic)
 
-        keywords_dict = {}
+        free_keywords = []
+        keywords = {}
         for it in md.identification.keywords:
             thes_title = it['thesaurus']['title']
-            if thes_title is not None:
-                thes_split = thes_title.split(',')
-                # TODO thes_split[1] (=version) can be used in a get_by_title_and_version() 
-                # to enforce a specific thesaurus version.
-                thes_title = thes_split[0]
+            # Lookup and instantiate a named thesaurus
+            thes = None
+            if thes_title:
                 try:
-                    thes_name = vocabularies.munge('Keywords-' + thes_title)
-                    term_list = []
-                    for t in it['keywords']:
-                        term_list.append(t)
-                    thes = Thesaurus.make(thes_name)
-                    if thes:
-                        kw = ThesaurusTerms(thesaurus=thes, terms=term_list)
-                        keywords_dict.update({thes_name:kw})
+                    thes_title, thes_version = thes_title.split(',')
                 except:
-                    pass
+                    thes_version = None
+                else:
+                    thes_version = re.sub(r'^[ ]*version[ ]+(\d\.\d)$', r'\1', thes_version)
+                # Note: thes_version is parsed but not used below; it could be used to enforce a specific thesaurus version
+                try:
+                    thes = Thesaurus.lookup(title=thes_title, for_keywords=True)
+                except ValueError:
+                    thes = None
+            # Handle the keywords differently depending on whether a known thesaurus was found for them
+            if thes:
+                # Treat as thesaurus terms; discard unknown terms
+                terms = []
+                for keyword in it['keywords']:
+                    term = thes.vocabulary.by_value.get(keyword)
+                    if not term:
+                        term = thes.vocabulary.by_token.get(keyword)
+                    if term:
+                        terms.append(term.value)
+                keywords[thes.name] = ThesaurusTerms(thesaurus=thes, terms=terms)
+            else:
+                # Treat as free keywords (not really a thesaurus)
+                vocab_date = to_date(it['thesaurus']['date'])
+                vocab_datetype = it['thesaurus']['datetype']
+                for keyword in it['keywords']:
+                    free_keywords.append(FreeKeyword(
+                        value = keyword,
+                        reference_date = vocab_date,
+                        date_type = vocab_datetype,
+                        originating_vocabulary = thes_title))
+
         temporal_extent = []
         if md.identification.temporalextent_start or md.identification.temporalextent_end:
             temporal_extent = [TemporalExtent(
@@ -222,13 +246,6 @@ def to_resp_party(alist):
             elif it.type == 'revision':
                 revision_date = to_date(it.date)
 
-        #if not creation_date:
-        #    raise Exception('creation date not present','')
-        #elif not publication_date:
-        #    raise Exception('publication date not present','')
-        #elif not revision_date:
-        #    raise Exception('revision date not present','')
-
         spatial_list = []
 
         if len(md.identification.distance) != len(md.identification.uom):
@@ -291,7 +308,7 @@ def to_resp_party(alist):
 
         obj = InspireMetadata()
 
-        obj.contact = to_resp_party(md.contact)
+        obj.contact = to_responsible_party(md.contact)
         obj.datestamp = datestamp
         obj.languagecode = md.languagecode
         obj.title = unicode(md.identification.title)
@@ -300,7 +317,8 @@ def to_resp_party(alist):
         obj.locator = url_list
         #obj.resource_language = md.identification.resourcelanguage
         obj.topic_category = topic_list
-        obj.keywords = keywords_dict
+        obj.keywords = keywords
+        obj.free_keywords = free_keywords
         obj.bounding_box = bbox
         obj.temporal_extent = temporal_extent
         obj.creation_date = creation_date
@@ -311,7 +329,7 @@ def to_resp_party(alist):
         obj.conformity = conf_list
         obj.access_constraints = limit_list
         obj.limitations = constr_list
-        obj.responsible_party = to_resp_party(md.identification.contact)
+        obj.responsible_party = to_responsible_party(md.identification.contact)
 
         return obj
 
diff --git a/ckanext/publicamundi/lib/metadata/types/thesaurus.py b/ckanext/publicamundi/lib/metadata/types/thesaurus.py
@@ -22,31 +22,40 @@ class Thesaurus(Object):
 
     @property
     def vocabulary(self):
-        spec = vocabularies.get_by_name(self.name)
-        return spec.get('vocabulary') if spec else None
+        vocab = vocabularies.get_by_name(self.name)
+        return vocab.get('vocabulary') if vocab else None
 
     # Factory for Thesaurus
 
     @classmethod
-    def make(cls, name):
-        '''Create a new Thesaurus instance from it's machine-name name.
-        The metadata for this thesaurus are queried from vocabularies module.
+    def lookup(cls, name=None, title=None, for_keywords=False):
+        '''Look up a vocabulary by name or title and return a Thesaurus instance.
 
-        Note: Maybe rename this class-method to lookup
+        This is a factory method that tries to instantiate a Thesaurus object
+        from a collection of well-known (mostly related to INSPIRE) vocabularies.
         '''
-        spec = vocabularies.get_by_name(name)
-        if spec:
+
+        vocab = None
+
+        if (name is None) and title:
+            name = vocabularies.normalize_thesaurus_title(title, for_keywords)
+
+        if name:
+            vocab = vocabularies.get_by_name(name)
+        else:
+            raise ValueError('Expected a name/title lookup')
+
+        if vocab:
             kwargs = {
-               'title': spec.get('title'),
-               'name': spec.get('name'),
-               'reference_date': spec.get('reference_date'),
-               'version' : spec.get('version'),
-               'date_type': spec.get('date_type'),
+               'title': vocab.get('title'),
+               'name': vocab.get('name'),
+               'reference_date': vocab.get('reference_date'),
+               'version' : vocab.get('version'),
+               'date_type': vocab.get('date_type'),
             }
             return cls(**kwargs)
         else:
-            raise ValueError(
-                'Cannot find an INSPIRE thesaurus named "%s"' %(name))
+            raise ValueError('Cannot find a thesaurus named "%s"' %(name))
 
 @object_null_adapter()
 class ThesaurusTerms(Object):

diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py b/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py
@@ -8,15 +8,13 @@
 
 # Import loader
 
-from ckanext.publicamundi.lib.metadata.vocabularies import json_loader
-
-munge = json_loader.munge
+from ckanext.publicamundi.lib.metadata.vocabularies.json_loader import (
+    make_vocabularies, normalize_keyword, normalize_thesaurus_title)
 
 def _update(data_file, name_prefix='', overwrite=False):
     '''Update the module-global vocabularies from external JSON data.
     '''
-
-    for name, desc in json_loader.make_vocabularies(data_file):
+    for name, desc in make_vocabularies(data_file):
         assert overwrite or not (name in vocabularies), (
             'A vocabulary named %r is allready loaded' % (name))
         vocabularies[name_prefix + name] = desc

diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/babel_extractors.py b/ckanext/publicamundi/lib/metadata/vocabularies/babel_extractors.py
@@ -8,7 +8,7 @@
 # Babel string extraction functions
 
 def extract_json(fileobj, keywords, comment_tags, options):
-    """Extract messages from XXX files.
+    """Extract messages from files.
     :param fileobj: the file-like object the messages should be extracted from
     :param keywords: a list of keywords (i.e. function names) that should be recognized as translation functions
     :param comment_tags: a list of translator tags to search for and include in the results