binary-array-ld · jyucsiro · Sep 20, 2017 · Sep 20, 2017 · Sep 20, 2017 · Sep 27, 2017
diff --git a/lib/bald/__init__.py b/lib/bald/__init__.py
@@ -675,8 +675,10 @@ def load(afilepath):
         loader = netCDF4.Dataset
     else:
         raise ValueError('filepath suffix not supported: {}'.format(afilepath))
-    if not os.path.exists(afilepath):
-        raise IOError('{} not found'.format(afilepath))
+    # The local-file existence check is disabled for now so that URLs can be passed as input.
+    # TODO: Add a check that handles both local files and files on the web (i.e. URLs).
+    #if not os.path.exists(afilepath):
+    #    raise IOError('{} not found'.format(afilepath))
     try:
         f = loader(afilepath, "r")
         yield f

diff --git a/nc2rdf/README.md b/nc2rdf/README.md
@@ -31,4 +31,32 @@ $ python nc2rdf.py -o ttl myfile.nc
 $ python nc2rdf.py -o xml myfile.nc
 ```
 
+## nc2schemaorg
+
+This feature lets users create schema.org descriptions from
+ACDD/CF/NUG-conformant values in a netCDF (.nc) file.
+
+```
+$ python nc2rdf.py -o json-ld --schema-org [cdl or nc file]
+```
+
+Example:
+```
+$ python nc2rdf.py -o json-ld --schema-org ../lib/bald/tests/integration/CDL/trajectoryProfile_template.cdl
+```
+
+
 Note: This command-line tool is experimental and is subject to changes, however serves as a prototype for accessing bald functions for netCDF related files to RDF.
+
+
+# thredds2rdf
+
+This tool takes a THREDDS endpoint or a THREDDS catalog.xml URL as input and returns a set of RDF graphs for every netCDF (.nc) file found.
+
+Example:
+```
+$ python thredds2rdf.py http://example.org/thredds
+$ python thredds2rdf.py http://example.org/thredds/catalog.xml
+```
+
+Output is written to the `rdf` directory.
diff --git a/nc2rdf/bald2schemaorg_mappings.json b/nc2rdf/bald2schemaorg_mappings.json
@@ -0,0 +1,8 @@
+[
+   { "bald" : "summary", "schemaorg": "description" }, 
+   { "bald" : "title", "schemaorg": "name" }, 
+   { "bald" : "id", "schemaorg": "identifier" }, 
+   { "bald" : "keywords", "schemaorg": "keywords" }, 
+   { "bald" : "license", "schemaorg": "license" }, 
+   { "bald" : "standard_name", "schemaorg": "variableMeasured" }
+]
diff --git a/nc2rdf/nc2rdf.py b/nc2rdf/nc2rdf.py
@@ -5,13 +5,123 @@
 import netCDF4
 import numpy as np
 import bald
+import rdflib
+import json
+from rdflib import Namespace, BNode, URIRef, Literal
+from rdflib.namespace import RDF
+try:
+    # python 3
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
 
-def nc2rdf(ncfilename, outformat, baseuri=None):  
-    #print("nc2rdf test")
-    #print(ncfile)
+def isUrl(url):
+    try:
+        result = urlparse(url)
+        if all([result.scheme, result.netloc, result.path]) and (result.scheme == 'https' or result.scheme == 'http'):
+           return True
+    except:
+        return False
+
+def getBasename(urlstr):
+   return os.path.basename(urlstr)
+
+def baldgraph2schemaorg(graph, path=None, baseuri=None):
+    """
+       Input: an rdflib.Graph in bald style (e.g. generated from a netCDF file)
+       Maps the bald:Container attributes listed in bald2schemaorg_mappings.json
+       Returns a new rdflib.Graph in schema.org profile
+    """
+    # HACK: The mappings below ignore namespace prefixes, including prefixes declared in the nc file
+    # TODO: Handle references to prefixes/aliases properly
+
+    #load mappings
+    mapping_idx = {}
+    mapping_data = []
+    with open('bald2schemaorg_mappings.json' , 'r') as f:
+       mapping_data = json.load(f)
+
+    for item in mapping_data:
+       mapping_idx[item['bald']] = item['schemaorg']
+
+    qres = graph.query(
+    """PREFIX bald: <http://binary-array-ld.net/latest/> 
+       SELECT DISTINCT ?pred ?value
+       WHERE {
+          ?c a bald:Container .
+          ?c ?pred ?value
+       }""")
+
+    schema_g = rdflib.Graph()
+
+    if baseuri is not None:
+       container = URIRef(baseuri)
+    else:
+       container = BNode()
+
+    so = Namespace("http://schema.org/")
+    schema_g.add( (container, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), so.Dataset) )
+
+    if path is not None and isUrl(path):
+       predUri = URIRef("http://schema.org/url")
+       schema_g.add( (container, predUri, URIRef(path)) )
+
+    for row in qres:
+       currField = getBasename(str(row[0])).strip()
+       #print(getBasename(str(row[0])) + ' (type: ' + str(type(row[0])) + ")" + " :: " + row[1] + ' (type: ' + str(type(row[1])) + ")")
+       if(currField in mapping_idx.keys()):
+          predUri = URIRef("http://schema.org/" + mapping_idx[currField])
+          if currField == 'keywords':
+             for x in row[1].split(','):
+                kw = x.strip()
+                if len(kw) == 0:
+                   continue
+                lit = Literal(kw)
+                schema_g.add( (container, predUri, lit) )
+             continue
+
+          #print('schemaorg:' + mapping_idx[currField], "\t", row[1])
+          lit = Literal(row[1])
+          schema_g.add( (container, predUri, lit) )
+    return schema_g
+
+def nc2schemaorg(ncfilename, outformat, outputfile=None, baseuri=None):
     root_container = bald.load_netcdf(ncfilename, baseuri=baseuri)
-    ttl = root_container.rdfgraph().serialize(format=outformat).decode("utf-8")
-    print(ttl)
+    graph = root_container.rdfgraph()
+    schema_g = baldgraph2schemaorg(graph, path=ncfilename, baseuri=baseuri)
+    destination = None
+    if outputfile is not None:
+       destination = outputfile
+    if(outformat == 'json-ld') and destination is not None:
+       context = "http://schema.org/"
+       #s = schema_g.serialize(destination=destination, format=outformat, context=context, indent=4).decode("utf-8")
+       s = schema_g.serialize(destination=destination, format=outformat, context=context, indent=4)
+    elif outformat == 'json-ld' and destination is None:
+       context = "http://schema.org/"
+       s = schema_g.serialize(destination=destination, format=outformat, context=context, indent=4).decode("utf-8")
+       print(s)
+    else:
+       #s = schema_g.serialize(destination=destination, format=outformat).decode("utf-8")
+       s = schema_g.serialize(destination=destination, format=outformat)
+
+def nc2rdf(ncfilename, outformat, outputfile=None, baseuri=None):  
+    root_container = bald.load_netcdf(ncfilename, baseuri=baseuri)
+    if(outputfile is None):
+       #ttl = root_container.rdfgraph().serialize(format=outformat).decode("utf-8")
+       ttl = root_container.rdfgraph().serialize(format=outformat)
+       print(ttl)
+    else:
+       #ttl = root_container.rdfgraph().serialize(destination=outputfile, format=outformat).decode("utf-8")
+       ttl = root_container.rdfgraph().serialize(destination=outputfile, format=outformat)
+
+
+def cdl2schemaorg(cdl_file, outformat, baseuri=None): 
+    tfile, tfilename = tempfile.mkstemp('.nc')
+    subprocess.check_call(['ncgen', '-o', tfilename, cdl_file])
+    schema_g = nc2schemaorg(tfilename, outformat, baseuri=baseuri)
+    os.close(tfile)
+    os.remove(tfilename)
+    return schema_g
 
 def cdl2rdf(cdl_file, outformat, baseuri=None): 
     #print("cdl2rdf test")
@@ -32,13 +142,20 @@ def cdl2rdf(cdl_file, outformat, baseuri=None):
     parser.add_argument('--baseuri', action="store", dest="baseuri", help="Base URI for the graph")
     parser.add_argument('--cdl', action="store_true", dest="isCDL", default=False, help="Flag to indicate file is CDL")
     parser.add_argument('--nc', action="store_true", dest="isNC", default=False, help="Flag to indicate file is netCDF")
+    parser.add_argument('--schema-org', action="store_true", dest="isSchemaOrgOutput", default=False, help="Flag to indicate if schema.org output activated")
     parser.add_argument("ncfile", help="Path for the netCDF file")
 
     args = parser.parse_args()
 
     if(args.isCDL or args.ncfile.endswith(".cdl") or args.ncfile.endswith('.CDL')):
-        cdl2rdf(args.ncfile, args.format, baseuri=args.baseuri)
+        if(args.isSchemaOrgOutput):
+           cdl2schemaorg(args.ncfile, args.format, baseuri=args.baseuri)
+        else:
+           cdl2rdf(args.ncfile, args.format, baseuri=args.baseuri)
     elif(args.isNC or args.ncfile.endswith(".nc") or args.ncfile.endswith('.NC')):
-        nc2rdf(args.ncfile, args.format, baseuri=args.baseuri)
+        if(args.isSchemaOrgOutput):
+           nc2schemaorg(args.ncfile, args.format, baseuri=args.baseuri)
+        else:
+           nc2rdf(args.ncfile, args.format, baseuri=args.baseuri)
     else:
         print("Unrecognised file suffix. Please indicate if CDL or NC via --cdl or --nc");
diff --git a/nc2rdf/requirements.txt b/nc2rdf/requirements.txt
@@ -0,0 +1,3 @@
+lxml
+pydap
+urllib3
-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    lxml
+    pydap
+    urllib3