Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Thredds harvest #86

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions lib/bald/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,8 +675,10 @@ def load(afilepath):
loader = netCDF4.Dataset
else:
raise ValueError('filepath suffix not supported: {}'.format(afilepath))
if not os.path.exists(afilepath):
raise IOError('{} not found'.format(afilepath))
#Disable this check for now to allow URL input
#TODO: Add feature to check both local files and files on the web, e.g. URLs
#if not os.path.exists(afilepath):
# raise IOError('{} not found'.format(afilepath))
try:
f = loader(afilepath, "r")
yield f
Expand Down
28 changes: 28 additions & 0 deletions nc2rdf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,32 @@ $ python nc2rdf.py -o ttl myfile.nc
$ python nc2rdf.py -o xml myfile.nc
```

## nc2schemaorg

This feature lets users create schema.org descriptions from
ACDD/CF/NUG-conformant values in an nc file.

```
$ python nc2rdf.py -o json-ld --schema-org [cdl or nc file]
```

Example:
```
$ python nc2rdf.py -o json-ld --schema-org ../lib/bald/tests/integration/CDL/trajectoryProfile_template.cdl
```


Note: This command-line tool is experimental and subject to change; it serves as a prototype for using bald functions to convert netCDF-related files to RDF.


# thredds2rdf

This tool allows users to input a THREDDS endpoint or THREDDS catalog.xml and get a set of RDF graphs returned for every nc file found.

Example:
```
$ python thredds2rdf.py http://example.org/thredds
$ python thredds2rdf.py http://example.org/thredds/catalog.xml
```

Output will be emitted to the `rdf` directory.
8 changes: 8 additions & 0 deletions nc2rdf/bald2schemaorg_mappings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[
{ "bald" : "summary", "schemaorg": "description" },
{ "bald" : "title", "schemaorg": "name" },
{ "bald" : "id", "schemaorg": "identifier" },
{ "bald" : "keywords", "schemaorg": "keywords" },
{ "bald" : "license", "schemaorg": "license" },
{ "bald" : "standard_name", "schemaorg": "variableMeasured" }
]
131 changes: 124 additions & 7 deletions nc2rdf/nc2rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,123 @@
import netCDF4
import numpy as np
import bald
import rdflib
import json
from rdflib import Namespace, BNode, URIRef, Literal
from rdflib.namespace import RDF
try:
# python 3
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse

def nc2rdf(ncfilename, outformat, baseuri=None):
#print("nc2rdf test")
#print(ncfile)
def isUrl(url):
    """
    Return True if url is an absolute http or https URL, False otherwise.

    A value only counts as a URL when it has an http/https scheme, a
    network location (host) and a non-empty path, e.g.
    "http://example.org/thredds/file.nc". Local file paths, other schemes
    and values that cannot be parsed all yield False.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # Malformed input (e.g. a broken IPv6 host) or a non-string value
        # is simply "not a URL"; callers use this as a yes/no test.
        return False
    # Always return a real boolean instead of falling through to None.
    return (parsed.scheme in ('http', 'https')
            and bool(parsed.netloc)
            and bool(parsed.path))

def getBasename(urlstr):
    """
    Return the last path component of urlstr, i.e. everything after the
    final path separator (an empty string if urlstr ends with one).
    """
    _head, tail = os.path.split(urlstr)
    return tail

def baldgraph2schemaorg(graph, path=None, baseuri=None):
    """
    Build a schema.org Dataset description from a bald style graph.

    Input:
      graph   - rdflib.Graph in bald style; the predicates and values of
                every resource typed bald:Container are considered
      path    - optional location of the source file; it is recorded as
                schema:url only when it is an http(s) URL
      baseuri - optional URI used for the Dataset node; a blank node is
                used when it is not given
    Returns a new rdflib.Graph in schema.org profile
    """
    # HACK: The following mappings ignore prefixes as well as prefixes in nc file
    # TODO: Fix references to prefixes/aliases proper

    # Load the mappings from the file that sits next to this module, so the
    # lookup works regardless of the caller's current working directory.
    mapping_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'bald2schemaorg_mappings.json')
    with open(mapping_file, 'r') as f:
        mapping_idx = {item['bald']: item['schemaorg'] for item in json.load(f)}

    qres = graph.query(
        """PREFIX bald: <http://binary-array-ld.net/latest/>
SELECT DISTINCT ?pred ?value
WHERE {
?c a bald:Container .
?c ?pred ?value
}""")

    schema_g = rdflib.Graph()
    container = URIRef(baseuri) if baseuri is not None else BNode()

    so = Namespace("http://schema.org/")
    schema_g.add((container, RDF.type, so.Dataset))

    if path is not None and isUrl(path):
        schema_g.add((container, so['url'], URIRef(path)))

    for row in qres:
        pred, value = row[0], row[1]
        # Only the last segment of the predicate URI is matched (see HACK above).
        currField = getBasename(str(pred)).strip()
        if currField not in mapping_idx:
            continue
        predUri = so[mapping_idx[currField]]

        if currField == 'keywords':
            # A comma separated keyword list becomes one literal per keyword;
            # empty entries (e.g. from a trailing comma) are dropped.
            for x in value.split(','):
                kw = x.strip()
                if kw:
                    schema_g.add((container, predUri, Literal(kw)))
            continue

        schema_g.add((container, predUri, Literal(value)))
    return schema_g

def nc2schemaorg(ncfilename, outformat, outputfile=None, baseuri=None):
    """
    Load a netCDF file with bald and emit its schema.org description.

    Input:
      ncfilename - netCDF file to load (passed to bald.load_netcdf)
      outformat  - rdflib serialization format, e.g. 'json-ld', 'ttl', 'xml'
      outputfile - optional destination; when omitted the serialization is
                   printed to stdout
      baseuri    - optional base URI for the graph
    Returns the schema.org rdflib.Graph

    Only the schema.org graph is written; the intermediate bald graph is
    not echoed, so stdout holds a single document in the requested format.
    """
    root_container = bald.load_netcdf(ncfilename, baseuri=baseuri)
    graph = root_container.rdfgraph()
    schema_g = baldgraph2schemaorg(graph, path=ncfilename, baseuri=baseuri)

    serialize_args = {'format': outformat}
    if outformat == 'json-ld':
        # Compact the JSON-LD output against the schema.org context.
        serialize_args['context'] = "http://schema.org/"
        serialize_args['indent'] = 4

    if outputfile is not None:
        schema_g.serialize(destination=outputfile, **serialize_args)
    else:
        s = schema_g.serialize(**serialize_args)
        # serialize() returns bytes or text depending on the rdflib version.
        if isinstance(s, bytes):
            s = s.decode("utf-8")
        print(s)
    return schema_g

def nc2rdf(ncfilename, outformat, outputfile=None, baseuri=None):
    """
    Load a netCDF file with bald and emit its RDF graph.

    Input:
      ncfilename - netCDF file to load (passed to bald.load_netcdf)
      outformat  - rdflib serialization format, e.g. 'ttl', 'xml', 'json-ld'
      outputfile - optional destination; when omitted the serialization is
                   printed to stdout
      baseuri    - optional base URI for the graph
    """
    root_container = bald.load_netcdf(ncfilename, baseuri=baseuri)
    graph = root_container.rdfgraph()
    if outputfile is None:
        ttl = graph.serialize(format=outformat)
        # serialize() returns bytes or text depending on the rdflib version;
        # decode so the document itself is printed rather than a bytes repr.
        if isinstance(ttl, bytes):
            ttl = ttl.decode("utf-8")
        print(ttl)
    else:
        graph.serialize(destination=outputfile, format=outformat)


def cdl2schemaorg(cdl_file, outformat, baseuri=None):
    """
    Convert a CDL file to netCDF with ncgen and emit its schema.org
    description via nc2schemaorg.

    Input:
      cdl_file  - path of the CDL file
      outformat - rdflib serialization format, e.g. 'json-ld'
      baseuri   - optional base URI for the graph
    Returns whatever nc2schemaorg returns.
    Raises subprocess.CalledProcessError if ncgen exits with an error.
    """
    tfile, tfilename = tempfile.mkstemp('.nc')
    # ncgen is given the file name, not the descriptor, so release the
    # descriptor straight away.
    os.close(tfile)
    try:
        subprocess.check_call(['ncgen', '-o', tfilename, cdl_file])
        return nc2schemaorg(tfilename, outformat, baseuri=baseuri)
    finally:
        # Remove the temporary netCDF file even when ncgen or the
        # conversion fails.
        os.remove(tfilename)

def cdl2rdf(cdl_file, outformat, baseuri=None):
#print("cdl2rdf test")
Expand All @@ -32,13 +142,20 @@ def cdl2rdf(cdl_file, outformat, baseuri=None):
parser.add_argument('--baseuri', action="store", dest="baseuri", help="Base URI for the graph")
parser.add_argument('--cdl', action="store_true", dest="isCDL", default=False, help="Flag to indicate file is CDL")
parser.add_argument('--nc', action="store_true", dest="isNC", default=False, help="Flag to indicate file is netCDF")
parser.add_argument('--schema-org', action="store_true", dest="isSchemaOrgOutput", default=False, help="Flag to indicate if schema.org output activated")
parser.add_argument("ncfile", help="Path for the netCDF file")

args = parser.parse_args()

if(args.isCDL or args.ncfile.endswith(".cdl") or args.ncfile.endswith('.CDL')):
cdl2rdf(args.ncfile, args.format, baseuri=args.baseuri)
if(args.isSchemaOrgOutput):
cdl2schemaorg(args.ncfile, args.format, baseuri=args.baseuri)
else:
cdl2rdf(args.ncfile, args.format, baseuri=args.baseuri)
elif(args.isNC or args.ncfile.endswith(".nc") or args.ncfile.endswith('.NC')):
nc2rdf(args.ncfile, args.format, baseuri=args.baseuri)
if(args.isSchemaOrgOutput):
nc2schemaorg(args.ncfile, args.format, baseuri=args.baseuri)
else:
nc2rdf(args.ncfile, args.format, baseuri=args.baseuri)
else:
print("Unrecognised file suffix. Please indicate if CDL or NC via --cdl or --nc");
3 changes: 3 additions & 0 deletions nc2rdf/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
lxml
pydap
urllib3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this dependency doesn't appear to be used anywhere

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

threddsnc2rdf.py uses the urljoin/split/parse functions of urllib3/urllib.

Loading