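"""Build a STAC catalog of the Paituli raster datasets.

Reads dataset and file-index metadata from the Paituli PostGIS database and
builds a "Paituli" STAC catalog: one collection per dataset and one item per
raster file hosted under https://www.nic.funet.fi/index/geodata/.
"""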
import argparse
import datetime
import getpass
import json
import os
import re

import psycopg2
import pystac
import rasterio
import requests
from bs4 import BeautifulSoup
from rio_stac.stac import create_stac_item
from shapely.geometry import GeometryCollection, shape

from utils.paituli import recursive_filecheck, generate_item_id, generate_timestamps
online_data_prefix = "https://www.nic.funet.fi/index/geodata/"
puhti_data_prefix = "/appl/data/geo/"

media_types = {
    "TIFF": {
        "mime": "image/tiff; application=geotiff",
        "ext": "tif"
    },
    "PNG": {
        "mime": "image/png",
        "ext": "png"
    },
    "JPEG2000": {
        "mime": "image/jp2",
        "ext": "jp2"
    },
    "NetCDF": {
        "mime": "application/x-netcdf",
        "ext": "nc"
    }
}
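# An item's media type is picked from the first entry of the dataset's
# "format_eng" database field (see the main loops below).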

def create_collection(catalog, data_dict) -> pystac.Collection:
    if "-" in data_dict["year"]:
        years = data_dict["year"].split("-")
        if years[1] == ">":  # Regularly updated dataset: use the first year now; the extent is updated from items later
            temporal_extent = pystac.TemporalExtent([(
                datetime.datetime.strptime(f"{years[0]}-01-01", "%Y-%m-%d"),
                datetime.datetime.strptime(f"{years[0]}-12-31", "%Y-%m-%d")
            )])
        else:
            # The end year may carry a parenthesised note; strip it
            if len(years[1]) > 4:
                temporal_extent = pystac.TemporalExtent([(
                    datetime.datetime.strptime(f"{years[0]}-01-01", "%Y-%m-%d"),
                    datetime.datetime.strptime(f"{years[1].split('(')[0].rstrip()}-12-31", "%Y-%m-%d")
                )])
            else:
                temporal_extent = pystac.TemporalExtent([(
                    datetime.datetime.strptime(f"{years[0]}-01-01", "%Y-%m-%d"),
                    datetime.datetime.strptime(f"{years[1]}-12-31", "%Y-%m-%d")
                )])
    else:
        temporal_extent = pystac.TemporalExtent([(
            datetime.datetime.strptime(f"{data_dict['year']}-01-01", "%Y-%m-%d"),
            datetime.datetime.strptime(f"{data_dict['year']}-12-31", "%Y-%m-%d")
        )])

    collection_scale = data_dict["scale"].rstrip()
    # Include "newest" in the title and description if the collection has the newest keyword
    if "newest" in data_dict["stac_id"]:
        collection_description = f"{data_dict['name_eng']} newest. Provided by {data_dict['org_eng']}. Scale: {collection_scale}. Coordinate systems: {data_dict['coord_sys']}."
        collection_title = f"{data_dict['name_eng']} newest, {collection_scale} (Paituli)"
    else:
        if data_dict["stac_id"] == "nls_topographic_map_100k_at_paituli" and data_dict["name_eng"] == "Sverige-Finland map":
            # Workaround: the newest dataset carries a different name in the database
            collection_description = f"Basic or topographic map. Provided by {data_dict['org_eng']}. Scale: {collection_scale}. Coordinate systems: {data_dict['coord_sys']}."
            collection_title = f"Basic or topographic map, {collection_scale} (Paituli)"
        else:
            collection_description = f"{data_dict['name_eng']}. Provided by {data_dict['org_eng']}. Scale: {collection_scale}. Coordinate systems: {data_dict['coord_sys']}."
            collection_title = f"{data_dict['name_eng']}, {collection_scale} (Paituli)"

    collection = pystac.Collection(
        id=data_dict["stac_id"],
        license="CC-BY-4.0",
        title=collection_title,
        description=collection_description,
        extent=pystac.Extent(
            spatial=pystac.SpatialExtent([[0, 0, 0, 0]]),  # Placeholder extent, updated from items later
            temporal=temporal_extent
        ),
        providers=[
            pystac.Provider(
                name=data_dict["org_eng"],
                roles=["licensor", "producer"]
            ),
            pystac.Provider(
                name="CSC Finland",
                url="https://www.csc.fi/",
                roles=["host"]
            ),
        ],
        assets={
            "meta": pystac.Asset(
                href="https://urn.fi/" + data_dict["metadata"],
                title="Metadata",
                roles=["metadata"]
            )
        },
        extra_fields={
            "scale": [data_dict["scale"]],
            "coord_sys": [data_dict["coord_sys"]]
        }
    )
    # All other datasets share the same license; Landsat has a different one
    if data_dict["name_eng"] == "Landsat":
        collection.license = "PDDL-1.0"
    collection.add_link(
        pystac.Link(
            rel="license",
            target=data_dict["license_url"],
            title="License"
        ))
    print(f"Collection made: {collection.id}")
    catalog.add_child(collection)
    return collection

def create_item(path, collection, data_dict, item_media_type, label) -> pystac.Item:
    """
    A bunch of different scenarios for timestamps:
    - If the dataset has only the latest data, get the Last-Modified header from the file
    - If the database year field spans multiple years, set the timespan accordingly
    - If the dataset contains monthly data, get the year and month from the filename
    - Else, try to get the year from the full path of the file
    - Lastly, if the path does not contain the year, take the last year from the database year field
    """
    item_timestamps = generate_timestamps(path, data_dict, label)
    item_id = generate_item_id(path, data_dict, item_timestamps["item_date"], label)
    collection_item_ids = {item.id for item in collection.get_items()}

    # Some files have case-sensitive file extensions:
    # if the default lowercase extension returns 404, switch it to uppercase
    r = requests.head(path)
    if r.status_code == 404:
        address = os.path.dirname(path)
        filename = os.path.basename(path)
        current_extension = os.path.splitext(filename)[1]
        new_filename = os.path.splitext(filename)[0] + current_extension.upper()
        path = os.path.join(address, new_filename)
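        # e.g. ".../mapsheet.tif" becomes ".../mapsheet.TIF" (illustrative filename)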
    asset_id = f"{data_dict['stac_id']}_{item_media_type.lower()}"
    with rasterio.open(path) as src:
        asset = pystac.Asset(
            href=path,
            media_type=media_types[item_media_type]["mime"],
            title=asset_id,
            roles=["data"],
            extra_fields={
                "gsd": float(src.res[0]),
                "proj:shape": src.shape,
                "proj:transform": [
                    src.transform.a,
                    src.transform.b,
                    src.transform.c,
                    src.transform.d,
                    src.transform.e,
                    src.transform.f,
                    src.transform.g,
                    src.transform.h,
                    src.transform.i
                ]
            }
        )
        # Work out an EPSG code for files whose metadata lacks a usable CRS
        item_epsg = None  # Fallback when no EPSG can be determined; proj:epsg then stays unset below
        if item_media_type != "NetCDF":
            if src.crs:
                item_epsg = src.crs.to_epsg(confidence_threshold=50)
            else:
                if data_dict["coord_sys"] == "ETRS-TM35FIN" or data_dict["coord_sys"] == "WGS84/ETRS-TM35FIN":
                    item_epsg = 3067
                else:
                    # EPSG codes for the Finnish KKJ zones
                    kkj_codes = {
                        "kkj": 4123,
                        "kkj0": 3386,
                        "kkj1": 2391,
                        "kkj2": 2392,
                        "kkj3": 2393,
                        "kkj4": 2394,
                        "kkj5": 3387
                    }
                    if "compress95" in path:
                        # Take the KKJ zone name from the path (seventh component from the end)
                        path_kkj = path.split("/")[-7]
                        if path_kkj in kkj_codes:
                            item_epsg = kkj_codes[path_kkj]
                    elif "thematic_rasters" in data_dict["stac_id"]:
                        item_epsg = kkj_codes["kkj3"]
        else:  # NetCDF datasets are in EPSG:3067
            item_epsg = 3067
    if item_id not in collection_item_ids:
        item = create_stac_item(
            path,
            id=item_id,
            assets={
                asset_id: asset
            },
            asset_media_type=media_types[item_media_type]["mime"],
            with_proj=True
        )
        item.extra_fields["gsd"] = item.assets[asset_id].extra_fields["gsd"]
        item.common_metadata.start_datetime = item_timestamps["item_start_time"]
        item.common_metadata.end_datetime = item_timestamps["item_end_time"]
        if item.properties["proj:epsg"] is None:
            item.properties["proj:epsg"] = item_epsg
        # Some files report EPSG:9391; treat them as ETRS-TM35FIN (EPSG:3067)
        if item.properties["proj:epsg"] in (9391, "EPSG:9391"):
            item.properties["proj:epsg"] = 3067
        collection.add_item(item)
        print(f"* Item made: {item.id}")
    else:
        item = collection.get_item(item_id)
        item.add_asset(
            key=asset_id,
            asset=asset
        )
        print(f"** Asset made for {item.id}")
    return item

if __name__ == "__main__":
    dir_path = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=str, help="Port of the Paituli database", required=True)
    parser.add_argument("--pwd", type=str, help="Password for the Paituli database")
    parser.add_argument("--collections", nargs="+", help="Specific collections to be made")
    parser.add_argument("--db_host", type=str, help="Hostname of the Paituli DB", required=True)
    args = parser.parse_args()
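    # Example invocation (hostname is illustrative):
    #   python paituli_to_stac.py --db_host db.example.fi --port 5432 \
    #       --collections nls_topographic_map_100k_at_paituli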
    paituli_port = args.port
    selected_collections = None
    if args.pwd:
        paituli_pwd = args.pwd
    else:
        paituli_pwd = getpass.getpass("Paituli password: ")
    if args.collections:
        selected_collections = args.collections

    # Continue an existing catalog if one is found, otherwise start a new one
    try:
        catalog = pystac.Catalog.from_file(f"{dir_path}/Paituli/catalog.json")
    except Exception:
        catalog = pystac.Catalog("Paituli", "Paituli Catalog", catalog_type=pystac.CatalogType.RELATIVE_PUBLISHED)

    # Keyword arguments avoid quoting issues with special characters in the password
    conn = psycopg2.connect(
        host=args.db_host,
        port=paituli_port,
        user="paituli-ro",
        password=paituli_pwd,
        dbname="paituli"
    )
    with conn.cursor() as curs:
        query = (
            "select data_id, stac_id, org_eng, name_eng, scale, year, format_eng, "
            "coord_sys, license_url, meta from dataset where access=1 and "
            "(format_eng like 'JP%' or format_eng like 'PNG%' or format_eng like 'Net%' "
            "or format_eng like 'TIFF%') order by 1, 2, 3, 5, 6;"
        )
        curs.execute(query)
        datasets = {}
        for result in curs:
            new_dict = dict(zip(["data_id", "stac_id", "org_eng", "name_eng", "scale", "year", "format_eng", "coord_sys", "license_url", "metadata"], result))
            if new_dict["stac_id"]:
                datasets[new_dict["data_id"]] = {key: value for key, value in new_dict.items() if key != "data_id"}

    # Split the datasets into NetCDF and the rest
    netcdf_datasets = {x: datasets[x] for x in datasets if datasets[x]["format_eng"] == "NetCDF"}
    regular_datasets = {y: datasets[y] for y in datasets if y not in netcdf_datasets}

    for dataset in regular_datasets:
        data_dict = datasets[dataset]
        stac_id = data_dict["stac_id"]
        if selected_collections:
            # Run only the selected datasets
            if stac_id not in selected_collections:
                continue
        catalog_collection_ids = {collection.id for collection in catalog.get_collections()}
        if stac_id in catalog_collection_ids:
            collection = catalog.get_child(stac_id)
            # If there are multiple scales or coordinate systems, append them to the description
            coord_pattern = r"Coordinate systems:\s*(.*?)\."
            scale_pattern = r"Scale:\s*(.*?)\."
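            # The regexes capture the current value list after "Scale:" and
            # "Coordinate systems:" so new values can be appended, e.g.
            # "Scale: 1:20 000." becomes "Scale: 1:20 000, 1:50 000." (values illustrative)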
if data_dict["scale"] not in collection.extra_fields["scale"]:
match = re.search(scale_pattern, collection.description)
collection.description = collection.description.replace(match.group(1), f"{match.group(1)}, {data_dict['scale']}")
collection.extra_fields["scale"].append(data_dict["scale"])
# If multiple scales, remove scale from Title
collection.title = f"{data_dict['name_eng']} (Paituli)"
if data_dict["coord_sys"] not in collection.extra_fields["coord_sys"]:
match = re.search(coord_pattern, collection.description)
collection.description = collection.description.replace(match.group(1), f"{match.group(1)}, {data_dict['coord_sys']}")
collection.extra_fields["coord_sys"].append(data_dict["coord_sys"])
else:
collection = create_collection(catalog, data_dict)
        with conn.cursor() as curs:
            data = (dataset,)
            query = "select gid, data_id, label, path, geom, ST_AsGeoJSON(geom) from index_wgs84 where index_wgs84.data_id=(%s)"
            curs.execute(query, data)
            items = {}
            for i, result in enumerate(curs):
                items[i] = dict(zip(["gid", "data_id", "label", "path", "geom", "geojson"], result))

        item_media_type = data_dict["format_eng"].split(",")[0]
        for i in items:
            stac_item = None  # Reset so a skipped file cannot inherit the previous item's geometry fix
            # The path may point to a single file, use a "*" wildcard, or be a
            # directory; in the last case the file links are scraped with BeautifulSoup
            if items[i]["path"].endswith(media_types[item_media_type]["ext"]):
                data_path = online_data_prefix + items[i]["path"]
                stac_item = create_item(data_path, collection, data_dict, item_media_type, items[i]["label"])
            elif items[i]["path"].endswith(".*"):
                data_path = online_data_prefix + items[i]["path"].replace("*", media_types[item_media_type]["ext"])
                stac_item = create_item(data_path, collection, data_dict, item_media_type, items[i]["label"])
            elif items[i]["path"].endswith("*"):
                data_path = online_data_prefix + items[i]["path"].replace("*", f".{media_types[item_media_type]['ext']}")
                stac_item = create_item(data_path, collection, data_dict, item_media_type, items[i]["label"])
            else:
                # Check the folder contents with BeautifulSoup
                page_url = online_data_prefix + items[i]["path"]
                page = requests.get(page_url)
                soup = BeautifulSoup(page.text, features="html.parser")
                if not items[i]["path"].endswith("/"):
                    item_path = items[i]["path"] + "/"
                else:
                    item_path = items[i]["path"]
                links = soup.find_all("a")
                recursive_links = recursive_filecheck(page_url, links)
                item_links = [link.get("href") for link in recursive_links if link.get("href").endswith(media_types[item_media_type]["ext"])]
                for link in item_links:
                    data_path = online_data_prefix + item_path + link
                    stac_item = create_item(data_path, collection, data_dict, item_media_type, items[i]["label"])
                    # If rio-stac does not get the geometry from the file, insert it from the database geom transformed to GeoJSON
                    if stac_item.bbox == [-180.0, -90.0, 180.0, 90.0]:
                        geojson = json.loads(items[i]["geojson"])
                        stac_item.geometry = geojson
                        stac_item.bbox = pystac.utils.geometry_to_bbox(geojson)
            # Same database fallback for items created from the single-file and wildcard branches
            if stac_item is not None and stac_item.bbox == [-180.0, -90.0, 180.0, 90.0]:
                geojson = json.loads(items[i]["geojson"])
                stac_item.geometry = geojson
                stac_item.bbox = pystac.utils.geometry_to_bbox(geojson)

        # Update the collection extent from its items
        bounds = [GeometryCollection([shape(s.geometry) for s in collection.get_all_items()]).bounds]
        start_times = [st.common_metadata.start_datetime for st in collection.get_all_items()]
        end_times = [et.common_metadata.end_datetime for et in collection.get_all_items()]
        temporal = [[min(start_times), max(end_times)]]
        collection.extent.spatial = pystac.SpatialExtent(bounds)
        collection.extent.temporal = pystac.TemporalExtent(temporal)

    for dataset in netcdf_datasets:
        data_dict = datasets[dataset]
        stac_id = data_dict["stac_id"]
        if selected_collections:
            # Run only the selected datasets
            if stac_id not in selected_collections:
                continue
        catalog_collection_ids = {collection.id for collection in catalog.get_collections()}
        if stac_id in catalog_collection_ids:
            collection = catalog.get_child(stac_id)
            # If there are multiple scales or coordinate systems, append them to the description
            coord_pattern = r"Coordinate systems:\s*(.*?)\."
            scale_pattern = r"Scale:\s*(.*?)\."
            if data_dict["scale"] not in collection.extra_fields["scale"]:
                match = re.search(scale_pattern, collection.description)
                collection.description = collection.description.replace(match.group(1), f"{match.group(1)}, {data_dict['scale']}")
                collection.extra_fields["scale"].append(data_dict["scale"])
                # With multiple scales, drop the scale from the title
                collection.title = f"{data_dict['name_eng']} (Paituli)"
            if data_dict["coord_sys"] not in collection.extra_fields["coord_sys"]:
                match = re.search(coord_pattern, collection.description)
                collection.description = collection.description.replace(match.group(1), f"{match.group(1)}, {data_dict['coord_sys']}")
                collection.extra_fields["coord_sys"].append(data_dict["coord_sys"])
        else:
            collection = create_collection(catalog, data_dict)

        with conn.cursor() as curs:
            data = (dataset,)
            query = "select data_id, label, path, geom, ST_AsGeoJSON(geom) from index_wgs84 where index_wgs84.data_id=(%s)"
            curs.execute(query, data)
            items = {}
            for i, result in enumerate(curs):
                items[i] = dict(zip(["data_id", "label", "path", "geom", "geojson"], result))

        item_media_type = data_dict["format_eng"].split(",")[0]
        for i in items:
            stac_item = None  # Reset so a skipped file cannot inherit the previous item's geometry fix
            # The path may point to a single file, use a "*" wildcard, or be a
            # directory; in the last case the file links are scraped with BeautifulSoup
            if items[i]["path"].endswith(media_types[item_media_type]["ext"]):
                data_path = online_data_prefix + items[i]["path"]
                stac_item = create_item(data_path, collection, data_dict, item_media_type, items[i]["label"])
            elif items[i]["path"].endswith(".*"):
                data_path = online_data_prefix + items[i]["path"].replace("*", media_types[item_media_type]["ext"])
                stac_item = create_item(data_path, collection, data_dict, item_media_type, items[i]["label"])
            elif items[i]["path"].endswith("*"):
                data_path = online_data_prefix + items[i]["path"].replace("*", f".{media_types[item_media_type]['ext']}")
                stac_item = create_item(data_path, collection, data_dict, item_media_type, items[i]["label"])
            else:
                # Check the folder contents with BeautifulSoup
                page_url = online_data_prefix + items[i]["path"]
                page = requests.get(page_url)
                soup = BeautifulSoup(page.text, features="html.parser")
                if not items[i]["path"].endswith("/"):
                    item_path = items[i]["path"] + "/"
                else:
                    item_path = items[i]["path"]
                links = soup.find_all("a")
                recursive_links = recursive_filecheck(page_url, links)
                item_links = [link.get("href") for link in recursive_links if link.get("href").endswith(media_types[item_media_type]["ext"])]
                for link in item_links:
                    data_path = online_data_prefix + item_path + link
                    stac_item = create_item(data_path, collection, data_dict, item_media_type, items[i]["label"])
                    # If rio-stac does not get the geometry from the file, insert it from the database geom transformed to GeoJSON
                    if stac_item.bbox == [-180.0, -90.0, 180.0, 90.0]:
                        geojson = json.loads(items[i]["geojson"])
                        stac_item.geometry = geojson
                        stac_item.bbox = pystac.utils.geometry_to_bbox(geojson)
            # Same database fallback for items created from the single-file and wildcard branches
            if stac_item is not None and stac_item.bbox == [-180.0, -90.0, 180.0, 90.0]:
                geojson = json.loads(items[i]["geojson"])
                stac_item.geometry = geojson
                stac_item.bbox = pystac.utils.geometry_to_bbox(geojson)

        # Update the collection extent from its items
        bounds = [GeometryCollection([shape(s.geometry) for s in collection.get_all_items()]).bounds]
        start_times = [st.common_metadata.start_datetime for st in collection.get_all_items()]
        end_times = [et.common_metadata.end_datetime for et in collection.get_all_items()]
        temporal = [[min(start_times), max(end_times)]]
        collection.extent.spatial = pystac.SpatialExtent(bounds)
        collection.extent.temporal = pystac.TemporalExtent(temporal)

    catalog.normalize_and_save("Paituli", skip_unresolved=True)