Skip to content

Commit

Permalink
work in progress
Browse files Browse the repository at this point in the history
juliacollins committed Dec 24, 2024
1 parent 8f3e7dc commit a4b6c14
Showing 6 changed files with 103 additions and 13 deletions.
55 changes: 53 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@ check-jsonschema = "^0.29.4"
returns = "^0.23.0"
toolz = "^1.0.0"
funcy = "^2.0"
isoduration = "^20.11.0"
[tool.poetry.group.test.dependencies]
pytest = "^8.3.2"
moto = {extras = ["all"], version = "^5.0.14"}
30 changes: 26 additions & 4 deletions src/nsidc/metgen/config.py
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@
import logging
import os.path
from pathlib import Path
from returns.maybe import Maybe, Nothing

from nsidc.metgen import aws, constants, netcdf_reader

@@ -30,10 +31,11 @@ class Config:
checksum_type: str
number: int
dry_run: bool

def __post_init__(self):
# data_reader: Callable[[str], dict]
self.data_reader = netcdf_reader
filename_regex: Maybe[str] = Maybe.empty
time_coverage_duration: Maybe[str] = Maybe.empty
geospatial_x_resolution: Maybe[str] = Maybe.empty
geospatial_y_resolution: Maybe[str] = Maybe.empty
date_modified: Maybe[str] = Maybe.empty

def show(self):
# TODO: add section headings in the right spot
@@ -110,6 +112,11 @@ def configuration(
"checksum_type": constants.DEFAULT_CHECKSUM_TYPE,
"number": constants.DEFAULT_NUMBER,
"dry_run": constants.DEFAULT_DRY_RUN,
"filename_regex": Nothing,
"time_coverage_duration": Nothing,
"geospatial_x_resolution": Nothing,
"geospatial_y_resolution": Nothing,
"date_modified": Nothing,
}
try:
return Config(
@@ -178,6 +185,21 @@ def configuration(
_get_configuration_value(
environment, "Settings", "dry_run", bool, config_parser, overrides
),
_get_configuration_value(
environment, "Collection", "filename_regex", str, config_parser, overrides
),
_get_configuration_value(
environment, "Collection", "time_coverage_duration", str, config_parser, overrides
),
_get_configuration_value(
environment, "Collection", "geospatial_x_resolution", str, config_parser, overrides
),
_get_configuration_value(
environment, "Collection", "geospatial_y_resolution", str, config_parser, overrides
),
_get_configuration_value(
environment, "Collection", "date_modified", str, config_parser, overrides
),
)
except Exception as e:
raise Exception("Unable to read the configuration file", e)
8 changes: 5 additions & 3 deletions src/nsidc/metgen/metgen.py
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@
from returns.maybe import Maybe
from rich.prompt import Confirm, Prompt

from nsidc.metgen import aws, config, constants
from nsidc.metgen import aws, config, constants, netcdf_reader

# -------------------------------------------------------------------
CONSOLE_FORMAT = "%(message)s"
@@ -200,6 +200,7 @@ class Granule:
submission_time: Maybe[str] = Maybe.empty
uuid: Maybe[str] = Maybe.empty
cnm_message: Maybe[str] = Maybe.empty
data_reader: Callable[[str], dict] = Maybe.empty


@dataclasses.dataclass
@@ -257,8 +258,9 @@ def process(configuration: config.Config) -> None:

# Find all of the input granule files, limit the size of the list based
# on the configuration, and execute the pipeline on each of the granules.
# TODO: Nicely manage reader and glob pattern for other file types.
candidate_granules = [
Granule(p.name, data_filenames=[str(p)])
Granule(p.name, data_filenames=[str(p)], data_reader=netcdf_reader.extract_metadata)
for p in Path(configuration.data_dir).glob("*.nc")
]
granules = take(configuration.number, candidate_granules)
@@ -400,7 +402,7 @@ def create_ummg(configuration: config.Config, granule: Granule) -> Granule:
# }
metadata_details = {}
for data_file in granule.data_filenames:
metadata_details[data_file] = configuration.data_reader(data_file)
metadata_details[data_file] = granule.data_reader(data_file, configuration)

# Collapse information about (possibly) multiple files into a granule summary.
summary = metadata_summary(metadata_details)
16 changes: 13 additions & 3 deletions src/nsidc/metgen/netcdf_reader.py
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@
from nsidc.metgen import constants


def extract_metadata(netcdf_path):
def extract_metadata(netcdf_path, configuration):
# provide some sort of "review" command line function to
# assess what's missing from netcdf file?
# or add to ini file generator a step that evaluates an
@@ -24,9 +24,10 @@ def extract_metadata(netcdf_path):

return {
"size_in_bytes": os.path.getsize(netcdf_path),
# no date modified in file
"production_date_time": ensure_iso(netcdf.attrs["date_modified"]),
"production_date_time": date_modified(netcdf, configuration),
# no time range in file
# use regex to get start date from file name (assume 00:00:00 time)
# get time_coverage_duration from configuration
"temporal": time_range(netcdf),
"geometry": {"points": json.dumps(spatial_values(netcdf))},
}
@@ -53,6 +54,7 @@ def spatial_values(netcdf):
general-use module.
"""

# wkt exists in netcdf but in polar_stereographic variable (look for grid_mapping_name)
data_crs = CRS.from_wkt(netcdf.crs.crs_wkt)
crs_4326 = CRS.from_epsg(4326)
xformer = Transformer.from_crs(data_crs, crs_4326, always_xy=True)
@@ -73,6 +75,10 @@ def spatial_values(netcdf):
def pixel_padding(netcdf):
    """Return half of the dataset's pixel width.

    Adding this padding to the grid coordinate values should reproduce the
    extents recorded in ``netcdf.attrs.geospatial_bounds``.

    NOTE(review): an alternative to the GeoTransform: if the x and y
    variables carry a ``valid_range`` attribute, the difference between the
    valid range and the first x value, for example, should equal the
    padding; when no ``valid_range`` attribute exists, a pixel-size value
    could be read from the ini file instead.
    """
    # GeoTransform is a space-separated string; field 1 is the pixel width.
    geotransform_fields = netcdf.crs.GeoTransform.split()
    pixel_width = float(geotransform_fields[1])
    return abs(pixel_width) / 2


@@ -132,6 +138,10 @@ def index_subset(original_length):
else:
return list(range(original_length))

# If there is no date_modified in the netCDF global attributes, retrieve it
# from the configuration instead.
def date_modified(netcdf, configuration):
    """Return the granule's modification timestamp as an ISO 8601 string.

    Prefers the ``date_modified`` global attribute of the netCDF file and
    falls back to ``configuration.date_modified`` when the attribute is
    absent.

    NOTE(review): the configuration default for ``date_modified`` appears to
    be ``Nothing`` (a Maybe), not a string — confirm ``ensure_iso`` handles
    that fallback, or that callers guarantee a configured value.
    """
    # attrs is dict-like, so .get() expresses the presence check directly.
    datetime_str = netcdf.attrs.get("date_modified", configuration.date_modified)
    # BUG FIX: the original called ensure_iso() but dropped its result,
    # so this function always returned None; extract_metadata depends on
    # the returned value for "production_date_time".
    return ensure_iso(datetime_str)

def ensure_iso(datetime_str):
"""
6 changes: 5 additions & 1 deletion tests/test_config.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,11 @@ def expected_keys():
"checksum_type",
"number",
"dry_run",
"data_reader",
"filename_regex",
"time_coverage_duration",
"date_modified",
"geospatial_x_resolution",
"geospatial_y_resolution",
]
)

0 comments on commit a4b6c14

Please sign in to comment.