diff --git a/scripts/drafts/icdc.py b/scripts/drafts/icdc.py
new file mode 100644
index 000000000..b682d2d5a
--- /dev/null
+++ b/scripts/drafts/icdc.py
@@ -0,0 +1,26 @@
+from src.preprocess.icdc import (
+    ESACCISoilMoisturePreprocessor,
+    LAIModisAvhrrPreprocessor,
+    ModisNDVIPreprocessor
+)
+
+
+def modis_ndvi():
+    processor = ModisNDVIPreprocessor()
+    processor.preprocess()
+
+
+def cci_soil_moisture():
+    processor = ESACCISoilMoisturePreprocessor()
+    processor.preprocess()
+
+
+def modis_lai():
+    processor = LAIModisAvhrrPreprocessor()
+    processor.preprocess()
+
+
+if __name__ == '__main__':
+    modis_ndvi()
+    cci_soil_moisture()
+    modis_lai()
diff --git a/scripts/preprocess.py b/scripts/preprocess.py
index d64250a37..b82809ee7
--- a/scripts/preprocess.py
+++ b/scripts/preprocess.py
@@ -9,7 +9,6 @@
 from src.preprocess.admin_boundaries import KenyaAdminPreprocessor
 
 
-
 def process_vci_2018():
     # if the working directory is alread ml_drought don't need ../data
     if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
@@ -127,11 +126,31 @@ def preprocess_era5():
     processor.preprocess(subset_str='kenya', regrid=regrid_path)
 
 
+def preprocess_icdc():
+    from src.preprocess.icdc import (
+        ModisNDVIPreprocessor,
+        ModisLSTPreprocessor,
+    )
+    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
+        data_path = Path('data')
+    else:
+        data_path = Path('../data')
+    processor = ModisNDVIPreprocessor(data_path)
+    processor.preprocess(
+        subset_str='ethiopia_safe',
+    )
+
+    processor = ModisLSTPreprocessor(data_path)
+    processor.preprocess(
+        subset_str='ethiopia_safe'
+    )
+
 if __name__ == '__main__':
-    process_vci_2018()
-    process_precip_2018()
-    process_era5POS_2018()
-    process_gleam()
-    process_esa_cci_landcover()
-    preprocess_srtm()
-    preprocess_era5()
+    # process_vci_2018()
+    # process_precip_2018()
+    # process_era5POS_2018()
+    # process_gleam()
+    # process_esa_cci_landcover()
+    # preprocess_srtm()
+    # preprocess_era5()
+    preprocess_icdc()
diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py
new file mode 100644
index 000000000..178194ffa
--- /dev/null
+++ b/src/preprocess/icdc.py
@@ -0,0 +1,247 @@
+from pathlib import Path
+import xarray as xr
+from shutil import rmtree
+from typing import Optional, List
+
+
+from .base import BasePreProcessor
+
+
+class ICDCPreprocessor(BasePreProcessor):
+    """ For working with data on ICDC (SPECIFIC to Uni Server)
+    """
+    dataset: str  # the name of the dataset directory on icdc (set by subclasses)
+    source: str  # {'land', 'atmosphere', 'climate_indices', 'ocean', 'ice_and_snow'}
+
+    def __init__(self, data_folder: Path = Path('data')) -> None:
+        super().__init__(data_folder)
+        self.icdc_data_dir = Path(f'/pool/data/ICDC/{self.source}/')
+
+    def get_icdc_filepaths(self) -> List[Path]:
+        data_dir = self.icdc_data_dir / self.dataset / 'DATA'
+        years = [d.name for d in data_dir.iterdir() if d.is_dir()]
+
+        filepaths: List[Path] = []
+        for year in years:
+            filepaths.extend((data_dir / year).glob('*.nc'))
+
+        if filepaths:  # data organised into per-year subdirectories
+            return filepaths
+        else:
+            filepaths.extend(data_dir.glob('*.nc'))
+
+        if filepaths:  # data stored flat inside DATA/
+            return filepaths
+
+        else:
+            # HACKY: for the lst dataset
+            filepaths.extend(data_dir.glob('MONTHLY/**/*.nc'))
+            return filepaths
+
+    @staticmethod
+    def create_filename(netcdf_filename: str,
+                        subset_name: Optional[str] = None) -> str:
+        """
+        {base_str}.nc -> {base_str}_{subset_name}.nc
+        """
+        filename_stem = netcdf_filename[:-3]
+        if subset_name is not None:
+            new_filename = f'{filename_stem}_{subset_name}.nc'
+        else:
+            new_filename = f'{filename_stem}.nc'
+        return new_filename
+
+    def _preprocess_single(self, netcdf_filepath: Path,
+                           subset_str: Optional[str] = 'kenya',
+                           regrid: Optional[xr.Dataset] = None) -> None:
+        """Run the Preprocessing steps for the data stored on ICDC
+        https://icdc.cen.uni-hamburg.de/1/daten.html
+
+        Process:
+        -------
+        * chop out ROI
+        * create new dataset with regrid dimensions
+        * Save the output file to new folder
+        """
+        print(f'Starting work on {netcdf_filepath.name}')
+        # 1. read in the dataset
+        ds = xr.open_dataset(netcdf_filepath)
+
+        # 2. chop out EastAfrica
+        if subset_str is not None:
+            try:
+                ds = self.chop_roi(ds, subset_str, inverse_lat=True)
+            except AssertionError:
+                ds = self.chop_roi(ds, subset_str, inverse_lat=False)
+
+        if regrid is not None:
+            ds = self.regrid(ds, regrid)
+
+        # 6. create the filepath and save to that location
+        assert netcdf_filepath.name[-3:] == '.nc', \
+            f'filepath name should be a .nc file. Currently: {netcdf_filepath.name}'
+
+        filename = self.create_filename(
+            netcdf_filepath.name,
+            subset_name=subset_str if subset_str is not None else None
+        )
+        print(f"Saving to {self.interim}/{filename}")
+        ds.to_netcdf(self.interim / filename)
+
+        print(f"** Done for {self.dataset} {netcdf_filepath.name} **")
+
+    def preprocess(self, subset_str: Optional[str] = 'kenya',
+                   regrid: Optional[Path] = None,
+                   resample_time: Optional[str] = 'M',
+                   upsampling: bool = False,
+                   cleanup: bool = False) -> None:
+        """ Preprocess all of the ICDC .nc files to produce
+        one subset file.
+
+        Arguments
+        ----------
+        subset_str: Optional[str] = 'kenya'
+            Whether to subset Kenya when preprocessing
+        regrid: Optional[Path] = None
+            If a Path is passed, the ICDC files will be regridded to have the same
+            grid as the dataset at that Path. If None, no regridding happens
+        resample_time: str = 'M'
+            If not None, defines the time length to which the data will be resampled
+        upsampling: bool = False
+            If true, tells the class the time-sampling will be upsampling. In this case,
+            nearest instead of mean is used for the resampling
+        cleanup: bool = False
+            If true, delete interim files created by the class
+        """
+        nc_files = self.get_icdc_filepaths()
+
+        if regrid is not None:
+            regrid = self.load_reference_grid(regrid)
+
+        for file in nc_files:
+            self._preprocess_single(file, subset_str, regrid)
+
+        # merge all of the timesteps
+        self.merge_files(subset_str, resample_time, upsampling)
+
+        if cleanup:
+            rmtree(self.interim)
+
+
+class ESACCISoilMoisturePreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'esa_cci_soilmoisture'
+
+
+class LAIModisAvhrrPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'avhrr_modis_lai'
+
+
+class ModisNDVIPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis_aqua_vegetationindex'
+
+
+class AMSRESoilMoisturePreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'amsre_soilmoisture'
+
+
+class ASCATSoilMoisturePreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'ascat_soilmoisture'
+
+
+class EUMetsatAlbedoPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'eumetsat_albedo'
+
+
+class EUMetSatAlbedo2Preprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'eumetsat_clara2_surfacealbedo'
+
+
+class EUMetSatRadiationPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'eumetsat_clara2_surfaceradiation'
+
+
+class EUMetSatIrradiancePreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'eumetsat_surfacesolarirradiance'
+
+
+class SpotFAPARPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'fapar_spot_proba_v'
+
+
+class GLEAMEvaporationPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'gleam_evaporation'
+
+
+class SpotLaiPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'lai_spot_proba_v'
+
+
+class SpotLSAlbedoPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'land_surface_albedo_spot'
+
+
+class ModisAlbedoPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis_albedo'
+
+
+class ModisForestCoverPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis_forestcoverfraction'
+
+
+class ModisLandcoverPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis_landcover'
+
+
+class ModisLatLonPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis_latlon'
+
+
+class ModisLSTClimatologyPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis_lst_climatology'
+
+
+class ModisNPPPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis_primary_production'
+
+
+class ModisSRTMPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis-srtm_landwaterdistribution'
+
+
+class ModisLSTPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'modis_terra_landsurfacetemperature'
+
+
+class SMOSSoilMoisturePreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'smos_soilmoisture'
+
+
+class TopographyPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'topography'
+
+
+class SpotVegetationCoverFractionPreprocessor(ICDCPreprocessor):
+    source = 'land'
+    dataset = 'vegetationcoverfraction_spot_proba_v'
diff --git a/src/utils.py b/src/utils.py
index 56b062075..c1a48d404
--- a/src/utils.py
+++ b/src/utils.py
@@ -26,8 +26,17 @@ def get_kenya() -> Region:
 
 
 def get_ethiopia() -> Region:
-    return Region(name='ethiopia', lonmin=32.9975838, lonmax=47.9823797,
-                  latmin=3.397448, latmax=14.8940537)
+    return Region(
+        name='ethiopia', lonmin=32.9975838, lonmax=47.9823797,
+        latmin=3.397448, latmax=14.8940537
+    )
+
+
+def get_ethiopia_safe() -> Region:
+    return Region(
+        name='ethiopia_safe', lonmin=30, lonmax=50,
+        latmin=2, latmax=15
+    )
 
 
 def get_east_africa() -> Region:
@@ -181,5 +190,6 @@ def drop_nans_and_flatten(dataArray: xr.DataArray) -> np.ndarray:
 region_lookup = {
     "kenya": get_kenya(),
     "ethiopia": get_ethiopia(),
+    "ethiopia_safe": get_ethiopia_safe(),
     "east_africa": get_east_africa(),
 }
diff --git a/tests/preprocess/test_icdc.py b/tests/preprocess/test_icdc.py
new file mode 100644
index 000000000..e726bd9eb
--- /dev/null
+++ b/tests/preprocess/test_icdc.py
@@ -0,0 +1,126 @@
+import xarray as xr
+import numpy as np
+from datetime import datetime
+
+from src.preprocess.icdc import ICDCPreprocessor, LAIModisAvhrrPreprocessor
+from src.utils import get_kenya
+from ..utils import _make_dataset
+
+
+class TestICDCPreprocessor:
+
+    @staticmethod
+    def test_make_filename():
+        test_file = 'testy_test.nc'
+        expected_output = 'testy_test_kenya.nc'
+
+        filename = ICDCPreprocessor.create_filename(test_file, 'kenya')
+        assert filename == expected_output, \
+            f'Expected output to be {expected_output}, got {filename}'
+
+    @staticmethod
+    def _make_icdc_dataset(size, lonmin=-180.0, lonmax=180.0,
+                           latmin=-55.152, latmax=75.024,
+                           add_times=True):
+        lat_len, lon_len = size
+        # create the vector
+        longitudes = np.linspace(lonmin, lonmax, lon_len)
+        latitudes = np.linspace(latmin, latmax, lat_len)
+
+        dims = ['lat', 'lon']
+        coords = {'lat': latitudes,
+                  'lon': longitudes}
+
+        if add_times:
+            size = (2, size[0], size[1])
+            dims.insert(0, 'time')
+            coords['time'] = [datetime(2019, 1, 1), datetime(2019, 1, 2)]
+        values = np.random.randint(100, size=size)
+
+        return xr.Dataset({'lai': (dims, values)}, coords=coords)
+
+    def _save_icdc_data(self, fpath, size):
+        ds = self._make_icdc_dataset(size)
+        if not fpath.parents[0].exists():
+            fpath.parents[0].mkdir(parents=True, exist_ok=True)
+        ds.to_netcdf(fpath)
+
+    @staticmethod
+    def test_directories_created(tmp_path):
+        v = LAIModisAvhrrPreprocessor(tmp_path)
+
+        assert (
+            tmp_path / v.preprocessed_folder / 'avhrr_modis_lai_preprocessed'
+        ).exists(), \
+            'Should have created a directory tmp_path/interim/avhrr_modis_lai_preprocessed'
+
+        assert (
+            tmp_path / v.preprocessed_folder / 'avhrr_modis_lai_interim'
+        ).exists(), \
+            'Should have created a directory tmp_path/interim/avhrr_modis_lai_interim'
+
+    @staticmethod
+    def test_get_filenames(tmp_path):
+        icdc_data_dir = tmp_path / 'pool' / 'data' / 'ICDC'
+        icdc_path = icdc_data_dir / 'avhrr_modis_lai' / 'DATA'
+        (icdc_path).mkdir(parents=True)
+
+        test_file = icdc_path / 'testy_test.nc'
+        test_file.touch()
+
+        processor = LAIModisAvhrrPreprocessor(tmp_path)
+
+        # overwrite internal icdc_data_dir to mock behaviour
+        processor.icdc_data_dir = icdc_data_dir
+
+        files = processor.get_icdc_filepaths()
+        assert files[0] == test_file, f'Expected {test_file} to be retrieved'
+
+    def test_preprocess(self, tmp_path):
+        icdc_data_dir = tmp_path / 'pool' / 'data' / 'ICDC'
+        icdc_path = icdc_data_dir / 'avhrr_modis_lai' / 'DATA'
+        (icdc_path).mkdir(parents=True)
+        icdc_path = icdc_path / 'GlobMap_V01_LAI__2005097__UHAM-ICDC.nc'
+        self._save_icdc_data(icdc_path, (50, 50))
+
+        kenya = get_kenya()
+        regrid_dataset, _, _ = _make_dataset(size=(20, 20),
+                                             latmin=kenya.latmin, latmax=kenya.latmax,
+                                             lonmin=kenya.lonmin, lonmax=kenya.lonmax)
+
+        regrid_path = tmp_path / 'regridder.nc'
+        regrid_dataset.to_netcdf(regrid_path)
+
+        processor = LAIModisAvhrrPreprocessor(tmp_path)
+        # overwrite internal icdc_data_dir to mock behaviour
+        processor.icdc_data_dir = icdc_data_dir
+
+        processor.preprocess(
+            subset_str='kenya', regrid=regrid_path, cleanup=True
+        )
+
+        expected_out_path = tmp_path / 'interim/' \
+            'avhrr_modis_lai_preprocessed/avhrr_modis_lai_kenya.nc'
+        assert expected_out_path.exists(), \
+            f'Expected processed file to be saved to {expected_out_path}'
+
+        # check the subsetting happened correctly
+        out_data = xr.open_dataset(expected_out_path)
+        expected_dims = ['lat', 'lon', 'time']
+        assert len(list(out_data.dims)) == len(expected_dims)
+        for dim in expected_dims:
+            assert dim in list(out_data.dims), \
+                f'Expected {dim} to be in the processed dataset dims'
+
+        lons = out_data.lon.values
+        assert (lons.min() >= kenya.lonmin) and (lons.max() <= kenya.lonmax), \
+            'Longitudes not correctly subset'
+
+        lats = out_data.lat.values
+        assert (lats.min() >= kenya.latmin) and (lats.max() <= kenya.latmax), \
+            'Latitudes not correctly subset'
+
+        assert out_data.lai.values.shape[1:] == (20, 20)
+
+        assert not processor.interim.exists(), \
+            'Interim icdc folder should have been deleted'