From 421d068fc8c59d9a3b00d747a61554177a635824 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 5 Aug 2019 10:00:23 +0100 Subject: [PATCH 01/18] icdc initial commit --- src/preprocess/icdc.py | 111 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 src/preprocess/icdc.py diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py new file mode 100644 index 000000000..eae2820ce --- /dev/null +++ b/src/preprocess/icdc.py @@ -0,0 +1,111 @@ +from pathlib import Path +import xarray as xr +from shutil import rmtree +from typing import Optional + +from .base import BasePreProcessor + + +class ICDCPreprocessor(BasePreProcessor): + """ For working with data on ICDC (TEMP.) + """ + variable: str + icdc_data_dir = Path('/pool/data/ICDC/land/') + + def get_filepaths(self) -> List[Path]: + dir = self.icdc_data_dir / self.variable / 'DATA' + years = [d.name for d in dir.iterdir() if d.is_dir()] + + filepaths = [] + for year in years: + filepaths.extend((dir / year).glob('*.nc')) + + return filepaths + + @staticmethod + def create_filename(netcdf_filename: str, + subset_name: Optional[str] = None) -> str: + """ + {base_str}.nc + """ + filename_stem = netcdf_filename[:-3] + if subset_name is not None: + new_filename = f'{filename_stem}_{subset_name}.nc' + else: + new_filename = f'{filename_stem}.nc' + return new_filename + + def _preprocess_single(self, netcdf_filepath: Path, + subset_str: Optional[str] = 'kenya', + regrid: Optional[xr.Dataset] = None) -> None: + """Run the Preprocessing steps for the GLEAM data + + Process: + ------- + * chop out ROI + * create new dataset with regrid dimensions + * Save the output file to new folder + """ + print(f'Starting work on {netcdf_filepath.name}') + # 1. read in the dataset + ds = xr.open_dataset(netcdf_filepath) + + # 2. chop out EastAfrica + if subset_str is not None: + ds = self.chop_roi(ds, subset_str, inverse_lat=True) + + if regrid is not None: + ds = self.regrid(ds, regrid) + + # 6. create the filepath and save to that location + assert netcdf_filepath.name[-3:] == '.nc', \ + f'filepath name should be a .nc file. Currently: {netcdf_filepath.name}' + + filename = self.create_filename( + netcdf_filepath.name, + subset_name=subset_str if subset_str is not None else None + ) + print(f"Saving to {self.interim}/{filename}") + ds.to_netcdf(self.interim / filename) + + print(f"** Done for {self.variable} {netcdf_filepath.name} **") + + def merge_files(self): + pass + + def preprocess(self, subset_str: Optional[str] = 'kenya', + regrid: Optional[Path] = None, + resample_time: Optional[str] = 'M', + upsampling: bool = False, + cleanup: bool = True) -> None: + """ Preprocess all of the GLEAM .nc files to produce + one subset file. + + Arguments + ---------- + subset_str: Optional[str] = 'kenya' + Whether to subset Kenya when preprocessing + regrid: Optional[Path] = None + If a Path is passed, the CHIRPS files will be regridded to have the same + grid as the dataset at that Path. If None, no regridding happens + resample_time: str = 'M' + If not None, defines the time length to which the data will be resampled + upsampling: bool = False + If true, tells the class the time-sampling will be upsampling. In this case, + nearest instead of mean is used for the resampling + cleanup: bool = True + If true, delete interim files created by the class + """ + nc_files = self.get_filepaths() + + if regrid is not None: + regrid = self.load_reference_grid(regrid) + + for file in nc_files: + self._preprocess_single(file, subset_str, regrid) + + # merge all of the timesteps + self.merge_files(subset_str, resample_time, upsampling) + + if cleanup: + rmtree(self.interim) From 15743d8884ed2c27f5206604dc74eca90c8a4574 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 5 Aug 2019 10:06:20 +0100 Subject: [PATCH 02/18] add instances --- src/preprocess/icdc.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index eae2820ce..21743d968 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -109,3 +109,15 @@ def preprocess(self, subset_str: Optional[str] = 'kenya', if cleanup: rmtree(self.interim) + + +class ESACCISoilMoisture(ICDCPreprocessor): + variable = 'esa_cci_soilmoisture' + + +class LAIModisAvhrr(ICDCPreprocessor): + variable = 'avhrr_modis_lai' + + +class ModisNDVI(ICDCPreprocessor): + variable = 'modis_aqua_vegetationindex' From 5a1a52f13cb733b4100c07a38ef32157bd37a048 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 5 Aug 2019 10:25:15 +0100 Subject: [PATCH 03/18] update flake etc. --- scripts/drafts/icdc.py | 9 +++++++++ src/preprocess/icdc.py | 7 ++----- 2 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 scripts/drafts/icdc.py diff --git a/scripts/drafts/icdc.py b/scripts/drafts/icdc.py new file mode 100644 index 000000000..2a0783645 --- /dev/null +++ b/scripts/drafts/icdc.py @@ -0,0 +1,9 @@ +from src.preprocess.icdc import ( + ESACCISoilMoisture, + LAIModisAvhrr, + ModisNDVI +) + +processor = ModisNDVI() + +processor.preprocess() diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 21743d968..1030899bc 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -1,7 +1,7 @@ from pathlib import Path import xarray as xr from shutil import rmtree -from typing import Optional +from typing import Optional, List from .base import BasePreProcessor @@ -16,7 +16,7 @@ def get_filepaths(self) -> List[Path]: dir = self.icdc_data_dir / self.variable / 'DATA' years = [d.name for d in dir.iterdir() if d.is_dir()] - filepaths = [] + filepaths: List = [] for year in years: filepaths.extend((dir / year).glob('*.nc')) @@ -70,9 +70,6 @@ def _preprocess_single(self, netcdf_filepath: Path, print(f"** Done for {self.variable} {netcdf_filepath.name} **") - def merge_files(self): - pass - def preprocess(self, subset_str: Optional[str] = 'kenya', regrid: Optional[Path] = None, resample_time: Optional[str] = 'M', From dcdcc315d22281f488ec3b680c10d42fab12fa2e Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 5 Aug 2019 10:28:07 +0100 Subject: [PATCH 04/18] use dataset --- src/preprocess/icdc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 1030899bc..397a34235 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -13,7 +13,7 @@ class ICDCPreprocessor(BasePreProcessor): icdc_data_dir = Path('/pool/data/ICDC/land/') def get_filepaths(self) -> List[Path]: - dir = self.icdc_data_dir / self.variable / 'DATA' + dir = self.icdc_data_dir / self.dataset / 'DATA' years = [d.name for d in dir.iterdir() if d.is_dir()] filepaths: List = [] @@ -109,12 +109,12 @@ def preprocess(self, subset_str: Optional[str] = 'kenya', class ESACCISoilMoisture(ICDCPreprocessor): - variable = 'esa_cci_soilmoisture' + dataset = 'esa_cci_soilmoisture' class LAIModisAvhrr(ICDCPreprocessor): - variable = 'avhrr_modis_lai' + dataset = 'avhrr_modis_lai' class ModisNDVI(ICDCPreprocessor): - variable = 'modis_aqua_vegetationindex' + dataset = 'modis_aqua_vegetationindex' From 6b5106deb888a0af2d079c305cb95a8162667d0f Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 5 Aug 2019 10:29:30 +0100 Subject: [PATCH 05/18] use dataset --- src/preprocess/icdc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 397a34235..03d163bc2 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -68,7 +68,7 @@ def _preprocess_single(self, netcdf_filepath: Path, print(f"Saving to {self.interim}/{filename}") ds.to_netcdf(self.interim / filename) - print(f"** Done for {self.variable} {netcdf_filepath.name} **") + print(f"** Done for {self.dataset} {netcdf_filepath.name} **") def preprocess(self, subset_str: Optional[str] = 'kenya', regrid: Optional[Path] = None, From 855e727f8b9ff5f19a1a834c58fe1ade493a1fb2 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 5 Aug 2019 13:20:56 +0100 Subject: [PATCH 06/18] update get filepaths --- scripts/drafts/icdc.py | 6 +++++- src/preprocess/icdc.py | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/drafts/icdc.py b/scripts/drafts/icdc.py index 2a0783645..e0e09ae9e 100644 --- a/scripts/drafts/icdc.py +++ b/scripts/drafts/icdc.py @@ -5,5 +5,9 @@ ) processor = ModisNDVI() +# processor.preprocess() -processor.preprocess() +subset_str='kenya' +resample_time='M' +upsampling=False +processor.merge_files(subset_str, resample_time, upsampling) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 03d163bc2..4da2996b3 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -12,7 +12,7 @@ class ICDCPreprocessor(BasePreProcessor): variable: str icdc_data_dir = Path('/pool/data/ICDC/land/') - def get_filepaths(self) -> List[Path]: + def get_icdc_filepaths(self) -> List[Path]: dir = self.icdc_data_dir / self.dataset / 'DATA' years = [d.name for d in dir.iterdir() if d.is_dir()] @@ -74,7 +74,7 @@ def preprocess(self, subset_str: Optional[str] = 'kenya', regrid: Optional[Path] = None, resample_time: Optional[str] = 'M', upsampling: bool = False, - cleanup: bool = True) -> None: + cleanup: bool = False) -> None: """ Preprocess all of the GLEAM .nc files to produce one subset file. @@ -93,7 +93,7 @@ def preprocess(self, subset_str: Optional[str] = 'kenya', cleanup: bool = True If true, delete interim files created by the class """ - nc_files = self.get_filepaths() + nc_files = self.get_icdc_filepaths() if regrid is not None: regrid = self.load_reference_grid(regrid) From 3f1804c61324b997a3c9db061948a14002fb9de5 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 5 Aug 2019 13:27:59 +0100 Subject: [PATCH 07/18] update icdc --- scripts/drafts/icdc.py | 25 +++++++++++++++++++------ src/preprocess/icdc.py | 6 +++++- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/scripts/drafts/icdc.py b/scripts/drafts/icdc.py index e0e09ae9e..b682d2d5a 100644 --- a/scripts/drafts/icdc.py +++ b/scripts/drafts/icdc.py @@ -4,10 +4,23 @@ ModisNDVI ) -processor = ModisNDVI() -# processor.preprocess() -subset_str='kenya' -resample_time='M' -upsampling=False -processor.merge_files(subset_str, resample_time, upsampling) +def modis_ndvi(): + processor = ModisNDVI() + processor.preprocess() + + +def cci_soil_moisture(): + processor = ESACCISoilMoisture() + processor.preprocess() + + +def modis_lai(): + processor = LAIModisAvhrr() + processor.preprocess() + + +if __name__ == '__main__': + modis_ndvi() + cci_soil_moisture() + modis_lai() diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 4da2996b3..4c9a6a84f 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -20,7 +20,11 @@ def get_icdc_filepaths(self) -> List[Path]: for year in years: filepaths.extend((dir / year).glob('*.nc')) - return filepaths + if filepaths != []: + return filepaths + else: + filepaths.extend((dir).glob('*.nc')) + return filepaths @staticmethod def create_filename(netcdf_filename: str, From 68f66dcd16dde4d3f21510a8d57430dd8de4bd35 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Mon, 5 Aug 2019 14:02:05 +0100 Subject: [PATCH 08/18] add alot of preprocessors --- src/preprocess/icdc.py | 110 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 4c9a6a84f..77dd8eab0 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -10,7 +10,7 @@ class ICDCPreprocessor(BasePreProcessor): """ For working with data on ICDC (TEMP.) """ variable: str - icdc_data_dir = Path('/pool/data/ICDC/land/') + icdc_data_dir = Path(f'/pool/data/ICDC/{self.source}/') def get_icdc_filepaths(self) -> List[Path]: dir = self.icdc_data_dir / self.dataset / 'DATA' @@ -113,12 +113,120 @@ def preprocess(self, subset_str: Optional[str] = 'kenya', class ESACCISoilMoisture(ICDCPreprocessor): + source = 'land' dataset = 'esa_cci_soilmoisture' class LAIModisAvhrr(ICDCPreprocessor): + source = 'land' dataset = 'avhrr_modis_lai' class ModisNDVI(ICDCPreprocessor): + source = 'land' dataset = 'modis_aqua_vegetationindex' + + +class AMSRESoilMoisture(ICDCPreprocessor): + source = 'land' + dataset = 'amsre_soilmoisture' + + +class ASCATSoilMoisture(ICDCPreprocessor): + source = 'land' + dataset = 'ascat_soilmoisture' + + +class EUMetsatAlbedo(ICDCPreprocessor): + source = 'land' + dataset = 'eumetsat_albedo' + + +class EUMetSatAlbedo2(ICDCPreprocessor): + source = 'land' + dataset = 'eumetsat_clara2_surfacealbedo' + + +class EUMetSatRadiation(ICDCPreprocessor): + source = 'land' + dataset = 'eumetsat_clara2_surfaceradiation' + + +class EUMetSatIrradiance(ICDCPreprocessor): + source = 'land' + dataset = 'eumetsat_surfacesolarirradiance' + + +class SpotFAPAR(ICDCPreprocessor): + source = 'land' + dataset = 'fapar_spot_proba_v' + + +class GLEAMEvaporation(ICDCPreprocessor): + source = 'land' + dataset = 'gleam_evaporation' + + +class SpotLai(ICDCPreprocessor): + source = 'land' + dataset = 'lai_spot_proba_v' + + +class SpotLSAlbedo(ICDCPreprocessor): + source = 'land' + dataset = 'land_surface_albedo_spot' + + +class ModisAlbedo(ICDCPreprocessor): + source = 'land' + dataset = 'modis_albedo' + + +class ModisForestCover(ICDCPreprocessor): + source = 'land' + dataset = 'modis_forestcoverfraction' + + +class ModisLandcover(ICDCPreprocessor): + source = 'land' + dataset = 'modis_landcover' + + +class ModisLatLon(ICDCPreprocessor): + source = 'land' + dataset = 'modis_latlon' + + +class ModisLSTClimatology(ICDCPreprocessor): + source = 'land' + dataset = 'modis_lst_climatology' + + +class ModisNPP(ICDCPreprocessor): + source = 'land' + dataset = 'modis_primary_production' + + +class ModisSRTM(ICDCPreprocessor): + source = 'land' + dataset = 'modis-srtm_landwaterdistribution' + + +class ModisLST(ICDCPreprocessor): + source = 'land' + dataset = 'modis_terra_landsurfacetemperature' + + +class SMOSSoilMoisture(ICDCPreprocessor): + source = 'land' + dataset = 'smos_soilmoisture' + + +class Topography(ICDCPreprocessor): + source = 'land' + dataset = 'topography' + + +class SpotVegetationCoverFraction(ICDCPreprocessor): + source = 'land' + dataset = 'vegetationcoverfraction_spot_proba_v' From 0bf1b095cd35df05469ce21f577878131aa3ed09 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Tue, 6 Aug 2019 10:08:15 +0100 Subject: [PATCH 09/18] proper init --- src/preprocess/icdc.py | 56 ++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 77dd8eab0..a3b98aa27 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -9,8 +9,12 @@ class ICDCPreprocessor(BasePreProcessor): """ For working with data on ICDC (TEMP.) """ - variable: str - icdc_data_dir = Path(f'/pool/data/ICDC/{self.source}/') + variable: str # the name of the variable on icdc + source: str # {'land', 'atmosphere', 'climate_indices', 'ocean', 'ice_and_snow'} + + def __init__(self, data_folder: Path = Path('data')) -> None: + super().__init__(data_folder) + icdc_data_dir = Path(f'/pool/data/ICDC/{self.source}/') def get_icdc_filepaths(self) -> List[Path]: dir = self.icdc_data_dir / self.dataset / 'DATA' @@ -112,121 +116,121 @@ def preprocess(self, subset_str: Optional[str] = 'kenya', rmtree(self.interim) -class ESACCISoilMoisture(ICDCPreprocessor): +class ESACCISoilMoisturePreprocessor(ICDCPreprocessor): source = 'land' dataset = 'esa_cci_soilmoisture' -class LAIModisAvhrr(ICDCPreprocessor): +class LAIModisAvhrrPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'avhrr_modis_lai' -class ModisNDVI(ICDCPreprocessor): +class ModisNDVIPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis_aqua_vegetationindex' -class AMSRESoilMoisture(ICDCPreprocessor): +class AMSRESoilMoisturePreprocessor(ICDCPreprocessor): source = 'land' dataset = 'amsre_soilmoisture' -class ASCATSoilMoisture(ICDCPreprocessor): +class ASCATSoilMoisturePreprocessor(ICDCPreprocessor): source = 'land' dataset = 'ascat_soilmoisture' -class EUMetsatAlbedo(ICDCPreprocessor): +class EUMetsatAlbedoPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'eumetsat_albedo' -class EUMetSatAlbedo2(ICDCPreprocessor): +class EUMetSatAlbedo2Preprocessor(ICDCPreprocessor): source = 'land' dataset = 'eumetsat_clara2_surfacealbedo' -class EUMetSatRadiation(ICDCPreprocessor): +class EUMetSatRadiationPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'eumetsat_clara2_surfaceradiation' -class EUMetSatIrradiance(ICDCPreprocessor): +class EUMetSatIrradiancePreprocessor(ICDCPreprocessor): source = 'land' dataset = 'eumetsat_surfacesolarirradiance' -class SpotFAPAR(ICDCPreprocessor): +class SpotFAPARPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'fapar_spot_proba_v' -class GLEAMEvaporation(ICDCPreprocessor): +class GLEAMEvaporationPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'gleam_evaporation' -class SpotLai(ICDCPreprocessor): +class SpotLaiPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'lai_spot_proba_v' -class SpotLSAlbedo(ICDCPreprocessor): +class SpotLSAlbedoPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'land_surface_albedo_spot' -class ModisAlbedo(ICDCPreprocessor): +class ModisAlbedoPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis_albedo' -class ModisForestCover(ICDCPreprocessor): +class ModisForestCoverPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis_forestcoverfraction' -class ModisLandcover(ICDCPreprocessor): +class ModisLandcoverPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis_landcover' -class ModisLatLon(ICDCPreprocessor): +class ModisLatLonPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis_latlon' -class ModisLSTClimatology(ICDCPreprocessor): +class ModisLSTClimatologyPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis_lst_climatology' -class ModisNPP(ICDCPreprocessor): +class ModisNPPPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis_primary_production' -class ModisSRTM(ICDCPreprocessor): +class ModisSRTMPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis-srtm_landwaterdistribution' -class ModisLST(ICDCPreprocessor): +class ModisLSTPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'modis_terra_landsurfacetemperature' -class SMOSSoilMoisture(ICDCPreprocessor): +class SMOSSoilMoisturePreprocessor(ICDCPreprocessor): source = 'land' dataset = 'smos_soilmoisture' -class Topography(ICDCPreprocessor): +class TopographyPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'topography' -class SpotVegetationCoverFraction(ICDCPreprocessor): +class SpotVegetationCoverFractionPreprocessor(ICDCPreprocessor): source = 'land' dataset = 'vegetationcoverfraction_spot_proba_v' From afa8d2255bcd7c8b136a9e7214d5dc1d404c357e Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Tue, 6 Aug 2019 10:56:10 +0100 Subject: [PATCH 10/18] add testing --- src/preprocess/icdc.py | 11 +-- tests/preprocess/test_icdc.py | 126 ++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 tests/preprocess/test_icdc.py diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index a3b98aa27..e37bc2500 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -7,14 +7,14 @@ class ICDCPreprocessor(BasePreProcessor): - """ For working with data on ICDC (TEMP.) + """ For working with data on ICDC (SPECIFIC to Uni Server) """ variable: str # the name of the variable on icdc source: str # {'land', 'atmosphere', 'climate_indices', 'ocean', 'ice_and_snow'} def __init__(self, data_folder: Path = Path('data')) -> None: - super().__init__(data_folder) - icdc_data_dir = Path(f'/pool/data/ICDC/{self.source}/') + super().__init__(data_folder) + self.icdc_data_dir = Path(f'/pool/data/ICDC/{self.source}/') def get_icdc_filepaths(self) -> List[Path]: dir = self.icdc_data_dir / self.dataset / 'DATA' @@ -60,7 +60,10 @@ def _preprocess_single(self, netcdf_filepath: Path, # 2. chop out EastAfrica if subset_str is not None: - ds = self.chop_roi(ds, subset_str, inverse_lat=True) + try: + ds = self.chop_roi(ds, subset_str, inverse_lat=True) + except AssertionError: + ds = self.chop_roi(ds, subset_str, inverse_lat=False) if regrid is not None: ds = self.regrid(ds, regrid) diff --git a/tests/preprocess/test_icdc.py b/tests/preprocess/test_icdc.py new file mode 100644 index 000000000..e726bd9eb --- /dev/null +++ b/tests/preprocess/test_icdc.py @@ -0,0 +1,126 @@ +import xarray as xr +import numpy as np +from datetime import datetime + +from src.preprocess.icdc import ICDCPreprocessor, LAIModisAvhrrPreprocessor +from src.utils import get_kenya +from ..utils import _make_dataset + + +class TestICDCPreprocessor: + + @staticmethod + def test_make_filename(): + test_file = 'testy_test.nc' + expected_output = 'testy_test_kenya.nc' + + filename = ICDCPreprocessor.create_filename(test_file, 'kenya') + assert filename == expected_output, \ + f'Expected output to be {expected_output}, got {filename}' + + @staticmethod + def _make_icdc_dataset(size, lonmin=-180.0, lonmax=180.0, + latmin=-55.152, latmax=75.024, + add_times=True): + lat_len, lon_len = size + # create the vector + longitudes = np.linspace(lonmin, lonmax, lon_len) + latitudes = np.linspace(latmin, latmax, lat_len) + + dims = ['lat', 'lon'] + coords = {'lat': latitudes, + 'lon': longitudes} + + if add_times: + size = (2, size[0], size[1]) + dims.insert(0, 'time') + coords['time'] = [datetime(2019, 1, 1), datetime(2019, 1, 2)] + values = np.random.randint(100, size=size) + + return xr.Dataset({'lai': (dims, values)}, coords=coords) + + def _save_icdc_data(self, fpath, size): + ds = self._make_icdc_dataset(size) + if not fpath.parents[0].exists(): + fpath.parents[0].mkdir(parents=True, exist_ok=True) + ds.to_netcdf(fpath) + + @staticmethod + def test_directories_created(tmp_path): + v = LAIModisAvhrrPreprocessor(tmp_path) + + assert ( + tmp_path / v.preprocessed_folder / 'avhrr_modis_lai_preprocessed' + ).exists(), \ + 'Should have created a directory tmp_path/interim/avhrr_modis_lai_preprocessed' + + assert ( + tmp_path / v.preprocessed_folder / 'avhrr_modis_lai_interim' + ).exists(), \ + 'Should have created a directory tmp_path/interim/chirps_interim' + + @staticmethod + def test_get_filenames(tmp_path): + icdc_data_dir = tmp_path / 'pool' / 'data' / 'ICDC' + icdc_path = icdc_data_dir / 'avhrr_modis_lai' / 'DATA' + (icdc_path).mkdir(parents=True) + + test_file = icdc_path / 'testy_test.nc' + test_file.touch() + + processor = LAIModisAvhrrPreprocessor(tmp_path) + + # overwrite internal icdc_data_dir to mock behaviour + processor.icdc_data_dir = icdc_data_dir + + files = processor.get_icdc_filepaths() + assert files[0] == test_file, f'Expected {test_file} to be retrieved' + + def test_preprocess(self, tmp_path): + icdc_data_dir = tmp_path / 'pool' / 'data' / 'ICDC' + icdc_path = icdc_data_dir / 'avhrr_modis_lai' / 'DATA' + (icdc_path).mkdir(parents=True) + icdc_path = icdc_path / 'GlobMap_V01_LAI__2005097__UHAM-ICDC.nc' + self._save_icdc_data(icdc_path, (50, 50)) + + kenya = get_kenya() + regrid_dataset, _, _ = _make_dataset(size=(20, 20), + latmin=kenya.latmin, latmax=kenya.latmax, + lonmin=kenya.lonmin, lonmax=kenya.lonmax) + + regrid_path = tmp_path / 'regridder.nc' + regrid_dataset.to_netcdf(regrid_path) + + processor = LAIModisAvhrrPreprocessor(tmp_path) + # overwrite internal icdc_data_dir to mock behaviour + processor.icdc_data_dir = icdc_data_dir + + processor.preprocess( + subset_str='kenya', regrid=regrid_path, cleanup=True + ) + + expected_out_path = tmp_path / 'interim/' \ + 'avhrr_modis_lai_preprocessed/avhrr_modis_lai_kenya.nc' + assert expected_out_path.exists(), \ + f'Expected processed file to be saved to {expected_out_path}' + + # check the subsetting happened correctly + out_data = xr.open_dataset(expected_out_path) + expected_dims = ['lat', 'lon', 'time'] + assert len(list(out_data.dims)) == len(expected_dims) + for dim in expected_dims: + assert dim in list(out_data.dims), \ + f'Expected {dim} to be in the processed dataset dims' + + lons = out_data.lon.values + assert (lons.min() >= kenya.lonmin) and (lons.max() <= kenya.lonmax), \ + 'Longitudes not correctly subset' + + lats = out_data.lat.values + assert (lats.min() >= kenya.latmin) and (lats.max() <= kenya.latmax), \ + 'Latitudes not correctly subset' + + assert out_data.lai.values.shape[1:] == (20, 20) + + assert not processor.interim.exists(), \ + f'Interim chirps folder should have been deleted' From 23b70ae05db502705b3eaf252c50d28a143614e2 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Tue, 6 Aug 2019 11:00:21 +0100 Subject: [PATCH 11/18] add docs --- src/preprocess/icdc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index e37bc2500..5bbd21e33 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -46,7 +46,8 @@ def create_filename(netcdf_filename: str, def _preprocess_single(self, netcdf_filepath: Path, subset_str: Optional[str] = 'kenya', regrid: Optional[xr.Dataset] = None) -> None: - """Run the Preprocessing steps for the GLEAM data + """Run the Preprocessing steps for the data stored on ICDC + https://icdc.cen.uni-hamburg.de/1/daten.html Process: ------- From 31b2653f829271b4a5fa331230f26d879b9faee4 Mon Sep 17 00:00:00 2001 From: Tommy Lees Date: Mon, 12 Aug 2019 11:55:22 +0100 Subject: [PATCH 12/18] Update icdc.py --- src/preprocess/icdc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 5bbd21e33..f431f4dab 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -7,7 +7,7 @@ class ICDCPreprocessor(BasePreProcessor): - """ For working with data on ICDC (SPECIFIC to Uni Server) + """ For working with data on ICDC (SPECIFIC to one university Server) """ variable: str # the name of the variable on icdc source: str # {'land', 'atmosphere', 'climate_indices', 'ocean', 'ice_and_snow'} From 113e5e89ca798a5b1eca1acbc9b4af24a18fc398 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Fri, 6 Sep 2019 14:34:51 +0100 Subject: [PATCH 13/18] create new ethiopia_safe region --- src/utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/utils.py b/src/utils.py index 56b062075..c1a48d404 100644 --- a/src/utils.py +++ b/src/utils.py @@ -26,8 +26,17 @@ def get_kenya() -> Region: def get_ethiopia() -> Region: - return Region(name='ethiopia', lonmin=32.9975838, lonmax=47.9823797, - latmin=3.397448, latmax=14.8940537) + return Region( + name='ethiopia', lonmin=32.9975838, lonmax=47.9823797, + latmin=3.397448, latmax=14.8940537 + ) + + +def get_ethiopia_safe() -> Region: + return Region( + name='ethiopia_safe', lonmin=30, lonmax=50, + latmin=2, latmax=15 + ) def get_east_africa() -> Region: @@ -181,5 +190,6 @@ def drop_nans_and_flatten(dataArray: xr.DataArray) -> np.ndarray: region_lookup = { "kenya": get_kenya(), "ethiopia": get_ethiopia(), + "ethiopia_safe": get_ethiopia_safe(), "east_africa": get_east_africa(), } From 50e84b973dc20ac183024c82f3906a09bd364a5d Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Fri, 6 Sep 2019 14:39:54 +0100 Subject: [PATCH 14/18] preprocess script for ethiopia region --- scripts/preprocess.py | 37 +++++++++++++++++++++++++++++-------- src/preprocess/icdc.py | 2 +- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/scripts/preprocess.py b/scripts/preprocess.py index d64250a37..e5a865b7e 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -9,7 +9,6 @@ from src.preprocess.admin_boundaries import KenyaAdminPreprocessor - def process_vci_2018(): # if the working directory is alread ml_drought don't need ../data if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought': @@ -127,11 +126,33 @@ def preprocess_era5(): processor.preprocess(subset_str='kenya', regrid=regrid_path) +def preprocess_icdc(): + from src.preprocess.icdc import ( + ModisNDVIPreprocessor, + ModisLSTPreprocessor, + ) + if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought': + data_path = Path('data') + else: + data_path = Path('../data') + processor = ModisNDVIPreprocessor(data_path) + processor.preprocess( + subset_str='ethiopia_safe', + regrid=regrid_path + ) + + processor = ModisLSTPreprocessor(data_path) + processor.preprocess( + subset_str='ethiopia_safe', + regrid=regrid_path + ) + if __name__ == '__main__': - process_vci_2018() - process_precip_2018() - process_era5POS_2018() - process_gleam() - process_esa_cci_landcover() - preprocess_srtm() - preprocess_era5() + # process_vci_2018() + # process_precip_2018() + # process_era5POS_2018() + # process_gleam() + # process_esa_cci_landcover() + # preprocess_srtm() + # preprocess_era5() + preprocess_icdc() diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index f431f4dab..5bbd21e33 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -7,7 +7,7 @@ class ICDCPreprocessor(BasePreProcessor): - """ For working with data on ICDC (SPECIFIC to one university Server) + """ For working with data on ICDC (SPECIFIC to Uni Server) """ variable: str # the name of the variable on icdc source: str # {'land', 'atmosphere', 'climate_indices', 'ocean', 'ice_and_snow'} From b8c3ce1a04c45ee08773f70f886339a208f2709a Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Fri, 6 Sep 2019 14:42:09 +0100 Subject: [PATCH 15/18] fix script --- scripts/preprocess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/preprocess.py b/scripts/preprocess.py index e5a865b7e..b82809ee7 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -138,13 +138,11 @@ def preprocess_icdc(): processor = ModisNDVIPreprocessor(data_path) processor.preprocess( subset_str='ethiopia_safe', - regrid=regrid_path ) processor = ModisLSTPreprocessor(data_path) processor.preprocess( - subset_str='ethiopia_safe', - regrid=regrid_path + subset_str='ethiopia_safe' ) if __name__ == '__main__': From 46ebc9c703f60d0b50b0777e73161d9d0deee23f Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Fri, 6 Sep 2019 15:03:10 +0100 Subject: [PATCH 16/18] extend for other directory formats --- src/preprocess/icdc.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 5bbd21e33..73aea210b 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -28,7 +28,13 @@ def get_icdc_filepaths(self) -> List[Path]: return filepaths else: filepaths.extend((dir).glob('*.nc')) - return filepaths + + if filepaths != []: + return filepaths + + else: + filepaths.extend((dir).glob('**/*.nc')) + return filepaths @staticmethod def create_filename(netcdf_filename: str, From 5a50fa901dd94335ff15943a7bb26a609169ef9f Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Fri, 6 Sep 2019 15:16:40 +0100 Subject: [PATCH 17/18] monthly data for lst modis --- src/preprocess/icdc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index 73aea210b..c0af435e7 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -33,7 +33,8 @@ def get_icdc_filepaths(self) -> List[Path]: return filepaths else: - filepaths.extend((dir).glob('**/*.nc')) + # HACKY: for the lst dataset + filepaths.extend((dir).glob('MONTHLY/*.nc')) return filepaths @staticmethod From 26cd1aefd95c8b82e210d2f1952275ca8dd4c500 Mon Sep 17 00:00:00 2001 From: tommylees112 Date: Fri, 6 Sep 2019 15:20:28 +0100 Subject: [PATCH 18/18] monthly data for lst modis --- src/preprocess/icdc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/preprocess/icdc.py b/src/preprocess/icdc.py index c0af435e7..178194ffa 100644 --- a/src/preprocess/icdc.py +++ b/src/preprocess/icdc.py @@ -34,7 +34,7 @@ def get_icdc_filepaths(self) -> List[Path]: else: # HACKY: for the lst dataset - filepaths.extend((dir).glob('MONTHLY/*.nc')) + filepaths.extend((dir).glob('MONTHLY/**/*.nc')) return filepaths @staticmethod