From 8e65a19a74c056ddc8f7cf7072b8488a7e0cb572 Mon Sep 17 00:00:00 2001
From: Sudipta Basak
Date: Thu, 9 Nov 2023 12:14:49 +1100
Subject: [PATCH] more logging to help troubleshooting

---
 uncoverml/features.py | 24 +++++++++++++++++++++---
 uncoverml/geoio.py    | 31 +++++++------------------------
 uncoverml/predict.py  |  2 +-
 uncoverml/validate.py |  4 ++--
 4 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/uncoverml/features.py b/uncoverml/features.py
index 161e3804..83d429d8 100644
--- a/uncoverml/features.py
+++ b/uncoverml/features.py
@@ -1,3 +1,5 @@
+from __future__ import division
+
 import logging
 from typing import Optional
 from collections import OrderedDict
@@ -12,13 +14,15 @@
 from uncoverml import patch
 from uncoverml import transforms
 from uncoverml.config import Config
-from uncoverml.geoio import RasterioImageSource
+# from uncoverml.geoio import RasterioImageSource
 
 log = logging.getLogger(__name__)
 
 
-def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize,
-                      template_source: Optional[RasterioImageSource] = None):
+def extract_subchunks(image_source, subchunk_index, n_subchunks, patchsize,
+                      template_source=None):
+    # extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize,
+    #                   template_source: Optional[RasterioImageSource] = None):
     equiv_chunks = n_subchunks * mpiops.chunks
     equiv_chunk_index = mpiops.chunks*subchunk_index + mpiops.chunk_index
     image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize, template_source)
@@ -71,7 +75,10 @@ def extract_features(image_source, targets, n_subchunks, patchsize):
 
 def transform_features(feature_sets, transform_sets, final_transform, config):
     # apply feature transforms
+    features = feature_names(config)
+    log.info(f"features are sorted as: \n {features}")
     transformed_vectors = [t(c) for c, t in zip(feature_sets, transform_sets)]
+
     # TODO remove this when cubist gets removed
     if config.cubist or config.multicubist:
         feature_vec = OrderedDict()
@@ -211,3 +218,14 @@ def remove_missing(x, targets=None):
 
     return x, classes
 
+def feature_names(config: Config):
+
+    results = []
+    for s in config.feature_sets:
+        feats = []
+        for tif in s.files:
+            name = basename(tif)
+            feats.append(name)
+        feats.sort()
+        results += feats
+    return results
diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py
index 1d51acaa..8f9d8b37 100644
--- a/uncoverml/geoio.py
+++ b/uncoverml/geoio.py
@@ -1,5 +1,4 @@
 from __future__ import division
-from typing import Optional
 import joblib
 import os.path
 from subprocess import run
@@ -14,11 +13,8 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 import rasterio
-from rasterio.warp import reproject
 from rasterio.windows import Window
-from xgboost import XGBRegressor
 from sklearn.cluster import DBSCAN
-from affine import Affine
 import numpy as np
 import shapefile
 import tables as hdf
@@ -27,7 +23,7 @@
 
 from uncoverml import mpiops
 from uncoverml import image
-from uncoverml import features
+from uncoverml import features as feat
 from uncoverml.config import Config
 from uncoverml.transforms import missing_percentage
 from uncoverml.targets import Targets
@@ -436,19 +432,6 @@ def output_thumbnails(self, ratio=10):
             resample(f, output_tif=thumbnail, ratio=ratio)
 
 
-def feature_names(config: Config):
-
-    results = []
-    for s in config.feature_sets:
-        feats = []
-        for tif in s.files:
-            name = os.path.basename(tif)
-            feats.append(name)
-        feats.sort()
-        results += feats
-    return results
-
-
 def _iterate_sources(f, config: Config):
 
     results = []
@@ -502,7 +485,7 @@ def f(image_source: RasterioImageSource):
             template_source = RasterioImageSource(config.prediction_template)
         else:
             template_source = None
-        r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize,
+        r = feat.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize,
                                        template_source=template_source)
         return r
     result = _iterate_sources(f, config)
@@ -522,7 +505,7 @@ def f(image_source):
         if config.intersected_features:
             r = extract_intersected_features(image_source, targets, config)
         else:
-            r = features.extract_features(image_source, targets,
+            r = feat.extract_features(image_source, targets,
                                           config.n_subchunks, config.patchsize)
         return r
     result = _iterate_sources(f, config)
@@ -534,9 +517,9 @@ def semisupervised_feature_sets(targets, config: Config):
     frac = config.subsample_fraction
 
     def f(image_source):
-        r_t = features.extract_features(image_source, targets, n_subchunks=1,
+        r_t = feat.extract_features(image_source, targets, n_subchunks=1,
                                         patchsize=config.patchsize)
-        r_a = features.extract_subchunks(image_source, subchunk_index=0,
+        r_a = feat.extract_subchunks(image_source, subchunk_index=0,
                                          n_subchunks=1,
                                          patchsize=config.patchsize)
         if frac < 1.0:
@@ -556,7 +539,7 @@ def unsupervised_feature_sets(config):
     frac = config.subsample_fraction
 
     def f(image_source):
-        r = features.extract_subchunks(image_source, subchunk_index=0,
+        r = feat.extract_subchunks(image_source, subchunk_index=0,
                                        n_subchunks=1,
                                        patchsize=config.patchsize)
        if frac < 1.0:
@@ -739,7 +722,7 @@ def export_validation_scatter_plot_and_validation_csv(outfile_results, config: C
 
 def plot_feature_correlation_matrix(config: Config, x_all):
     fig, corr_ax = plt.subplots()
-    features = [Path(f).stem for f in feature_names(config)]
+    features = [Path(f).stem for f in feat.feature_names(config)]
     corr_df = pd.DataFrame(x_all)
     corr_df.columns = features
     sns.heatmap(corr_df.corr(),
diff --git a/uncoverml/predict.py b/uncoverml/predict.py
index 1cc8d529..8ca4fe13 100644
--- a/uncoverml/predict.py
+++ b/uncoverml/predict.py
@@ -140,7 +140,7 @@ def _fix_for_corrupt_data(x, feature_names):
 
 
 def _get_data(subchunk, config):
-    features_names = geoio.feature_names(config)
+    features_names = features.feature_names(config)
 
     # NOTE: This returns an *untransformed* x,
     # which is ok as we just need dummies here
diff --git a/uncoverml/validate.py b/uncoverml/validate.py
index ccae96a8..51ec1c49 100644
--- a/uncoverml/validate.py
+++ b/uncoverml/validate.py
@@ -281,7 +281,7 @@ def permutation_importance(model, x_all, targets_all, config: Config):
         refit=False).fit,
         data=(x_all, y)
     )
-    feature_names = geoio.feature_names(config)
+    feature_names = feat.feature_names(config)
     df_picv = eli5.explain_weights_df(
         pi_cv, feature_names=feature_names, top=100)
     csv = Path(config.output_dir).joinpath(
@@ -573,7 +573,7 @@ def plot_permutation_feature_importance(model, x_all, targets_all, conf: Config,
         data=(x_all, y),
         model=model
     )
-    feature_names = [Path(f).stem for f in geoio.feature_names(conf)]
+    feature_names = [Path(f).stem for f in feat.feature_names(conf)]
     df_picv = eli5.explain_weights_df(
         pi_cv, feature_names=feature_names, top=100)
     csv = Path(conf.output_dir).joinpath(
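
Note (illustrative, not part of the patch): the relocated feature_names() helper sorts covariate basenames within each feature set and concatenates the sets in config order, which is the ordering the new log.info call in transform_features() reports. A minimal standalone Python sketch of that ordering follows; the tif paths and the feature_sets variable are hypothetical stand-ins for config.feature_sets.

# Sketch only: mirrors the ordering logic of feature_names() in this patch,
# using hypothetical covariate paths in place of config.feature_sets.
from os.path import basename

feature_sets = [
    ["/data/covariates/slope.tif", "/data/covariates/aspect.tif"],       # hypothetical set 1
    ["/data/covariates/gravity.tif", "/data/covariates/magnetics.tif"],  # hypothetical set 2
]

names = []
for files in feature_sets:
    feats = sorted(basename(f) for f in files)  # basenames sorted within each set
    names += feats                              # sets keep their configured order

print(names)
# ['aspect.tif', 'slope.tif', 'gravity.tif', 'magnetics.tif']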