diff --git a/flask_app/analyst/compare.py b/flask_app/analyst/compare.py
index 5d5f29f..e469694 100644
--- a/flask_app/analyst/compare.py
+++ b/flask_app/analyst/compare.py
@@ -122,6 +122,7 @@ def _get_comparative_measures(cls, summary_type, summary_key):
 # .............................................................................
 if __name__ == "__main__":
     dataset_key = "0000e36f-d0e9-46b0-aa23-cc1980f00515"
+    dataset_key = "3e2d26d9-2776-4bec-bdc7-bab3842ffb6b"
     species_key = "11378306 Phaneroptera laticerca"
     svc = CompareSvc()
     response = svc.get_endpoint()
diff --git a/sppy/tools/s2n/aggregate_data_matrix.py b/sppy/tools/s2n/aggregate_data_matrix.py
index 4eeaeac..16d7908 100644
--- a/sppy/tools/s2n/aggregate_data_matrix.py
+++ b/sppy/tools/s2n/aggregate_data_matrix.py
@@ -18,7 +18,7 @@ def __init__(self, table_type, data_datestr, logger=None):
         """Constructor for species by dataset comparisons.
 
         Args:
-            table_type (aws_constants.SUMMARY_TABLE_TYPES): type of aggregated data
+            table_type (sppy.tools.s2n.constants.SUMMARY_TABLE_TYPES): type of
+                aggregated data
             data_datestr (str): date of the source data in YYYY_MM_DD format.
             logger (object): An optional local logger to use for logging output
                 with consistent options
diff --git a/sppy/tools/s2n/constants.py b/sppy/tools/s2n/constants.py
index d4e8d02..0a79043 100644
--- a/sppy/tools/s2n/constants.py
+++ b/sppy/tools/s2n/constants.py
@@ -38,6 +38,7 @@ class Summaries:
     Note: Table code is the same as _
     Note: "datasetkey" is the original GBIF field
     """
+
     TABLES = {
         SUMMARY_TABLE_TYPES.DATASET_COUNTS: {
             "code": SUMMARY_TABLE_TYPES.DATASET_COUNTS,
@@ -105,7 +106,6 @@ class Summaries:
             "value": "measure",
         }
     }
-
     # ...............................................
     @classmethod
     def update_summary_tables(cls, datestr):
@@ -251,35 +251,46 @@ class SNKeys(Enum):
     # Column: type of aggregation
     (COL_TYPE,) = range(5000, 5001)
    # Column: One x
-    (COL_IDX, COL_LABEL, COL_COUNT, COL_TOTAL,
-     COL_MIN_COUNT, COL_MIN_LABELS, COL_MIN_INDEXES,
-     COL_MAX_COUNT, COL_MAX_LABELS, COL_MAX_INDEXES
-     ) = range(5100, 5110)
+    (COL_LABEL, COL_COUNT, COL_TOTAL,
+     COL_MIN_TOTAL, COL_MIN_TOTAL_NUMBER, COL_MAX_TOTAL, COL_MAX_TOTAL_LABELS,
+     ) = range(5100, 5107)
     # Column: All x
-    (COLS_TOTAL, COLS_MIN, COLS_MAX, COLS_MEAN, COLS_MEDIAN,
-     COLS_COUNT, COLS_COUNT_MIN, COLS_COUNT_MAX, COLS_COUNT_MEAN, COLS_COUNT_MEDIAN
-     ) = range(5200, 5210)
+    (COLS_TOTAL,
+     COLS_MIN_TOTAL, COLS_MIN_TOTAL_NUMBER, COLS_MEAN_TOTAL, COLS_MEDIAN_TOTAL,
+     COLS_MAX_TOTAL, COLS_MAX_TOTAL_LABELS,
+     COLS_COUNT,
+     COLS_MIN_COUNT, COLS_MIN_COUNT_NUMBER, COLS_MEAN_COUNT, COLS_MEDIAN_COUNT,
+     COLS_MAX_COUNT, COLS_MAX_COUNT_LABELS
+     ) = range(5200, 5214)
     # Row: aggregation of what type of data
     (ROW_TYPE,) = range(6000, 6001)
     # Row: One y
-    (ROW_IDX, ROW_LABEL, ROW_COUNT, ROW_TOTAL,
-     ROW_MIN_COUNT, ROW_MIN_LABELS, ROW_MIN_INDEXES,
-     ROW_MAX_COUNT, ROW_MAX_LABELS, ROW_MAX_INDEXES
-     ) = range(6100, 6110)
-    (ROWS_TOTAL, ROWS_MIN, ROWS_MAX, ROWS_MEAN, ROWS_MEDIAN,
-     ROWS_COUNT, ROWS_COUNT_MIN, ROWS_COUNT_MAX, ROWS_COUNT_MEAN, ROWS_COUNT_MEDIAN
-     ) = range(6200, 6210)
+    (ROW_LABEL, ROW_COUNT, ROW_TOTAL,
+     ROW_MIN_TOTAL, ROW_MIN_TOTAL_NUMBER, ROW_MAX_TOTAL, ROW_MAX_TOTAL_LABELS,
+     ) = range(6100, 6107)
+    # Rows: All y
+    (ROWS_TOTAL,
+     ROWS_MIN_TOTAL, ROWS_MIN_TOTAL_NUMBER, ROWS_MEAN_TOTAL, ROWS_MEDIAN_TOTAL,
+     ROWS_MAX_TOTAL, ROWS_MAX_TOTAL_LABELS,
+     ROWS_COUNT,
+     ROWS_MIN_COUNT, ROWS_MIN_COUNT_NUMBER, ROWS_MEAN_COUNT, ROWS_MEDIAN_COUNT,
+     ROWS_MAX_COUNT, ROWS_MAX_COUNT_LABELS
+     ) = range(6200, 6214)
     # Type of aggregation
     (TYPE,) = range(0, 1)
     # One field of row/column header
-    (ONE_IDX, ONE_LABEL, ONE_COUNT, ONE_TOTAL,
-     ONE_MIN_COUNT, ONE_MIN_LABELS, ONE_MIN_INDEXES,
-     ONE_MAX_COUNT, ONE_MAX_LABELS, ONE_MAX_INDEXES
-     ) = range(100, 110)
+    (ONE_LABEL, ONE_COUNT, ONE_TOTAL,
+     ONE_MIN_COUNT, ONE_MIN_COUNT_NUMBER,
+     ONE_MAX_COUNT, ONE_MAX_COUNT_LABELS
+     ) = range(100, 107)
     # Column: All row/column headers
-    (ALL_TOTAL, ALL_MIN, ALL_MAX, ALL_MEAN, ALL_MEDIAN,
-     ALL_COUNT, ALL_COUNT_MIN, ALL_COUNT_MAX, ALL_COUNT_MEAN, ALL_COUNT_MEDIAN
-     ) = range(200, 210)
+    (ALL_TOTAL,
+     ALL_MIN_TOTAL, ALL_MIN_TOTAL_NUMBER, ALL_MEAN_TOTAL, ALL_MEDIAN_TOTAL,
+     ALL_MAX_TOTAL, ALL_MAX_TOTAL_LABELS,
+     ALL_COUNT,
+     ALL_MIN_COUNT, ALL_MIN_COUNT_NUMBER, ALL_MEAN_COUNT, ALL_MEDIAN_COUNT,
+     ALL_MAX_COUNT, ALL_MAX_COUNT_LABELS,
+     ) = range(200, 214)
 
     @classmethod
     def get_keys_for_table(cls, table_type):
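The renamed `SNKeys` members rely on Python's tuple-unpacking of a `range`: each member receives one consecutive integer, so the member count on the left must equal the range length on the right (7 for `range(5100, 5107)`, 14 for `range(5200, 5214)`, and so on). A minimal, self-contained sketch of the pattern with hypothetical member names:

```python
from enum import Enum


class _DemoKeys(Enum):
    # Unpacking a range assigns consecutive integers to the members;
    # the tuple length must equal the range length (3 == 103 - 100).
    (ONE_LABEL, ONE_COUNT, ONE_TOTAL) = range(100, 103)


print(_DemoKeys.ONE_COUNT.value)   # 101
print(_DemoKeys(102).name)         # "ONE_TOTAL"
```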
@@ -302,77 +313,76 @@ def get_keys_for_table(cls, table_type):
                 # -----------------------------
                 cls.COL_TYPE: "dataset",
                 # One dataset
-                cls.COL_IDX: "dataset_index",
                 cls.COL_LABEL: "dataset_label",
                 # Count (non-zero elements in column)
                 cls.COL_COUNT: "total_species_for_dataset",
                 # Values (total of values in column)
                 cls.COL_TOTAL: "total_occurrences_for_dataset",
-                # Values: Minimum occurrence count for one dataset, species labels, indexes
-                cls.COL_MIN_COUNT: "min_occurrence_count_for_dataset",
-                cls.COL_MIN_LABELS: "species_with_min_occurrence_count_for_dataset",
-                cls.COL_MIN_INDEXES: "species_indexes_with_min_occurrence_count_for_dataset",
-                # Values: Maximum occurrence count for one dataset, species labels, indexes
-                cls.COL_MAX_COUNT: "max_occurrence_count_for_dataset",
-                cls.COL_MAX_LABELS: "species_with_max_occurrence_count_for_dataset",
-                cls.COL_MAX_INDEXES: "species_indexes_with_max_occurrence_count_for_dataset",
+                # Values: Minimum occurrences for one dataset, number of species
+                cls.COL_MIN_TOTAL: "min_occurrences_for_dataset",
+                cls.COL_MIN_TOTAL_NUMBER: "number_of_species_with_min_occurrences_for_dataset",
+                # Values: Maximum occurrences for one dataset, species labels
+                cls.COL_MAX_TOTAL: "max_occurrences_for_dataset",
+                cls.COL_MAX_TOTAL_LABELS: "species_with_max_occurrences_for_dataset",
                 # -----------------------------
                 # All datasets
                 # ------------
-                # COMPARES TO: cls.COL_TOTAL: "total_occurrences_for_dataset",
                 # Values: Total of all occurrences for all datasets - stats
-                cls.COLS_TOTAL: "total_occurrences_for_all_datasets",
-                cls.COLS_MIN: "min_occurrences_for_all_datasets",
-                cls.COLS_MAX: "max_occurrences_for_all_datasets",
-                cls.COLS_MEAN: "mean_occurrences_for_all_datasets",
-                cls.COLS_MEDIAN: "median_occurrences_for_all_datasets",
+                cls.COLS_TOTAL: "total_occurrences_of_all_datasets",
+                cls.COLS_MIN_TOTAL: "min_occurrences_of_all_datasets",
+                cls.COLS_MIN_TOTAL_NUMBER: "number_of_datasets_with_min_occurrences_of_all",
+                cls.COLS_MEAN_TOTAL: "mean_occurrences_of_all_datasets",
+                cls.COLS_MEDIAN_TOTAL: "median_occurrences_of_all_datasets",
+                cls.COLS_MAX_TOTAL: "max_occurrences_of_all_datasets",
+                cls.COLS_MAX_TOTAL_LABELS: "datasets_with_max_occurrences_of_all",
                 # ------------
-                # COMPARES TO: cls.COL_COUNT: "total_species_for_dataset",
-                # Counts: Count of all species (from all columns/datasets)
-                cls.COLS_COUNT: "total_species_count",
+                # Counts: Count of all datasets (from all columns)
+                cls.COLS_COUNT: "total_dataset_count",
                 # Species counts for all datasets - stats
-                cls.COLS_COUNT_MIN: "min_species_count_for_all_datasets",
-                cls.COLS_COUNT_MAX: "max_species_count_for_all_datasets",
-                cls.COLS_COUNT_MEAN: "mean_species_count_for_all_datasets",
-                cls.COLS_COUNT_MEDIAN: "median_species_count_for_all_datasets",
+                cls.COLS_MIN_COUNT: "min_species_count_of_all_datasets",
+                cls.COLS_MIN_COUNT_NUMBER: "number_of_datasets_with_min_species_count_of_all",
+                cls.COLS_MEAN_COUNT: "mean_species_count_of_all_datasets",
+                cls.COLS_MEDIAN_COUNT: "median_species_count_of_all_datasets",
+                cls.COLS_MAX_COUNT: "max_species_count_of_all_datasets",
+                cls.COLS_MAX_COUNT_LABELS: "datasets_with_max_species_count_of_all",
"median_species_count_for_all_datasets", + cls.COLS_MIN_COUNT: "min_species_count_of_all_datasets", + cls.COLS_MIN_COUNT_NUMBER: "number_of_datasets_with_min_species_count_of_all", + cls.COLS_MEAN_COUNT: "mean_species_count_of_all_datasets", + cls.COLS_MEDIAN_COUNT: "median_species_count_of_all_datasets", + cls.COLS_MAX_COUNT: "max_species_count_of_all_datasets", + cls.COLS_MAX_COUNT_LABELS: "datasets_with_max_species_count_of_all", # ---------------------------------------------------------------------- # Row # ----------------------------- cls.ROW_TYPE: "species", # One species - cls.ROW_IDX: "species_index", cls.ROW_LABEL: "species_label", # Count (non-zero elements in row) cls.ROW_COUNT: "total_datasets_for_species", # Values (total of values in row) cls.ROW_TOTAL: "total_occurrences_for_species", # Values: Minimum occurrence count for one species, dataset labels, indexes - cls.ROW_MIN_COUNT: "min_occurrence_count_for_species", - cls.ROW_MIN_LABELS: "datasets_with_min_count_for_species", - cls.ROW_MIN_INDEXES: "dataset_indexes_with_min_count_for_species", + cls.ROW_MIN_TOTAL: "min_occurrences_for_species", # Values: Maximum occurrence count for one species, dataset labels, indexes - cls.ROW_MAX_COUNT: "max_occurrence_count_for_species", - cls.ROW_MAX_LABELS: "datasets_with_max_count_for_species", - cls.ROW_MAX_INDEXES: "dataset_indexes_with_max_count_for_species", + cls.ROW_MAX_TOTAL: "max_occurrences_for_species", + cls.ROW_MAX_TOTAL_LABELS: "datasets_with_max_occurrences_for_species", # ----------------------------- # All species # ------------ # COMPARES TO: cls.ROW_TOTAL: "total_occurrences_for_species", # Values: Total of all occurrences for all species - stats - cls.ROWS_TOTAL: "total_occurrences_for_all_species", - cls.ROWS_MIN: "min_occurrences_for_all_species", - cls.ROWS_MAX: "max_occurrences_for_all_species", - cls.ROWS_MEAN: "mean_occurrences_for_all_species", - cls.ROWS_MEDIAN: "median_occurrences_for_all_species", + cls.ROWS_TOTAL: "total_occurrences_of_all_species", + cls.ROWS_MIN_TOTAL: "min_occurrences_of_all_species", + cls.ROWS_MIN_TOTAL_NUMBER: "number_of_species_with_max_occurrences_of_all", + cls.ROWS_MEAN_TOTAL: "mean_occurrences_of_all_species", + cls.ROWS_MEDIAN_TOTAL: "median_occurrences_of_all_species", + cls.ROWS_MAX_TOTAL: "max_occurrences_of_all_species", + cls.ROWS_MAX_TOTAL_LABELS: "species_with_max_occurrences_of_all", # ------------ # COMPARES TO: cls.ROW_COUNT: "total_datasets_for_species", # Counts: Count of all datasets (from all rows/species) - cls.ROWS_COUNT: "total_dataset_count", + cls.ROWS_COUNT: "total_species_count", # Dataset counts for all species - stats - cls.ROWS_COUNT_MIN: "min_dataset_count_for_all_species", - cls.ROWS_COUNT_MAX: "max_dataset_count_for_all_species", - cls.ROWS_COUNT_MEAN: "mean_dataset_count_for_all_species", - cls.ROWS_COUNT_MEDIAN: "median_dataset_count_for_all_species", + cls.ROWS_MIN_COUNT: "min_dataset_count_of_all_species", + cls.ROWS_MIN_COUNT_NUMBER: "species_with_min_dataset_count_of_all", + cls.ROWS_MEAN_COUNT: "mean_dataset_count_of_all_species", + cls.ROWS_MEDIAN_COUNT: "median_dataset_count_of_all_species", + cls.ROWS_MAX_COUNT: "max_dataset_count_of_all_species", + cls.ROWS_MAX_COUNT_LABELS: "species_with_max_dataset_count_of_all", } elif table_type == SUMMARY_TABLE_TYPES.DATASET_SPECIES_SUMMARY: keys = { @@ -381,81 +391,39 @@ def get_keys_for_table(cls, table_type): # ----------------------------- cls.TYPE: "dataset", # One dataset - cls.ONE_IDX: "dataset_index", cls.ONE_LABEL: "dataset_label", # Count 
         elif table_type == SUMMARY_TABLE_TYPES.DATASET_SPECIES_SUMMARY:
             keys = {
                 # ----------------------------------------------------------------------
                 # Column
                 # -----------------------------
                 cls.TYPE: "dataset",
                 # One dataset
-                cls.ONE_IDX: "dataset_index",
                 cls.ONE_LABEL: "dataset_label",
                 # Count (non-zero elements in column)
                 cls.ONE_COUNT: "total_species_for_dataset",
                 # Values (total of values in column)
                 cls.ONE_TOTAL: "total_occurrences_for_dataset",
-                # Values: Minimum occurrence count for one dataset, species labels, indexes
-                cls.ONE_MIN_COUNT: "min_occurrence_count_for_dataset",
-                cls.ONE_MIN_LABELS: "species_with_min_occurrence_count_for_dataset",
-                cls.ONE_MIN_INDEXES: "species_indexes_with_min_occurrence_count_for_dataset",
+                # Values: Minimum occurrence count for one dataset
+                cls.ONE_MIN_COUNT: "min_occurrences_for_dataset",
-                # Values: Maximum occurrence count for one dataset, species labels, indexes
-                cls.ONE_MAX_COUNT: "max_occurrence_count_for_dataset",
-                cls.ONE_MAX_LABELS: "species_with_max_occurrence_count_for_dataset",
-                cls.ONE_MAX_INDEXES: "species_indexes_with_max_occurrence_count_for_dataset",
+                # Values: Maximum occurrence count for one dataset, labels
+                cls.ONE_MAX_COUNT: "max_occurrences_for_dataset",
+                cls.ONE_MAX_COUNT_LABELS: "datasets_with_max_occurrences",
                 # -----------------------------
                 # All datasets
                 # ------------
                 # COMPARES TO: cls.ONE_TOTAL: "total_occurrences_for_dataset",
                 # Values: Total of all occurrences for all datasets - stats
-                cls.ALL_TOTAL: "total_occurrences_for_all_datasets",
-                cls.ALL_MIN: "min_occurrences_for_all_datasets",
-                cls.ALL_MAX: "max_occurrences_for_all_datasets",
-                cls.ALL_MEAN: "mean_occurrences_for_all_datasets",
-                cls.ALL_MEDIAN: "median_occurrences_for_all_datasets",
+                cls.ALL_TOTAL: "total_occurrences_of_all_datasets",
+                cls.ALL_MIN_TOTAL: "min_occurrences_of_all_datasets",
+                cls.ALL_MEAN_TOTAL: "mean_occurrences_of_all_datasets",
+                cls.ALL_MEDIAN_TOTAL: "median_occurrences_of_all_datasets",
+                cls.ALL_MAX_TOTAL: "max_occurrences_of_all_datasets",
                 # ------------
                 # COMPARES TO: cls.ONE_COUNT: "total_species_for_dataset",
                 # Counts: Count of all species (from all columns/datasets)
                 cls.ALL_COUNT: "total_species_count",
                 # Species counts for all datasets - stats
-                cls.ALL_COUNT_MIN: "min_species_count_for_all_datasets",
-                cls.ALL_COUNT_MAX: "max_species_count_for_all_datasets",
-                cls.ALL_COUNT_MEAN: "mean_species_count_for_all_datasets",
-                cls.ALL_COUNT_MEDIAN: "median_species_count_for_all_datasets",
-            }
-        elif table_type == SUMMARY_TABLE_TYPES.SPECIES_DATASET_SUMMARY:
-            keys = {
-                # ----------------------------------------------------------------------
-                # Row
-                # -----------------------------
-                cls.TYPE: "species",
-                # One species
-                cls.ONE_IDX: "species_index",
-                cls.ONE_LABEL: "species_label",
-                # Count (non-zero elements in row)
-                cls.ONE_COUNT: "total_datasets_for_species",
-                # Values (total of values in row)
-                cls.ONE_TOTAL: "total_occurrences_for_species",
-                # Values: Minimum occurrence count for one species, dataset labels, indexes
-                cls.ONE_MIN_COUNT: "min_occurrence_count_for_species",
-                cls.ONE_MIN_LABELS: "datasets_with_min_count_for_species",
-                cls.ONE_MIN_INDEXES: "dataset_indexes_with_min_count_for_species",
-                # Values: Maximum occurrence count for one species, dataset labels, indexes
-                cls.ONE_MAX_COUNT: "max_occurrence_count_for_species",
-                cls.ONE_MAX_LABELS: "datasets_with_max_count_for_species",
-                cls.ONE_MAX_INDEXES: "dataset_indexes_with_max_count_for_species",
-                # -----------------------------
-                # All species
-                # ------------
-                # COMPARES TO: cls.ONE_TOTAL: "total_occurrences_for_species",
-                # Values: Total of all occurrences for all species - stats
-                cls.ALL_TOTAL: "total_occurrences_for_all_species",
-                cls.ALL_MIN: "min_occurrences_for_all_species",
-                cls.ALL_MAX: "max_occurrences_for_all_species",
-                cls.ALL_MEAN: "mean_occurrences_for_all_species",
-                cls.ALL_MEDIAN: "median_occurrences_for_all_species",
-                # ------------
-                # COMPARES TO: cls.ONE_COUNT: "total_datasets_for_species",
-                # Counts: Count of all datasets (from all rows/species)
-                cls.ALL_COUNT: "total_dataset_count",
-                # Dataset counts for all species - stats
-                cls.ALL_COUNT_MIN: "min_dataset_count_for_all_species",
-                cls.ALL_COUNT_MAX: "max_dataset_count_for_all_species",
-                cls.ALL_COUNT_MEAN: "mean_dataset_count_for_all_species",
-                cls.ALL_COUNT_MEDIAN: "median_dataset_count_for_all_species",
+                cls.ALL_MIN_COUNT: "min_species_count_of_all_datasets",
+                cls.ALL_MEAN_COUNT: "mean_species_count_of_all_datasets",
+                cls.ALL_MEDIAN_COUNT: "median_species_count_of_all_datasets",
+                cls.ALL_MAX_COUNT: "max_species_count_of_all_datasets",
             }
+        # elif table_type == SUMMARY_TABLE_TYPES.SPECIES_DATASET_SUMMARY:
+        #     keys = {
+        #     }
         else:
             raise Exception(f"Keys not defined for table {table_type}")
         return keys
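For reference, `get_keys_for_table` is the lookup that maps these enum members to the public field names used in service responses. A hedged usage sketch, assuming `SUMMARY_TABLE_TYPES` is importable from the same `constants` module the docstrings elsewhere in this patch reference:

```python
from sppy.tools.s2n.constants import SNKeys, SUMMARY_TABLE_TYPES

keys = SNKeys.get_keys_for_table(SUMMARY_TABLE_TYPES.SPECIES_DATASET_MATRIX)
# Resolve an internal enum key to its output field name
print(keys[SNKeys.COL_MAX_TOTAL_LABELS])
# -> "species_with_max_occurrences_for_dataset"
```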
diff --git a/sppy/tools/s2n/sparse_matrix.py b/sppy/tools/s2n/sparse_matrix.py
index d4b6731..1b9fde8 100644
--- a/sppy/tools/s2n/sparse_matrix.py
+++ b/sppy/tools/s2n/sparse_matrix.py
@@ -6,8 +6,10 @@
 import random
 import scipy.sparse
 
+from sppy.aws.aws_constants import PROJ_BUCKET, DATASET_GBIF_KEY
 from sppy.tools.s2n.aggregate_data_matrix import _AggregateDataMatrix
 from sppy.tools.s2n.constants import (SNKeys, Summaries)
+from sppy.tools.s2n.spnet import SpNetAnalyses
 
 
 # .............................................................................
@@ -24,7 +26,8 @@ def __init__(
             sparse_coo_array (scipy.sparse.coo_array): A 2d sparse array with count
                 values for one aggregator0 (i.e. species) rows (axis 0) by another
                 aggregator1 (i.e. dataset) columns (axis 1) to use for computations.
-            table_type (aws_constants.SUMMARY_TABLE_TYPES): type of aggregated data
+            table_type (sppy.tools.s2n.constants.SUMMARY_TABLE_TYPES): type of
+                aggregated data
             data_datestr (str): date of the source data in YYYY_MM_DD format.
             row_category (pandas.api.types.CategoricalDtype): ordered row labels
                 used to identify axis 0/rows.
@@ -59,8 +62,8 @@ def init_from_stacked_data(
                 (axis 0)
             val_fld: : column in the input dataframe containing values to be used as
                 values for the intersection of x and y fields
-            table_type (aws_constants.SUMMARY_TABLE_TYPES): table type of sparse matrix
-                aggregated data
+            table_type (sppy.tools.s2n.constants.SUMMARY_TABLE_TYPES): table type of
+                sparse matrix aggregated data
             data_datestr (str): date of the source data in YYYY_MM_DD format.
             logger (object): logger for saving relevant processing messages
@@ -322,7 +325,7 @@ def get_row_labels_for_data_in_column(self, col, value=None):
 
     # ...............................................
     def get_extreme_val_labels_for_vector(self, vector, axis=0, is_max=True):
-        """Get the minimum or maximum NON-ZERO value and row label(s) for a column.
+        """Get the minimum or maximum NON-ZERO value and axis label(s) for a vector.
 
         Args:
             vector (numpy.array): 1 dimensional array for a row or column.
@@ -344,8 +347,31 @@ def get_extreme_val_labels_for_vector(self, vector, axis=0, is_max=True):
             target = vals.min()
         target = self.convert_np_vals_for_json(target)
 
+        # Get labels for this value in the vector
+        labels = self.get_labels_for_val_in_vector(vector, target, axis=axis)
+        return target, labels
+
+    # ...............................................
+    def get_labels_for_val_in_vector(self, vector, target_val, axis=0):
+        """Get the row or column label(s) for a vector containing target_val.
+
+        Args:
+            vector (numpy.array): 1 dimensional array for a row or column.
+            target_val (int): value to search for in a row or column
+            axis (int): row (0) or column (1) header for extreme value and labels.
+
+        Returns:
+            labels: The labels of the rows or columns containing the target value.
+
+        Raises:
+            Exception: on axis not in (0, 1)
+        """
+        # Returns row_idxs, col_idxs, vals of NNZ values in row
+        row_idxs, col_idxs, vals = scipy.sparse.find(vector)
         # Get indexes of target value within NNZ vals
-        tmp_idxs = np.where(vals == target)[0]
+        tmp_idxs = np.where(vals == target_val)[0]
         tmp_idx_lst = [tmp_idxs[i] for i in range(len(tmp_idxs))]
         # Get actual indexes (within all zero/non-zero elements) of target in vector
         if axis == 0:
@@ -363,7 +389,31 @@ def get_extreme_val_labels_for_vector(self, vector, axis=0, is_max=True):
         # Convert from indexes to labels
         labels = [
             self._get_category_from_code(idx, axis=label_axis) for idx in idxs_lst]
-        return target, labels
+        return labels
+
+    # ...............................................
+    def count_val_in_vector(self, vector, target_val):
+        """Count the rows or columns containing target_val in a vector.
+
+        Args:
+            vector (numpy.array): 1 dimensional array for a row or column.
+            target_val (int): value to search for in a row or column
+
+        Returns:
+            count (int): the number of elements in the vector equal to target_val.
+        """
+        # Returns row_idxs, col_idxs, vals of NNZ values in row
+        row_idxs, col_idxs, vals = scipy.sparse.find(vector)
+        # Count occurrences of target value within NNZ vals
+        tmp_idxs = np.where(vals == target_val)[0]
+        count = len(tmp_idxs)
+        return count
 
     # ...............................................
     def get_row_stats(self, row_label=None):
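Both new helpers share one core technique: `scipy.sparse.find` returns the coordinates and values of only the non-zero entries, and `np.where` then locates the target value among them, so the vector is never densified. A self-contained sketch with toy data:

```python
import numpy as np
import scipy.sparse

# Toy row vector: occurrence counts for one species across 5 datasets
row = scipy.sparse.coo_array(np.array([[0, 3, 0, 7, 3]]))
labels = ["ds_a", "ds_b", "ds_c", "ds_d", "ds_e"]

# find() returns (row_idxs, col_idxs, vals) for the non-zero entries only
_, col_idxs, vals = scipy.sparse.find(row)
# Positions (within the non-zero values) holding the target value
tmp_idxs = np.where(vals == 3)[0]
# Map back to column indexes, then to labels
print([labels[col_idxs[i]] for i in tmp_idxs])  # ['ds_b', 'ds_e']
```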
@@ -409,24 +459,28 @@ def get_one_row_stats(self, row_label):
             row, row_idx = self.get_vector_from_label(row_label, axis=0)
         except IndexError:
             raise
-        # Largest Occurrence count for this Species, and datasets that contain it
-        maxval, max_row_labels = self.get_extreme_val_labels_for_vector(
+        # Largest/smallest Occurrence count for this Species, and column (dataset)
+        # labels that contain it
+        maxval, max_col_labels = self.get_extreme_val_labels_for_vector(
             row, axis=0, is_max=True)
-        minval, min_row_labels = self.get_extreme_val_labels_for_vector(
+        minval, min_col_labels = self.get_extreme_val_labels_for_vector(
             row, axis=0, is_max=False)
+        # Look up dataset names for the column labels with max occurrences of this
+        # species; the number of datasets holding the min count is often too large
+        # to list
+        names = self._lookup_dataset_names(max_col_labels)
+
         stats = {
-            self._keys[SNKeys.ROW_IDX]: row_idx,
             self._keys[SNKeys.ROW_LABEL]: row_label,
             # Total Occurrences for this Species
             self._keys[SNKeys.ROW_TOTAL]: self.convert_np_vals_for_json(row.sum()),
             # Count of Datasets containing this Species
             self._keys[SNKeys.ROW_COUNT]: self.convert_np_vals_for_json(row.nnz),
             # Return min/max count in this species and datasets for that count
-            self._keys[SNKeys.ROW_MIN_COUNT]: minval,
-            # TODO: is there a good way to optionally return many labels
-            self._keys[SNKeys.ROW_MAX_COUNT]: maxval,
-            self._keys[SNKeys.ROW_MAX_LABELS]: max_row_labels,
+            self._keys[SNKeys.ROW_MIN_TOTAL]: minval,
+            self._keys[SNKeys.ROW_MAX_TOTAL]: maxval,
+            self._keys[SNKeys.ROW_MAX_TOTAL_LABELS]: names,
         }
+
         return stats
 
     # ...............................................
@@ -437,34 +491,60 @@ def get_all_row_stats(self):
             all_row_stats (dict): counts and statistics about all rows.
             (numpy.ndarray): array of totals of all rows.
         """
-        # Sum all rows to return a column (axis=1)
+        # Sum all rows to return a column (axis=1) of species totals
         all_totals = self._coo_array.sum(axis=1)
+        # Min total and number of rows that contain it
+        min_total = all_totals.min()
+        min_total_number = self.count_val_in_vector(all_totals, min_total)
+        # Max total and rows that contain it
+        max_total = all_totals.max()
+        # Get species labels for the largest number of occurrences
+        max_total_labels = self.get_labels_for_val_in_vector(
+            all_totals, max_total, axis=1)
+
         # Get number of non-zero entries for every row (column, numpy.ndarray)
         all_counts = self._coo_array.getnnz(axis=1)
+        min_count = all_counts.min()
+        min_count_number = self.count_val_in_vector(all_counts, min_count)
+        max_count = all_counts.max()
+        max_count_labels = self.get_labels_for_val_in_vector(
+            all_counts, max_count, axis=1)
+
-        # Count columns with at least one non-zero entry (all columns)
-        row_count = self._coo_array.shape[1]
+        # Count all rows
+        row_count = self._coo_array.shape[0]
         all_row_stats = {
+            # Count of other axis
             self._keys[SNKeys.ROWS_COUNT]: row_count,
+            self._keys[SNKeys.ROWS_MIN_COUNT]:
+                self.convert_np_vals_for_json(min_count),
+            self._keys[SNKeys.ROWS_MIN_COUNT_NUMBER]: min_count_number,
+
+            self._keys[SNKeys.ROWS_MEAN_COUNT]:
+                self.convert_np_vals_for_json(all_counts.mean()),
+            self._keys[SNKeys.ROWS_MEDIAN_COUNT]:
+                self.convert_np_vals_for_json(np.median(all_counts, axis=0)),
+
+            self._keys[SNKeys.ROWS_MAX_COUNT]:
+                self.convert_np_vals_for_json(max_count),
+            self._keys[SNKeys.ROWS_MAX_COUNT_LABELS]: max_count_labels,
+
+            # Total of values
             self._keys[SNKeys.ROWS_TOTAL]:
                 self.convert_np_vals_for_json(all_totals.sum()),
-            self._keys[SNKeys.ROWS_MIN]:
-                self.convert_np_vals_for_json(all_totals.min()),
-            self._keys[SNKeys.ROWS_MAX]:
-                self.convert_np_vals_for_json(all_totals.max()),
-            self._keys[SNKeys.ROWS_MEAN]:
+            self._keys[SNKeys.ROWS_MIN_TOTAL]:
+                self.convert_np_vals_for_json(min_total),
+            self._keys[SNKeys.ROWS_MIN_TOTAL_NUMBER]: min_total_number,
+
+            self._keys[SNKeys.ROWS_MEAN_TOTAL]:
                 self.convert_np_vals_for_json(all_totals.mean()),
-            self._keys[SNKeys.ROWS_MEDIAN]: self.convert_np_vals_for_json(
+            self._keys[SNKeys.ROWS_MEDIAN_TOTAL]: self.convert_np_vals_for_json(
                 np.median(all_totals, axis=0)[0, 0]),
-            self._keys[SNKeys.ROWS_COUNT_MIN]:
-                self.convert_np_vals_for_json(all_counts.min()),
-            self._keys[SNKeys.ROWS_COUNT_MAX]:
-                self.convert_np_vals_for_json(all_counts.max()),
-            self._keys[SNKeys.ROWS_COUNT_MEAN]:
-                self.convert_np_vals_for_json(all_counts.mean()),
-            self._keys[SNKeys.ROWS_COUNT_MEDIAN]:
-                self.convert_np_vals_for_json(np.median(all_counts, axis=0)),
+            self._keys[SNKeys.ROWS_MAX_TOTAL]:
+                self.convert_np_vals_for_json(max_total),
+            self._keys[SNKeys.ROWS_MAX_TOTAL_LABELS]: max_total_labels,
         }
+
         return all_row_stats
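The aggregate statistics come from two cheap sparse reductions: `sum(axis=1)` for per-row totals and `getnnz(axis=1)` for per-row non-zero counts. A toy illustration; note that `sum` on a `coo_matrix` returns a 2-D `numpy.matrix`, which is presumably why the patch indexes `np.median(...)[0, 0]`, whereas on a `coo_array` (used below) it is a plain 1-D `ndarray`:

```python
import numpy as np
import scipy.sparse

# Toy species (rows) x dataset (columns) occurrence counts
mtx = scipy.sparse.coo_array(np.array([
    [0, 3, 0],
    [2, 0, 8],
    [0, 0, 5],
]))

all_totals = mtx.sum(axis=1)     # total occurrences per species -> [3, 10, 5]
all_counts = mtx.getnnz(axis=1)  # datasets per species (non-zero cells) -> [1, 2, 1]
print(all_totals.min(), all_totals.max(), all_totals.mean(), np.median(all_totals))
print(all_counts.min(), all_counts.max())
```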
""" - # Sum all rows for each column to return a row (numpy.ndarray, axis=0) + # Sum all rows for each column to return a row (numpy.matrix, axis=0) all_totals = self._coo_array.sum(axis=0) + # Min total and columns that contain it + min_total = all_totals.min() + min_total_number = self.count_val_in_vector(all_totals, min_total) + # Max total and columns that contain it + max_total = all_totals.max() + max_total_labels = self.get_labels_for_val_in_vector( + all_totals, max_total, axis=0) + max_total_names = self._lookup_dataset_names(max_total_labels) + # Get number of non-zero rows for every column (row, numpy.ndarray) all_counts = self._coo_array.getnnz(axis=0) + # Min count and columns that contain that + min_count = all_counts.min() + min_count_number = self.count_val_in_vector(all_counts, min_count) + # Max count and columns that contain that + max_count = all_counts.max() + max_count_labels = self.get_labels_for_val_in_vector( + all_counts, max_count, axis=0) + max_count_names = self._lookup_dataset_names(max_count_labels) + # Count rows with at least one non-zero entry (all rows) - col_count = self._coo_array.shape[0] + col_count = self._coo_array.shape[1] all_col_stats = { + # Count of other axis self._keys[SNKeys.COLS_COUNT]: col_count, + self._keys[SNKeys.COLS_MIN_COUNT]: + self.convert_np_vals_for_json(min_count), + self._keys[SNKeys.COLS_MIN_COUNT_NUMBER]: min_count_number, + + self._keys[SNKeys.COLS_MEAN_COUNT]: + self.convert_np_vals_for_json(all_counts.mean()), + self._keys[SNKeys.COLS_MEDIAN_COUNT]: + self.convert_np_vals_for_json(np.median(all_counts, axis=0)), + + self._keys[SNKeys.COLS_MAX_COUNT]: + self.convert_np_vals_for_json(max_count), + self._keys[SNKeys.COLS_MAX_COUNT_LABELS]: max_count_names, + + # Total occurrences self._keys[SNKeys.COLS_TOTAL]: self.convert_np_vals_for_json(all_totals.sum()), - self._keys[SNKeys.COLS_MIN]: - self.convert_np_vals_for_json(all_totals.min()), - self._keys[SNKeys.COLS_MAX]: - self.convert_np_vals_for_json(all_totals.max()), - self._keys[SNKeys.COLS_MEAN]: + self._keys[SNKeys.COLS_MIN_TOTAL]: + self.convert_np_vals_for_json(min_total), + self._keys[SNKeys.COLS_MIN_TOTAL_NUMBER]: min_total_number, + + self._keys[SNKeys.COLS_MEAN_TOTAL]: self.convert_np_vals_for_json(all_totals.mean()), - self._keys[SNKeys.COLS_MEDIAN]: + self._keys[SNKeys.COLS_MEDIAN_TOTAL]: self.convert_np_vals_for_json(np.median(all_totals, axis=1)[0, 0]), - self._keys[SNKeys.COLS_COUNT_MIN]: - self.convert_np_vals_for_json(all_counts.min()), - self._keys[SNKeys.COLS_COUNT_MAX]: - self.convert_np_vals_for_json(all_counts.max()), - self._keys[SNKeys.COLS_COUNT_MEAN]: - self.convert_np_vals_for_json(all_counts.mean()), - self._keys[SNKeys.COLS_COUNT_MEDIAN]: - self.convert_np_vals_for_json(np.median(all_counts, axis=0)), + self._keys[SNKeys.COLS_MAX_TOTAL]: self.convert_np_vals_for_json(max_total), + self._keys[SNKeys.COLS_MAX_TOTAL_LABELS]: max_total_names, } return all_col_stats @@ -748,7 +869,8 @@ def uncompress_zipped_data( sparse_coo (scipy.sparse.coo_array): Sparse Matrix containing data. row_categ (pandas.api.types.CategoricalDtype): row categories col_categ (pandas.api.types.CategoricalDtype): column categories - table_type (aws.aws_constants.SUMMARY_TABLE_TYPES): type of table data + table_type (sppy.tools.s2n.constants.SUMMARY_TABLE_TYPES): type of table + data data_datestr (str): date string in format YYYY_MM_DD Raises: @@ -786,7 +908,8 @@ def read_data(cls, mtx_filename, meta_filename): sparse_coo (scipy.sparse.coo_array): Sparse Matrix containing data. 
@@ -543,33 +637,60 @@ def get_all_column_stats(self):
 
         Returns:
             all_col_stats (dict): counts and statistics about all columns.
         """
-        # Sum all rows for each column to return a row (numpy.ndarray, axis=0)
+        # Sum all rows for each column to return a row (numpy.matrix, axis=0)
         all_totals = self._coo_array.sum(axis=0)
+        # Min total and number of columns that contain it
+        min_total = all_totals.min()
+        min_total_number = self.count_val_in_vector(all_totals, min_total)
+        # Max total and columns that contain it
+        max_total = all_totals.max()
+        max_total_labels = self.get_labels_for_val_in_vector(
+            all_totals, max_total, axis=0)
+        max_total_names = self._lookup_dataset_names(max_total_labels)
+
         # Get number of non-zero rows for every column (row, numpy.ndarray)
         all_counts = self._coo_array.getnnz(axis=0)
+        # Min count and number of columns that contain it
+        min_count = all_counts.min()
+        min_count_number = self.count_val_in_vector(all_counts, min_count)
+        # Max count and columns that contain it
+        max_count = all_counts.max()
+        max_count_labels = self.get_labels_for_val_in_vector(
+            all_counts, max_count, axis=0)
+        max_count_names = self._lookup_dataset_names(max_count_labels)
+
-        # Count rows with at least one non-zero entry (all rows)
-        col_count = self._coo_array.shape[0]
+        # Count all columns
+        col_count = self._coo_array.shape[1]
         all_col_stats = {
+            # Count of other axis
             self._keys[SNKeys.COLS_COUNT]: col_count,
+            self._keys[SNKeys.COLS_MIN_COUNT]:
+                self.convert_np_vals_for_json(min_count),
+            self._keys[SNKeys.COLS_MIN_COUNT_NUMBER]: min_count_number,
+
+            self._keys[SNKeys.COLS_MEAN_COUNT]:
+                self.convert_np_vals_for_json(all_counts.mean()),
+            self._keys[SNKeys.COLS_MEDIAN_COUNT]:
+                self.convert_np_vals_for_json(np.median(all_counts, axis=0)),
+
+            self._keys[SNKeys.COLS_MAX_COUNT]:
+                self.convert_np_vals_for_json(max_count),
+            self._keys[SNKeys.COLS_MAX_COUNT_LABELS]: max_count_names,
+
+            # Total occurrences
             self._keys[SNKeys.COLS_TOTAL]:
                 self.convert_np_vals_for_json(all_totals.sum()),
-            self._keys[SNKeys.COLS_MIN]:
-                self.convert_np_vals_for_json(all_totals.min()),
-            self._keys[SNKeys.COLS_MAX]:
-                self.convert_np_vals_for_json(all_totals.max()),
-            self._keys[SNKeys.COLS_MEAN]:
+            self._keys[SNKeys.COLS_MIN_TOTAL]:
+                self.convert_np_vals_for_json(min_total),
+            self._keys[SNKeys.COLS_MIN_TOTAL_NUMBER]: min_total_number,
+
+            self._keys[SNKeys.COLS_MEAN_TOTAL]:
                 self.convert_np_vals_for_json(all_totals.mean()),
-            self._keys[SNKeys.COLS_MEDIAN]:
+            self._keys[SNKeys.COLS_MEDIAN_TOTAL]:
                 self.convert_np_vals_for_json(np.median(all_totals, axis=1)[0, 0]),
-            self._keys[SNKeys.COLS_COUNT_MIN]:
-                self.convert_np_vals_for_json(all_counts.min()),
-            self._keys[SNKeys.COLS_COUNT_MAX]:
-                self.convert_np_vals_for_json(all_counts.max()),
-            self._keys[SNKeys.COLS_COUNT_MEAN]:
-                self.convert_np_vals_for_json(all_counts.mean()),
-            self._keys[SNKeys.COLS_COUNT_MEDIAN]:
-                self.convert_np_vals_for_json(np.median(all_counts, axis=0)),
+            self._keys[SNKeys.COLS_MAX_TOTAL]:
+                self.convert_np_vals_for_json(max_total),
+            self._keys[SNKeys.COLS_MAX_TOTAL_LABELS]: max_total_names,
         }
         return all_col_stats
@@ -748,7 +869,8 @@ def uncompress_zipped_data(
             sparse_coo (scipy.sparse.coo_array): Sparse Matrix containing data.
             row_categ (pandas.api.types.CategoricalDtype): row categories
             col_categ (pandas.api.types.CategoricalDtype): column categories
-            table_type (aws.aws_constants.SUMMARY_TABLE_TYPES): type of table data
+            table_type (sppy.tools.s2n.constants.SUMMARY_TABLE_TYPES): type of table
+                data
             data_datestr (str): date string in format YYYY_MM_DD
 
         Raises:
@@ -786,7 +908,8 @@ def read_data(cls, mtx_filename, meta_filename):
             sparse_coo (scipy.sparse.coo_array): Sparse Matrix containing data.
             row_categ (pandas.api.types.CategoricalDtype): row categories
             col_categ (pandas.api.types.CategoricalDtype): column categories
-            table_type (aws.aws_constants.SUMMARY_TABLE_TYPES): type of table data
+            table_type (sppy.tools.s2n.constants.SUMMARY_TABLE_TYPES): type of table
+                data
             data_datestr (str): date string in format YYYY_MM_DD
 
         Raises:
diff --git a/sppy/tools/s2n/spnet.py b/sppy/tools/s2n/spnet.py
index 6916861..1fa4895 100644
--- a/sppy/tools/s2n/spnet.py
+++ b/sppy/tools/s2n/spnet.py
@@ -204,58 +204,68 @@ def get_simple_dataset_counts(self, dataset_key, format="JSON"):
         query_str = (
             f"SELECT * FROM s3object s WHERE s.{table['key_fld']} = '{dataset_key}'"
         )
-        # Returns empty list or list of 1 record
+        # Returns empty list or list of 1 record dict
         records = self._query_summary_table(table, query_str, format)
         if self._dataset_metadata_exists():
-            self._add_dataset_lookup_vals(records, table, format)
+            self._add_dataset_names(records, table, format)
         return records
 
+    # ...............................................
+    def lookup_dataset_names(self, labels):
+        """Return a dict of dataset titles (or None if not found), keyed by label."""
+        if not (isinstance(labels, list) or isinstance(labels, tuple)):
+            labels = [labels]
+        # Initialize every title to None
+        names = {lbl: None for lbl in labels}
+        try:
+            ds_meta = self.get_dataset_metadata(labels)
+        except Exception:
+            pass
+        else:
+            for rec in ds_meta:
+                # label is a dataset_key
+                names[rec["dataset_key"]] = rec["title"]
+        return names
+
     # ----------------------------------------------------
-    def _add_dataset_lookup_vals(self, records, rec_table, format):
+    def _add_dataset_names(self, records, rec_table, format):
         """Add dataset metadata to records.
 
         Args:
-            records: records to add dataset metadata to.
+            records: list of records (dicts for JSON format or lists for CSV format)
+                to add dataset names to.
             rec_table: dictionary of fieldnames, filename, format for a summary table
             format: output format, options "CSV" or "JSON"
         """
         # Metadata table info
         meta_table = self._summary_tables["dataset_meta"]
         meta_key_fld = meta_table["key_fld"]
-        # Copy the list so we can remove an element before query
-        meta_fields_cpy = meta_table["fields"].copy()
-        meta_key_idx = meta_fields_cpy.index(meta_key_fld)
-        meta_fields_cpy.pop(meta_key_idx)
-        qry_flds = ", ".join(meta_fields_cpy)
         # Record info
         rec_fields = rec_table["fields"]
-        rec_key_fld = rec_table["key_fld"]
-        rec_key_idx = rec_fields.index(rec_key_fld)
+        rec_key_idx = rec_table["key_fld"]
+        if format == "CSV":
+            rec_key_idx = rec_fields.index(rec_key_idx)
+        dataset_keys = [rec[rec_key_idx] for rec in records]
+
+        # Returns a list of dictionaries
+        meta_ds = self.get_dataset_metadata(dataset_keys)
+
+        # Update each record
         for rec in records:
-            # Get datasetkey by field or position
+            # Initialize to empty string
+            dstitle = ""
+            # Iterate over metadata recs until we find the dict with the same key
+            for dsm in meta_ds:
+                if dsm[meta_key_fld] == rec[rec_key_idx]:
+                    dstitle = dsm["title"]
+                    break
+            # Update the record
             if format == "JSON":
-                val = rec[rec_key_fld]
-            else:
-                val = rec[rec_key_idx]
-
-            query_str = (
-                f"SELECT {qry_flds} FROM s3object s WHERE s.{meta_key_fld} = '{val}'"
-            )
-            # Returns empty list or list of 1 record
-            meta_recs = self._query_summary_table(meta_table, query_str, format)
-            try:
-                meta = meta_recs[0]
-            except IndexError:
-                # Add placeholders for empty values, no entries for dictionary
-                if format == "CSV":
-                    rec.extend(["" for _ in qry_flds])
+                rec["dataset_title"] = dstitle
             else:
-                if format == "JSON":
-                    rec.update(meta)
-                else:
-                    rec.extend(meta)
+                rec.append(dstitle)
         print(records)
 
     # ----------------------------------------------------
@@ -289,90 +299,35 @@ def rank_dataset_counts(self, rank_by, order, limit, format="JSON"):
             errors = {"error": [get_traceback()]}
         # Add dataset title, etc if the lookup table exists in S3
         if self._dataset_metadata_exists():
-            self._add_dataset_lookup_vals(records, table, format)
+            self._add_dataset_names(records, table, format)
         return records, errors
 
-    # # ----------------------------------------------------
-    # def rank_species_counts(self, rank_by, order, limit, format="JSON"):
-    #     """Return the top or bottom species ranked by number of occurrences or datasets.
-    #
-    #     Args:
-    #         rank_by: string indicating rank datasets by counts of "occurrence" or
-    #             another data dimension (currently only "species").
-    #         order: string indicating whether to rank in "descending" or
-    #             "ascending" order.
-    #         limit: number of datasets to return, no more than 300.
-    #         format: output format, options "CSV" or "JSON"
-    #
-    #     Returns:
-    #         records: list of limit records containing species_key, occ_count,
-    #             dataset_count.
-    #     """
-    #     records = []
-    #     errors = {}
-    #     table = self._summary_tables['species_counts']
-    #
-    #     if rank_by == "occurrence":
-    #         sort_field = "occ_count"
-    #     else:
-    #         sort_field = "dataset_count"
-    #     try:
-    #         records = self._query_order_summary_table(
-    #             table, sort_field, order, limit, format)
-    #     except Exception:
-    #         errors = {"error": [get_traceback()]}
-    #     # Add dataset title, etc if the lookup table exists in S3
-    #     if self._dataset_metadata_exists():
-    #         self._add_dataset_lookup_vals(records, table, format)
-    #
-    #     return records, errors
-
-    # # ----------------------------------------------------
-    # def get_org_counts(self, pub_org_key):
-    #     """Query S3 for occurrence and species counts for this organization.
-    #
-    #     Args:
-    #         pub_org_key: unique GBIF identifier for organization of interest.
-    #
-    #     Returns:
-    #         records: empty list or list of 1 record containing occ_count, species_count
-    #
-    #     TODO: implement this?
-    #     """
-    #     (occ_count, species_count) = (0,0)
-    #     return (occ_count, species_count)
 
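The `_add_dataset_names` rewrite replaces one S3 Select round trip per record with a single batched metadata query followed by an in-memory join. A sketch of that join with toy data; building a dict keyed by `dataset_key` (as below) would even avoid the inner scan over `meta_ds`:

```python
# Toy metadata, shaped like the get_dataset_metadata result in this patch
meta_ds = [
    {"dataset_key": "abc", "title": "Dataset ABC"},
    {"dataset_key": "xyz", "title": "Dataset XYZ"},
]
# One-time lookup table instead of scanning meta_ds for every record
title_by_key = {m["dataset_key"]: m["title"] for m in meta_ds}

records = [{"dataset_key": "xyz", "occ_count": 10}]
for rec in records:  # JSON-format records are dicts
    rec["dataset_title"] = title_by_key.get(rec["dataset_key"], "")
print(records)
# [{'dataset_key': 'xyz', 'occ_count': 10, 'dataset_title': 'Dataset XYZ'}]
```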
""" # Metadata table info meta_table = self._summary_tables["dataset_meta"] meta_key_fld = meta_table["key_fld"] - # Copy the list so we can remove an element before query - meta_fields_cpy = meta_table["fields"].copy() - meta_key_idx = meta_fields_cpy.index(meta_key_fld) - meta_fields_cpy.pop(meta_key_idx) - qry_flds = ", ".join(meta_fields_cpy) - - for dataset_key in dataset_keys: - query_str = ( - f"SELECT {qry_flds} FROM s3object s WHERE s.{meta_key_fld} = '{dataset_key}'" - ) - # Returns empty list or list of 1 record - meta_recs = self._query_summary_table(meta_table, query_str, format) - try: - meta = meta_recs[0] - except IndexError: - meta = {} - return meta + + if not(isinstance(dataset_keys, list) or isinstance(dataset_keys, tuple)): + dataset_keys = [dataset_keys] + innerstr = "','".join(dataset_keys) + in_cond = f"['{innerstr}']" + query_str = ( + f"SELECT * FROM s3object s WHERE s.{meta_key_fld} IN {in_cond}" + ) + format = "JSON" + # Returns list of 0 or more records + meta_recs = self._query_summary_table(meta_table, query_str, format) + return meta_recs # # ---------------------------------------------------- # def rank_species_counts(self, order, limit, format="JSON"): @@ -398,7 +353,7 @@ def get_dataset_lookup_vals(self, dataset_keys): # errors = {"error": [get_traceback()]} # # if self._dataset_metadata_exists(): - # self._add_dataset_lookup_vals(records, format) + # self._add_dataset_names(records, format) # return records, errors