merged with develop

OHDSI · Dec 6, 2023 · 1170c83 · 1170c83
2 parents 5836121 + 6207324
commit 1170c83
Show file tree

Hide file tree

Showing 14 changed files with 628 additions and 434 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: DeepPatientLevelPrediction
 Type: Package
 Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
-Version: 2.0.1.9999
+Version: 2.0.2.9999
 Date: 18-04-2023
 Authors@R: c(
     person("Egill", "Fridgeirsson", email = "[email protected]", role = c("aut", "cre")),
@@ -35,7 +35,8 @@ Suggests:
     testthat,
     PRROC,
     ResultModelManager (>= 0.2.0),
-    DatabaseConnector (>= 6.0.0)
+    DatabaseConnector (>= 6.0.0),
+    Andromeda
 Remotes:
     ohdsi/PatientLevelPrediction,
     ohdsi/FeatureExtraction,

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,13 @@
+DeepPatientLevelPrediction 2.0.2
+======================
+  - Ensure output from predict_proba is numeric instead of 1d array
+  - Refactoring: Move cross-validation to a separate function
+  - Refactoring: Move paramsToTune to a separate function 
+  - linting: Enforcing HADES style
+  - Calculate AUC ourselves with torch, get rid of scikit-learn dependancy
+  - added Andromeda to dev dependencies
+
+
 DeepPatientLevelPrediction 2.0.1
 ======================
   - Connection parameter fixed to be in line with newest polars

diff --git a/R/Estimator.R b/R/Estimator.R
@@ -283,7 +283,7 @@ predictDeepEstimator <- function(plpModel,
   } else {
     prediction$value <- plpModel$model$predict_proba(data)
   }
-
+  prediction$value <- as.numeric(prediction$value)
   attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType")
 
   return(prediction)
@@ -344,9 +344,9 @@ gridCvDeep <- function(mappedData,
                               fitParams,
                               paramSearch[[gridId]])
       currentEstimatorSettings$modelType <- modelSettings$modelType
-      currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]]
+      currentModelParams$catFeatures <- dataset$get_cat_features()$max()
       currentModelParams$numFeatures <-
-        dataset$get_numerical_features()$shape[[1]]
+        dataset$get_numerical_features()$max()
       if (findLR) {
         lrFinder <- createLRFinder(modelType = modelSettings$modelType,
           modelParameters = currentModelParams,
@@ -417,8 +417,8 @@ gridCvDeep <- function(mappedData,
     dir.create(file.path(modelLocation), recursive = TRUE)
   }
 
-  modelParams$catFeatures <- dataset$get_cat_features()$shape[[1]]
-  modelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]]
+  modelParams$catFeatures <- dataset$get_cat_features()$max()
+  modelParams$numFeatures <- dataset$get_numerical_features()$max()
 
 
   estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings,

diff --git a/R/LRFinder.R b/R/LRFinder.R
@@ -23,14 +23,17 @@ createLRFinder <- function(modelType,
   lrFinderClass <-
     reticulate::import_from_path("LrFinder", path = path)$LrFinder
 
-
-
   estimatorSettings <- evalEstimatorSettings(estimatorSettings)
 
+  model <- reticulate::import_from_path(modelType, path = path)[[modelType]]
+  modelParameters <- camelCaseToSnakeCaseNames(modelParameters)
+  estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings)
+  estimatorSettings <- evalEstimatorSettings(estimatorSettings)
   browser()
   estimator <- createEstimator(modelType = estimatorSettings$modelType,
                                modelParameters = modelParameters,
                                estimatorSettings = estimatorSettings)
+
   if (!is.null(lrSettings)) {
     lrSettings <- camelCaseToSnakeCaseNames(lrSettings)
   }

diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R
@@ -72,7 +72,7 @@ trainingCache <- R6::R6Class(
       return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions,
                                function(x) !is.null(x$gridPerformance)))))
     },
-
+    
     #' @description
     #' Gets the last index from the cached grid search
     #' @returns Last grid search index

diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py
@@ -45,7 +45,7 @@ def __init__(self, data, labels=None, numerical_features=None):
         data_cat = (
             data.filter(~pl.col("columnId").is_in(self.numerical_features))
             .select(pl.col("rowId"), pl.col("columnId"))
-            .sort("rowId")
+            .sort(["rowId", "columnId"])
             .with_columns(pl.col("rowId") - 1)
             .collect()
         )
@@ -69,9 +69,13 @@ def __init__(self, data, labels=None, numerical_features=None):
         if pl.count(self.numerical_features) == 0:
             self.num = None
         else:
+            map_numerical = dict(zip(self.numerical_features.sort().to_list(),
+                                     list(range(len(self.numerical_features)))))
+
             numerical_data = (
                 data.filter(pl.col("columnId").is_in(self.numerical_features))
-                .sort(by="columnId")
+                .with_columns(pl.col("columnId").replace(map_numerical),
+                              pl.col("rowId") - 1)
                 .select(
                     pl.col("rowId"),
                     pl.col("columnId"),

diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
@@ -7,13 +7,13 @@ connectionDetails <- Eunomia::getEunomiaConnectionDetails()
 Eunomia::createCohorts(connectionDetails)
 
 covSet <- FeatureExtraction::createCovariateSettings(
-  useDemographicsGender = T,
-  useDemographicsAge = T,
-  useDemographicsRace = T,
-  useDemographicsEthnicity = T,
-  useDemographicsAgeGroup = T,
-  useConditionGroupEraLongTerm = T,
-  useDrugEraStartLongTerm = T,
+  useDemographicsGender = TRUE,
+  useDemographicsAge = TRUE,
+  useDemographicsRace = TRUE,
+  useDemographicsEthnicity = TRUE,
+  useDemographicsAgeGroup = TRUE,
+  useConditionGroupEraLongTerm = TRUE,
+  useDrugEraStartLongTerm = TRUE,
   endDays = -1
 )
 
@@ -30,20 +30,36 @@ databaseDetails <- PatientLevelPrediction::createDatabaseDetails(
   cdmDatabaseName = "eunomia"
 )
 
-restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings(
-  firstExposureOnly = T,
-  washoutPeriod = 365
-)
+restrictPlpDataSettings <-
+  PatientLevelPrediction::createRestrictPlpDataSettings(
+    firstExposureOnly = TRUE,
+    washoutPeriod = 365
+  )
 
 plpData <- PatientLevelPrediction::getPlpData(
   databaseDetails = databaseDetails,
   restrictPlpDataSettings = restrictPlpDataSettings,
   covariateSettings = covSet
 )
 
+# add age squared so I have more than one numerical feature
+plpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>%
+  dplyr::rows_append(data.frame(
+                                covariateId = 2002,
+                                covariateName = "Squared age",
+                                analysisId = 2,
+                                conceptId = 0), copy = TRUE)
+
+squaredAges <- plpData$covariateData$covariates %>%
+  dplyr::filter(covariateId == 1002) %>%
+  dplyr::mutate(covariateId = 2002,
+                covariateValue = .data$covariateValue**2)
+
+plpData$covariateData$covariates <- plpData$covariateData$covariates %>%
+  dplyr::rows_append(squaredAges)
 
 populationSet <- PatientLevelPrediction::createStudyPopulationSettings(
-  requireTimeAtRisk = F,
+  requireTimeAtRisk = FALSE,
   riskWindowStart = 1,
   riskWindowEnd = 365
 )
@@ -66,33 +82,32 @@ mappedData <- PatientLevelPrediction::MapIds(
 )
 
 path <- system.file("python", package = "DeepPatientLevelPrediction")
-Dataset <- reticulate::import_from_path("Dataset", path = path)
+datasetClass <- reticulate::import_from_path("Dataset", path = path)
 if (is.null(attributes(mappedData)$path)) {
   # sqlite object
   attributes(mappedData)$path <- attributes(mappedData)$dbname
 }
 
-dataset <- Dataset$Data(
+dataset <- datasetClass$Data(
   data = reticulate::r_to_py(normalizePath(attributes(mappedData)$path)),
   labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount),
 )
-small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3)))
+smallDataset <- torch$utils$data$Subset(dataset,
+                                        (1:round(length(dataset) / 3)))
 
 modelSettings <- setResNet(
   numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
   residualDropout = c(0, 0.2), hiddenDropout = 0,
   sizeEmbedding = 16, hyperParamSearch = "random",
   randomSample = 2,
-  setEstimator(epochs=1,
+  setEstimator(epochs = 1,
                learningRate = 3e-4)
 )
-fitEstimatorPath <- file.path(testLoc, 'fitEstimator')
+fitEstimatorPath <- file.path(testLoc, "fitEstimator")
 if (!dir.exists(fitEstimatorPath)) {
   dir.create(fitEstimatorPath)
 }
-fitEstimatorResults <- fitEstimator(trainData$Train, 
-                                    modelSettings = modelSettings, 
-                                    analysisId = 1, 
-                                    analysisPath = fitEstimatorPath)
-
-
+fitEstimatorResults <- fitEstimator(trainData$Train,
+                                    modelSettings = modelSettings,
+                                    analysisId = 1,
+                                    analysisPath = fitEstimatorPath)
diff --git a/tests/testthat/test-Dataset.R b/tests/testthat/test-Dataset.R
@@ -1,7 +1,9 @@
 test_that("number of num and cat features sum correctly", {
   testthat::expect_equal(
-    length(dataset$get_numerical_features()) + length(dataset$get_cat_features()),
-    dplyr::n_distinct(mappedData$covariates %>% dplyr::collect() %>%
+    length(dataset$get_numerical_features()) +
+      length(dataset$get_cat_features()),
+    dplyr::n_distinct(mappedData$covariates %>%
+                        dplyr::collect() %>%
                         dplyr::pull(covariateId))
   )
 })
@@ -12,46 +14,112 @@ test_that("length of dataset correct", {
   expect_equal(length(dataset), dataset$num$shape[0])
   expect_equal(
     length(dataset),
-    dplyr::n_distinct(mappedData$covariates %>% 
-                        dplyr::collect() %>% dplyr::pull(rowId))
+    dplyr::n_distinct(mappedData$covariates %>%
+                        dplyr::collect() %>%
+                        dplyr::pull(.data$rowId))
   )
 })
 
 test_that(".getbatch works", {
-  batch_size <- 16
+  batchSize <- 16
   # get one sample
   out <- dataset[10]
-
-  # output should be a list of two items, the batch in pos 1 and targets in pos 2,
+
+  # output should be a list of two items,
+  # the batch in pos 1 and targets in pos 2,
   # the batch is what goes to the model
   expect_equal(length(out), 2)
-  
+
   # targets should be binary
   expect_true(out[[2]]$item() %in% c(0, 1))
-  
+
   # shape of batch is correct
   expect_equal(length(out[[1]]), 2)
   expect_equal(out[[1]]$cat$shape[0], 1)
   expect_equal(out[[1]]$num$shape[0], 1)
-  
+
   # shape of target
   expect_equal(out[[2]]$shape$numel(), 1)
-  
+
   # get a whole batch
-  out <- dataset[10:(10 + batch_size)]
-  
+  out <- dataset[10:(10 + batchSize)]
+
   expect_equal(length(out), 2)
   expect_true(all(out[[2]]$numpy() %in% c(0, 1)))
-  
+
   expect_equal(length(out[[1]]), 2)
   expect_equal(out[[1]]$cat$shape[0], 16)
   expect_equal(out[[1]]$num$shape[0], 16)
-  
+
   expect_equal(out[[2]]$shape[0], 16)
 })
 
-test_that("Column order is preserved in presence of missing features", {
-  # important for both external validation and transfer learning
-
-
+test_that("Column order is preserved when features are missing", {
+  # important for transfer learning and external validation
+
+  reducedCovData <- Andromeda::copyAndromeda(trainData$Train$covariateData)
+
+  # remove one numerical and one categorical
+  numFeature <- 1002 # continous age
+  catFeature <- 4285898210 # a random common cat feature
+  reducedCovData$covariates <- trainData$Train$covariateData$covariates %>%
+    dplyr::filter(!(covariateId %in% c(numFeature, catFeature)))
+  reducedCovData$covariates <- trainData$Train$covariateData$covariates %>%
+    dplyr::filter(!(covariateId %in% c(numFeature, catFeature)))
+
+  mappedReducedData <- PatientLevelPrediction::MapIds(
+    reducedCovData,
+    mapping = mappedData$mapping
+  )
+
+  catColumn <- mappedData$mapping %>%
+    dplyr::filter(covariateId == catFeature) %>%
+    dplyr::pull("columnId")
+  numColumn <- mappedData$mapping %>%
+    dplyr::filter(covariateId == numFeature) %>%
+    dplyr::pull("columnId")
+
+  reducedDataset <- datasetClass$Data(
+    data =
+      reticulate::r_to_py(normalizePath(attributes(mappedReducedData)$dbname)),
+    labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount),
+    numerical_features = dataset$numerical_features$to_list()
+  )
+
+  # should have same number of columns
+  expect_equal(dataset$num$shape[[1]], reducedDataset$num$shape[[1]])
+
+  # all zeros in column with removed feature, -1 because r to py
+  expect_true(reducedDataset$num[, numColumn - 1]$sum()$item() == 0)
+
+  # all other columns are same
+  indexReduced <- !torch$isin(torch$arange(reducedDataset$num$shape[[1]]),
+                              numColumn - 1)
+  index <- !torch$isin(torch$arange(dataset$num$shape[[1]]),
+                       numColumn - 1)
+
+  expect_equal(reducedDataset$num[, indexReduced]$mean()$item(),
+               dataset$num[, index]$mean()$item())
+
+  # cat data should have same counts of all columnIds
+  # expect the one that was removed
+  # not same counts for removed feature
+  expect_false(isTRUE(all.equal((reducedDataset$cat == catColumn)$sum()$item(),
+                                (dataset$cat == catColumn)$sum()$item())))
+
+  # get counts
+  counts <- as.array(torch$unique(dataset$cat,
+                                  return_counts = TRUE)[[2]]$numpy())
+  counts <- counts[-(catColumn + 1)] # +1 because py_to_r
+  counts <- counts[-1]
+
+  reducedCounts <- as.array(torch$unique(reducedDataset$cat,
+                                         return_counts = TRUE)[[2]]$numpy())
+  reducedCounts <- reducedCounts[-(catColumn + 1)] # +1 because py_to_r
+  reducedCounts <- reducedCounts[-1]
+
+  expect_false(isTRUE(all.equal(counts, reducedCounts)))
+  expect_equal(dataset$get_cat_features()$max(),
+               reducedDataset$get_cat_features()$max())
+
 })